PostgreSQL Source Code  git master
nbtree.h File Reference
#include "access/amapi.h"
#include "access/itup.h"
#include "access/sdir.h"
#include "access/xlogreader.h"
#include "catalog/pg_index.h"
#include "lib/stringinfo.h"
#include "storage/bufmgr.h"
#include "storage/shm_toc.h"
Include dependency graph for nbtree.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures

struct  BTPageOpaqueData
 
struct  BTMetaPageData
 
struct  BTStackData
 
struct  BTScanInsertData
 
struct  BTInsertStateData
 
struct  BTScanPosItem
 
struct  BTScanPosData
 
struct  BTArrayKeyInfo
 
struct  BTScanOpaqueData
 

Macros

#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */
 
#define BTP_ROOT   (1 << 1) /* root page (has no parent) */
 
#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */
 
#define BTP_META   (1 << 3) /* meta-page */
 
#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */
 
#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */
 
#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples */
 
#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */
 
#define MAX_BT_CYCLE_ID   0xFF7F
 
#define BTPageGetMeta(p)   ((BTMetaPageData *) PageGetContents(p))
 
#define BTREE_METAPAGE   0 /* first page is meta */
 
#define BTREE_MAGIC   0x053162 /* magic number in metapage */
 
#define BTREE_VERSION   4 /* current version number */
 
#define BTREE_MIN_VERSION   2 /* minimal supported version number */
 
#define BTREE_NOVAC_VERSION   3 /* minimal version with all meta fields */
 
#define BTMaxItemSize(page)
 
#define BTMaxItemSizeNoHeapTid(page)
 
#define BTREE_MIN_FILLFACTOR   10
 
#define BTREE_DEFAULT_FILLFACTOR   90
 
#define BTREE_NONLEAF_FILLFACTOR   70
 
#define BTREE_SINGLEVAL_FILLFACTOR   96
 
#define P_NONE   0
 
#define P_LEFTMOST(opaque)   ((opaque)->btpo_prev == P_NONE)
 
#define P_RIGHTMOST(opaque)   ((opaque)->btpo_next == P_NONE)
 
#define P_ISLEAF(opaque)   (((opaque)->btpo_flags & BTP_LEAF) != 0)
 
#define P_ISROOT(opaque)   (((opaque)->btpo_flags & BTP_ROOT) != 0)
 
#define P_ISDELETED(opaque)   (((opaque)->btpo_flags & BTP_DELETED) != 0)
 
#define P_ISMETA(opaque)   (((opaque)->btpo_flags & BTP_META) != 0)
 
#define P_ISHALFDEAD(opaque)   (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)
 
#define P_IGNORE(opaque)   (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
 
#define P_HAS_GARBAGE(opaque)   (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
 
#define P_INCOMPLETE_SPLIT(opaque)   (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
 
#define P_HIKEY   ((OffsetNumber) 1)
 
#define P_FIRSTKEY   ((OffsetNumber) 2)
 
#define P_FIRSTDATAKEY(opaque)   (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
 
#define INDEX_ALT_TID_MASK   INDEX_AM_RESERVED_BIT
 
#define BT_RESERVED_OFFSET_MASK   0xF000
 
#define BT_N_KEYS_OFFSET_MASK   0x0FFF
 
#define BT_HEAP_TID_ATTR   0x1000
 
#define BTreeInnerTupleGetDownLink(itup)   ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))
 
#define BTreeInnerTupleSetDownLink(itup, blkno)   ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno))
 
#define BTreeTupleGetTopParent(itup)   ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))
 
#define BTreeTupleSetTopParent(itup, blkno)
 
#define BTreeTupleGetNAtts(itup, rel)
 
#define BTreeTupleSetNAtts(itup, n)
 
#define BTreeTupleGetHeapTID(itup)
 
#define BTreeTupleSetAltHeapTID(itup)
 
#define BTCommuteStrategyNumber(strat)   (BTMaxStrategyNumber + 1 - (strat))
 
#define BTORDER_PROC   1
 
#define BTSORTSUPPORT_PROC   2
 
#define BTINRANGE_PROC   3
 
#define BTNProcs   3
 
#define BT_READ   BUFFER_LOCK_SHARE
 
#define BT_WRITE   BUFFER_LOCK_EXCLUSIVE
 
#define BTScanPosIsPinned(scanpos)
 
#define BTScanPosUnpin(scanpos)
 
#define BTScanPosUnpinIfPinned(scanpos)
 
#define BTScanPosIsValid(scanpos)
 
#define BTScanPosInvalidate(scanpos)
 
#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */
 
#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */
 
#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */
 
#define SK_BT_DESC   (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
 
#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
 
#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4
 
#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5
 

Typedefs

typedef uint16 BTCycleId
 
typedef struct BTPageOpaqueData BTPageOpaqueData
 
typedef BTPageOpaqueData * BTPageOpaque
 
typedef struct BTMetaPageData BTMetaPageData
 
typedef struct BTStackData BTStackData
 
typedef BTStackData * BTStack
 
typedef struct BTScanInsertData BTScanInsertData
 
typedef BTScanInsertData * BTScanInsert
 
typedef struct BTInsertStateData BTInsertStateData
 
typedef BTInsertStateData * BTInsertState
 
typedef struct BTScanPosItem BTScanPosItem
 
typedef struct BTScanPosData BTScanPosData
 
typedef BTScanPosData * BTScanPos
 
typedef struct BTArrayKeyInfo BTArrayKeyInfo
 
typedef struct BTScanOpaqueData BTScanOpaqueData
 
typedef BTScanOpaqueData * BTScanOpaque
 

Functions

void btbuildempty (Relation index)
 
bool btinsert (Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, struct IndexInfo *indexInfo)
 
IndexScanDesc btbeginscan (Relation rel, int nkeys, int norderbys)
 
Size btestimateparallelscan (void)
 
void btinitparallelscan (void *target)
 
bool btgettuple (IndexScanDesc scan, ScanDirection dir)
 
int64 btgetbitmap (IndexScanDesc scan, TIDBitmap *tbm)
 
void btrescan (IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
 
void btparallelrescan (IndexScanDesc scan)
 
void btendscan (IndexScanDesc scan)
 
void btmarkpos (IndexScanDesc scan)
 
void btrestrpos (IndexScanDesc scan)
 
IndexBulkDeleteResult * btbulkdelete (IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
 
IndexBulkDeleteResult * btvacuumcleanup (IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
bool btcanreturn (Relation index, int attno)
 
bool _bt_parallel_seize (IndexScanDesc scan, BlockNumber *pageno)
 
void _bt_parallel_release (IndexScanDesc scan, BlockNumber scan_page)
 
void _bt_parallel_done (IndexScanDesc scan)
 
void _bt_parallel_advance_array_keys (IndexScanDesc scan)
 
bool _bt_doinsert (Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, Relation heapRel)
 
Buffer _bt_getstackbuf (Relation rel, BTStack stack)
 
void _bt_finish_split (Relation rel, Buffer bbuf, BTStack stack)
 
OffsetNumber _bt_findsplitloc (Relation rel, Page page, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
 
void _bt_initmetapage (Page page, BlockNumber rootbknum, uint32 level)
 
void _bt_update_meta_cleanup_info (Relation rel, TransactionId oldestBtpoXact, float8 numHeapTuples)
 
void _bt_upgrademetapage (Page page)
 
Buffer _bt_getroot (Relation rel, int access)
 
Buffer _bt_gettrueroot (Relation rel)
 
int _bt_getrootheight (Relation rel)
 
bool _bt_heapkeyspace (Relation rel)
 
void _bt_checkpage (Relation rel, Buffer buf)
 
Buffer _bt_getbuf (Relation rel, BlockNumber blkno, int access)
 
Buffer _bt_relandgetbuf (Relation rel, Buffer obuf, BlockNumber blkno, int access)
 
void _bt_relbuf (Relation rel, Buffer buf)
 
void _bt_pageinit (Page page, Size size)
 
bool _bt_page_recyclable (Page page)
 
void _bt_delitems_delete (Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, Relation heapRel)
 
void _bt_delitems_vacuum (Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, BlockNumber lastBlockVacuumed)
 
int _bt_pagedel (Relation rel, Buffer buf)
 
BTStack _bt_search (Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
 
Buffer _bt_moveright (Relation rel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access, Snapshot snapshot)
 
OffsetNumber _bt_binsrch_insert (Relation rel, BTInsertState insertstate)
 
int32 _bt_compare (Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost, Snapshot snapshot)
 
BTScanInsert _bt_mkscankey (Relation rel, IndexTuple itup)
 
void _bt_freestack (BTStack stack)
 
void _bt_preprocess_array_keys (IndexScanDesc scan)
 
void _bt_start_array_keys (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_advance_array_keys (IndexScanDesc scan, ScanDirection dir)
 
void _bt_mark_array_keys (IndexScanDesc scan)
 
void _bt_restore_array_keys (IndexScanDesc scan)
 
void _bt_preprocess_keys (IndexScanDesc scan)
 
bool _bt_checkkeys (IndexScanDesc scan, IndexTuple tuple, int tupnatts, ScanDirection dir, bool *continuescan)
 
void _bt_killitems (IndexScanDesc scan)
 
BTCycleId _bt_vacuum_cycleid (Relation rel)
 
BTCycleId _bt_start_vacuum (Relation rel)
 
void _bt_end_vacuum (Relation rel)
 
void _bt_end_vacuum_callback (int code, Datum arg)
 
Size BTreeShmemSize (void)
 
void BTreeShmemInit (void)
 
bytea * btoptions (Datum reloptions, bool validate)
 
bool btproperty (Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull)
 
char * btbuildphasename (int64 phasenum)
 
IndexTuple _bt_truncate (Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
 
int _bt_keep_natts_fast (Relation rel, IndexTuple lastleft, IndexTuple firstright)
 
bool _bt_check_natts (Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
 
void _bt_check_third_page (Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
 
bool btvalidate (Oid opclassoid)
 
IndexBuildResult * btbuild (Relation heap, Relation index, struct IndexInfo *indexInfo)
 
void _bt_parallel_build_main (dsm_segment *seg, shm_toc *toc)
 

Macro Definition Documentation

◆ BT_HEAP_TID_ATTR

#define BT_HEAP_TID_ATTR   0x1000

Definition at line 298 of file nbtree.h.

◆ BT_N_KEYS_OFFSET_MASK

#define BT_N_KEYS_OFFSET_MASK   0x0FFF

Definition at line 297 of file nbtree.h.

Referenced by _bt_check_natts().

◆ BT_READ

◆ BT_RESERVED_OFFSET_MASK

#define BT_RESERVED_OFFSET_MASK   0xF000

Definition at line 296 of file nbtree.h.

◆ BT_WRITE

◆ BTCommuteStrategyNumber

#define BTCommuteStrategyNumber (   strat)    (BTMaxStrategyNumber + 1 - (strat))

Definition at line 374 of file nbtree.h.

Referenced by _bt_compare_scankey_args(), and _bt_fix_scankey_strategy().

◆ BTINRANGE_PROC

#define BTINRANGE_PROC   3

Definition at line 394 of file nbtree.h.

Referenced by assignProcTypes(), btvalidate(), and transformFrameOffset().

◆ BTMaxItemSize

#define BTMaxItemSize (   page)
Value:
MAXALIGN_DOWN((PageGetPageSize(page) - \
MAXALIGN(SizeOfPageHeaderData + \
3*sizeof(ItemIdData) + \
3*sizeof(ItemPointerData)) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
#define SizeOfPageHeaderData
Definition: bufpage.h:216
#define PageGetPageSize(page)
Definition: bufpage.h:268
#define MAXALIGN(LEN)
Definition: c.h:685
#define MAXALIGN_DOWN(LEN)
Definition: c.h:697

Definition at line 147 of file nbtree.h.

Referenced by _bt_buildadd(), _bt_check_third_page(), _bt_findinsertloc(), and bt_target_page_check().

◆ BTMaxItemSizeNoHeapTid

#define BTMaxItemSizeNoHeapTid (   page)
Value:
MAXALIGN_DOWN((PageGetPageSize(page) - \
MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
#define SizeOfPageHeaderData
Definition: bufpage.h:216
#define PageGetPageSize(page)
Definition: bufpage.h:268
#define MAXALIGN(LEN)
Definition: c.h:685
#define MAXALIGN_DOWN(LEN)
Definition: c.h:697

Definition at line 153 of file nbtree.h.

Referenced by _bt_check_third_page(), and bt_target_page_check().

◆ BTNProcs

#define BTNProcs   3

Definition at line 395 of file nbtree.h.

Referenced by bthandler().

◆ BTORDER_PROC

◆ BTP_DELETED

#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */

Definition at line 73 of file nbtree.h.

Referenced by _bt_unlink_halfdead_page(), and btree_xlog_unlink_page().

◆ BTP_HALF_DEAD

#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */

◆ BTP_HAS_GARBAGE

#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples */

◆ BTP_INCOMPLETE_SPLIT

#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */

◆ BTP_LEAF

#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */

◆ BTP_META

#define BTP_META   (1 << 3) /* meta-page */

Definition at line 74 of file nbtree.h.

Referenced by _bt_initmetapage(), _bt_restore_meta(), and _bt_upgrademetapage().

◆ BTP_ROOT

#define BTP_ROOT   (1 << 1) /* root page (has no parent) */

Definition at line 72 of file nbtree.h.

Referenced by _bt_getroot(), _bt_newroot(), _bt_split(), _bt_uppershutdown(), and btree_xlog_newroot().

◆ BTP_SPLIT_END

#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */

Definition at line 76 of file nbtree.h.

Referenced by _bt_split(), btree_mask(), and btvacuumpage().

◆ BTPageGetMeta

◆ BTREE_DEFAULT_FILLFACTOR

#define BTREE_DEFAULT_FILLFACTOR   90

Definition at line 169 of file nbtree.h.

Referenced by _bt_findsplitloc(), and _bt_pagestate().

◆ BTREE_MAGIC

#define BTREE_MAGIC   0x053162 /* magic number in metapage */

◆ BTREE_METAPAGE

◆ BTREE_MIN_FILLFACTOR

#define BTREE_MIN_FILLFACTOR   10

Definition at line 168 of file nbtree.h.

◆ BTREE_MIN_VERSION

#define BTREE_MIN_VERSION   2 /* minimal supported version number */

◆ BTREE_NONLEAF_FILLFACTOR

#define BTREE_NONLEAF_FILLFACTOR   70

Definition at line 170 of file nbtree.h.

Referenced by _bt_findsplitloc(), and _bt_pagestate().

◆ BTREE_NOVAC_VERSION

◆ BTREE_SINGLEVAL_FILLFACTOR

#define BTREE_SINGLEVAL_FILLFACTOR   96

Definition at line 171 of file nbtree.h.

Referenced by _bt_findsplitloc().

◆ BTREE_VERSION

#define BTREE_VERSION   4 /* current version number */

◆ BTreeInnerTupleGetDownLink

◆ BTreeInnerTupleSetDownLink

#define BTreeInnerTupleSetDownLink (   itup,
  blkno 
)    ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno))

◆ BTreeTupleGetHeapTID

#define BTreeTupleGetHeapTID (   itup)
Value:
( \
(itup)->t_info & INDEX_ALT_TID_MASK && \
(ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_HEAP_TID_ATTR) != 0 ? \
( \
(ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \
sizeof(ItemPointerData)) \
) \
: (itup)->t_info & INDEX_ALT_TID_MASK ? NULL : (ItemPointer) &((itup)->t_tid) \
)
#define ItemPointerGetOffsetNumberNoCheck(pointer)
Definition: itemptr.h:108
#define BT_HEAP_TID_ATTR
Definition: nbtree.h:298
#define INDEX_ALT_TID_MASK
Definition: nbtree.h:293
#define IndexTupleSize(itup)
Definition: itup.h:71

Definition at line 346 of file nbtree.h.

Referenced by _bt_check_natts(), _bt_compare(), _bt_mkscankey(), bt_target_page_check(), and BTreeTupleGetHeapTIDCareful().

◆ BTreeTupleGetNAtts

#define BTreeTupleGetNAtts (   itup,
  rel 
)
Value:
( \
(itup)->t_info & INDEX_ALT_TID_MASK ? \
( \
ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \
) \
: \
IndexRelationGetNumberOfAttributes(rel) \
)
#define ItemPointerGetOffsetNumberNoCheck(pointer)
Definition: itemptr.h:108
#define INDEX_ALT_TID_MASK
Definition: nbtree.h:293
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:426
#define BT_N_KEYS_OFFSET_MASK
Definition: nbtree.h:297

Definition at line 327 of file nbtree.h.

Referenced by _bt_buildadd(), _bt_check_natts(), _bt_checkkeys(), _bt_compare(), _bt_insertonpg(), _bt_mkscankey(), _bt_newroot(), _bt_readpage(), _bt_split(), _bt_truncate(), _bt_uppershutdown(), and bt_target_page_check().

◆ BTreeTupleGetTopParent

#define BTreeTupleGetTopParent (   itup)    ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))

Definition at line 312 of file nbtree.h.

Referenced by _bt_unlink_halfdead_page(), and bt_downlink_missing_check().

◆ BTreeTupleSetAltHeapTID

#define BTreeTupleSetAltHeapTID (   itup)
Value:
do { \
Assert((itup)->t_info & INDEX_ALT_TID_MASK); \
ItemPointerSetOffsetNumber(&(itup)->t_tid, \
ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_HEAP_TID_ATTR); \
} while(0)
#define ItemPointerGetOffsetNumberNoCheck(pointer)
Definition: itemptr.h:108
#define BT_HEAP_TID_ATTR
Definition: nbtree.h:298
#define INDEX_ALT_TID_MASK
Definition: nbtree.h:293

Definition at line 360 of file nbtree.h.

Referenced by _bt_truncate().

◆ BTreeTupleSetNAtts

#define BTreeTupleSetNAtts (   itup,
  n 
)
Value:
do { \
(itup)->t_info |= INDEX_ALT_TID_MASK; \
ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \
} while(0)
#define INDEX_ALT_TID_MASK
Definition: nbtree.h:293
#define BT_N_KEYS_OFFSET_MASK
Definition: nbtree.h:297

Definition at line 336 of file nbtree.h.

Referenced by _bt_buildadd(), _bt_newroot(), _bt_pgaddtup(), _bt_sortaddtup(), and _bt_truncate().

◆ BTreeTupleSetTopParent

#define BTreeTupleSetTopParent (   itup,
  blkno 
)
Value:
do { \
ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno)); \
BTreeTupleSetNAtts((itup), 0); \
} while(0)

Definition at line 314 of file nbtree.h.

Referenced by _bt_mark_page_halfdead(), _bt_unlink_halfdead_page(), btree_xlog_mark_page_halfdead(), and btree_xlog_unlink_page().

◆ BTScanPosInvalidate

#define BTScanPosInvalidate (   scanpos)
Value:
do { \
(scanpos).currPage = InvalidBlockNumber; \
(scanpos).nextPage = InvalidBlockNumber; \
(scanpos).buf = InvalidBuffer; \
(scanpos).lsn = InvalidXLogRecPtr; \
(scanpos).nextTupleOffset = 0; \
} while (0);
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
#define InvalidBuffer
Definition: buf.h:25
static char * buf
Definition: pg_test_fsync.c:68
#define InvalidBlockNumber
Definition: block.h:33

Definition at line 610 of file nbtree.h.

Referenced by _bt_endpoint(), _bt_first(), _bt_readnextpage(), _bt_steppage(), btbeginscan(), btmarkpos(), btrescan(), and btrestrpos().

◆ BTScanPosIsPinned

#define BTScanPosIsPinned (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BufferIsValid((scanpos).buf) \
)
static char * buf
Definition: pg_test_fsync.c:68
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113

Definition at line 587 of file nbtree.h.

Referenced by _bt_killitems(), _bt_readnextpage(), _bt_readpage(), _bt_steppage(), and btrestrpos().

◆ BTScanPosIsValid

#define BTScanPosIsValid (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BlockNumberIsValid((scanpos).currPage) \
)
static char * buf
Definition: pg_test_fsync.c:68
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113

Definition at line 604 of file nbtree.h.

Referenced by _bt_first(), _bt_killitems(), _bt_steppage(), btendscan(), btgettuple(), btmarkpos(), btrescan(), and btrestrpos().

◆ BTScanPosUnpin

#define BTScanPosUnpin (   scanpos)
Value:
do { \
ReleaseBuffer((scanpos).buf); \
(scanpos).buf = InvalidBuffer; \
} while (0)
#define InvalidBuffer
Definition: buf.h:25
static char * buf
Definition: pg_test_fsync.c:68

Definition at line 593 of file nbtree.h.

◆ BTScanPosUnpinIfPinned

#define BTScanPosUnpinIfPinned (   scanpos)
Value:
do { \
if (BTScanPosIsPinned(scanpos)) \
BTScanPosUnpin(scanpos); \
} while (0)
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:587

Definition at line 598 of file nbtree.h.

Referenced by _bt_readnextpage(), _bt_steppage(), btendscan(), btmarkpos(), btrescan(), and btrestrpos().

◆ BTSORTSUPPORT_PROC

#define BTSORTSUPPORT_PROC   2

Definition at line 393 of file nbtree.h.

Referenced by assignProcTypes(), btvalidate(), FinishSortSupportFunction(), and MJExamineQuals().

◆ INDEX_ALT_TID_MASK

#define INDEX_ALT_TID_MASK   INDEX_AM_RESERVED_BIT

Definition at line 293 of file nbtree.h.

Referenced by _bt_check_natts().

◆ MAX_BT_CYCLE_ID

#define MAX_BT_CYCLE_ID   0xFF7F

Definition at line 87 of file nbtree.h.

Referenced by _bt_start_vacuum().

◆ P_FIRSTDATAKEY

◆ P_FIRSTKEY

#define P_FIRSTKEY   ((OffsetNumber) 2)

◆ P_HAS_GARBAGE

#define P_HAS_GARBAGE (   opaque)    (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)

Definition at line 195 of file nbtree.h.

Referenced by _bt_findinsertloc(), and palloc_btree_page().

◆ P_HIKEY

◆ P_IGNORE

◆ P_INCOMPLETE_SPLIT

◆ P_ISDELETED

◆ P_ISHALFDEAD

#define P_ISHALFDEAD (   opaque)    (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)

◆ P_ISLEAF

◆ P_ISMETA

#define P_ISMETA (   opaque)    (((opaque)->btpo_flags & BTP_META) != 0)

◆ P_ISROOT

#define P_ISROOT (   opaque)    (((opaque)->btpo_flags & BTP_ROOT) != 0)

◆ P_LEFTMOST

#define P_LEFTMOST (   opaque)    ((opaque)->btpo_prev == P_NONE)

◆ P_NONE

◆ P_RIGHTMOST

◆ PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN

#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2

Definition at line 689 of file nbtree.h.

Referenced by _bt_spools_heapscan(), and btbuildphasename().

◆ PROGRESS_BTREE_PHASE_LEAF_LOAD

#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5

Definition at line 692 of file nbtree.h.

Referenced by _bt_leafbuild(), and btbuildphasename().

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_1

#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3

Definition at line 690 of file nbtree.h.

Referenced by _bt_leafbuild(), and btbuildphasename().

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_2

#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4

Definition at line 691 of file nbtree.h.

Referenced by _bt_leafbuild(), and btbuildphasename().

◆ SK_BT_DESC

◆ SK_BT_INDOPTION_SHIFT

#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */

Definition at line 680 of file nbtree.h.

Referenced by _bt_fix_scankey_strategy(), and _bt_mkscankey().

◆ SK_BT_NULLS_FIRST

◆ SK_BT_REQBKWD

#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */

Definition at line 679 of file nbtree.h.

Referenced by _bt_check_rowcompare(), _bt_checkkeys(), and _bt_mark_scankey_required().

◆ SK_BT_REQFWD

#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */

Definition at line 678 of file nbtree.h.

Referenced by _bt_check_rowcompare(), _bt_checkkeys(), and _bt_mark_scankey_required().

Typedef Documentation

◆ BTArrayKeyInfo

◆ BTCycleId

Definition at line 27 of file nbtree.h.

◆ BTInsertState

Definition at line 513 of file nbtree.h.

◆ BTInsertStateData

◆ BTMetaPageData

◆ BTPageOpaque

Definition at line 68 of file nbtree.h.

◆ BTPageOpaqueData

◆ BTScanInsert

Definition at line 481 of file nbtree.h.

◆ BTScanInsertData

◆ BTScanOpaque

Definition at line 671 of file nbtree.h.

◆ BTScanOpaqueData

◆ BTScanPos

Definition at line 585 of file nbtree.h.

◆ BTScanPosData

◆ BTScanPosItem

◆ BTStack

Definition at line 423 of file nbtree.h.

◆ BTStackData

Function Documentation

◆ _bt_advance_array_keys()

bool _bt_advance_array_keys ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 546 of file nbtutils.c.

References _bt_parallel_advance_array_keys(), BTScanOpaqueData::arrayKeyData, BTScanOpaqueData::arrayKeys, BTArrayKeyInfo::cur_elem, BTArrayKeyInfo::elem_values, i, BTArrayKeyInfo::num_elems, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, BTArrayKeyInfo::scan_key, ScanDirectionIsBackward, and ScanKeyData::sk_argument.

Referenced by btgetbitmap(), and btgettuple().

547 {
548  BTScanOpaque so = (BTScanOpaque) scan->opaque;
549  bool found = false;
550  int i;
551 
552  /*
553  * We must advance the last array key most quickly, since it will
554  * correspond to the lowest-order index column among the available
555  * qualifications. This is necessary to ensure correct ordering of output
556  * when there are multiple array keys.
557  */
558  for (i = so->numArrayKeys - 1; i >= 0; i--)
559  {
560  BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
561  ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key];
562  int cur_elem = curArrayKey->cur_elem;
563  int num_elems = curArrayKey->num_elems;
564 
565  if (ScanDirectionIsBackward(dir))
566  {
567  if (--cur_elem < 0)
568  {
569  cur_elem = num_elems - 1;
570  found = false; /* need to advance next array key */
571  }
572  else
573  found = true;
574  }
575  else
576  {
577  if (++cur_elem >= num_elems)
578  {
579  cur_elem = 0;
580  found = false; /* need to advance next array key */
581  }
582  else
583  found = true;
584  }
585 
586  curArrayKey->cur_elem = cur_elem;
587  skey->sk_argument = curArrayKey->elem_values[cur_elem];
588  if (found)
589  break;
590  }
591 
592  /* advance parallel scan */
593  if (scan->parallel_scan != NULL)
 594  _bt_parallel_advance_array_keys(scan);
 595 
596  return found;
597 }
Datum * elem_values
Definition: nbtree.h:626
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:671
int cur_elem
Definition: nbtree.h:623
int numArrayKeys
Definition: nbtree.h:638
ScanKey arrayKeyData
Definition: nbtree.h:637
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:151
int num_elems
Definition: nbtree.h:625
void _bt_parallel_advance_array_keys(IndexScanDesc scan)
Definition: nbtree.c:763
BTArrayKeyInfo * arrayKeys
Definition: nbtree.h:642
int i
Datum sk_argument
Definition: skey.h:72
int scan_key
Definition: nbtree.h:622

◆ _bt_binsrch_insert()

OffsetNumber _bt_binsrch_insert ( Relation  rel,
BTInsertState  insertstate 
)

Definition at line 445 of file nbtsearch.c.

References _bt_compare(), Assert, BTInsertStateData::bounds_valid, BTInsertStateData::buf, BufferGetPage, InvalidOffsetNumber, BTInsertStateData::itup_key, sort-test::key, BTInsertStateData::low, BTScanInsertData::nextkey, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber, PageGetSpecialPointer, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_check_unique(), _bt_findinsertloc(), and bt_rootdescend().

446 {
447  BTScanInsert key = insertstate->itup_key;
448  Page page;
449  BTPageOpaque opaque;
450  OffsetNumber low,
451  high,
452  stricthigh;
453  int32 result,
454  cmpval;
455 
456  page = BufferGetPage(insertstate->buf);
457  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
458 
459  Assert(P_ISLEAF(opaque));
460  Assert(!key->nextkey);
461 
462  if (!insertstate->bounds_valid)
463  {
464  /* Start new binary search */
465  low = P_FIRSTDATAKEY(opaque);
466  high = PageGetMaxOffsetNumber(page);
467  }
468  else
469  {
470  /* Restore result of previous binary search against same page */
471  low = insertstate->low;
472  high = insertstate->stricthigh;
473  }
474 
475  /* If there are no keys on the page, return the first available slot */
476  if (unlikely(high < low))
477  {
478  /* Caller can't reuse bounds */
479  insertstate->low = InvalidOffsetNumber;
480  insertstate->stricthigh = InvalidOffsetNumber;
481  insertstate->bounds_valid = false;
482  return low;
483  }
484 
485  /*
486  * Binary search to find the first key on the page >= scan key. (nextkey
487  * is always false when inserting).
488  *
489  * The loop invariant is: all slots before 'low' are < scan key, all slots
490  * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
491  * maintained to save additional search effort for caller.
492  *
493  * We can fall out when high == low.
494  */
495  if (!insertstate->bounds_valid)
496  high++; /* establish the loop invariant for high */
497  stricthigh = high; /* high initially strictly higher */
498 
499  cmpval = 1; /* !nextkey comparison value */
500 
501  while (high > low)
502  {
503  OffsetNumber mid = low + ((high - low) / 2);
504 
505  /* We have low <= mid < high, so mid points at a real slot */
506 
507  result = _bt_compare(rel, key, page, mid);
508 
509  if (result >= cmpval)
510  low = mid + 1;
511  else
512  {
513  high = mid;
514  if (result != 0)
515  stricthigh = high;
516  }
517  }
518 
519  /*
520  * On a leaf page, a binary search always returns the first key >= scan
521  * key (at least in !nextkey case), which could be the last slot + 1. This
522  * is also the lower bound of cached search.
523  *
524  * stricthigh may also be the last slot + 1, which prevents caller from
525  * using bounds directly, but is still useful to us if we're called a
526  * second time with cached bounds (cached low will be < stricthigh when
527  * that happens).
528  */
529  insertstate->low = low;
530  insertstate->stricthigh = stricthigh;
531  insertstate->bounds_valid = true;
532 
533  return low;
534 }
bool bounds_valid
Definition: nbtree.h:508
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
OffsetNumber stricthigh
Definition: nbtree.h:510
OffsetNumber low
Definition: nbtree.h:509
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
signed int int32
Definition: c.h:346
uint16 OffsetNumber
Definition: off.h:24
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:559
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define InvalidOffsetNumber
Definition: off.h:26
BTScanInsert itup_key
Definition: nbtree.h:498
#define Assert(condition)
Definition: c.h:732
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define unlikely(x)
Definition: c.h:208
Pointer Page
Definition: bufpage.h:78
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_check_natts()

bool _bt_check_natts ( Relation  rel,
bool  heapkeyspace,
Page  page,
OffsetNumber  offnum 
)

Definition at line 2382 of file nbtutils.c.

References Assert, BT_N_KEYS_OFFSET_MASK, BTreeTupleGetHeapTID, BTreeTupleGetNAtts, FirstOffsetNumber, INDEX_ALT_TID_MASK, INDEX_MAX_KEYS, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, ItemPointerGetOffsetNumber, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_ISLEAF, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, StaticAssertStmt, IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_compare(), and bt_target_page_check().

 2383 {
 2384  int16 natts = IndexRelationGetNumberOfAttributes(rel);
 2385  int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
 2386  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2387  IndexTuple itup;
2388  int tupnatts;
2389 
2390  /*
2391  * We cannot reliably test a deleted or half-deleted page, since they have
2392  * dummy high keys
2393  */
2394  if (P_IGNORE(opaque))
2395  return true;
2396 
2397  Assert(offnum >= FirstOffsetNumber &&
2398  offnum <= PageGetMaxOffsetNumber(page));
2399 
2400  /*
2401  * Mask allocated for number of keys in index tuple must be able to fit
2402  * maximum possible number of index attributes
2403  */
 2404  StaticAssertStmt(BT_N_KEYS_OFFSET_MASK >= INDEX_MAX_KEYS,
 2405  "BT_N_KEYS_OFFSET_MASK can't fit INDEX_MAX_KEYS");
2406 
2407  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2408  tupnatts = BTreeTupleGetNAtts(itup, rel);
2409 
2410  if (P_ISLEAF(opaque))
2411  {
2412  if (offnum >= P_FIRSTDATAKEY(opaque))
2413  {
2414  /*
2415  * Non-pivot tuples currently never use alternative heap TID
2416  * representation -- even those within heapkeyspace indexes
2417  */
2418  if ((itup->t_info & INDEX_ALT_TID_MASK) != 0)
2419  return false;
2420 
2421  /*
2422  * Leaf tuples that are not the page high key (non-pivot tuples)
2423  * should never be truncated. (Note that tupnatts must have been
2424  * inferred, rather than coming from an explicit on-disk
2425  * representation.)
2426  */
2427  return tupnatts == natts;
2428  }
2429  else
2430  {
2431  /*
2432  * Rightmost page doesn't contain a page high key, so tuple was
2433  * checked above as ordinary leaf tuple
2434  */
2435  Assert(!P_RIGHTMOST(opaque));
2436 
2437  /*
2438  * !heapkeyspace high key tuple contains only key attributes. Note
2439  * that tupnatts will only have been explicitly represented in
2440  * !heapkeyspace indexes that happen to have non-key attributes.
2441  */
2442  if (!heapkeyspace)
2443  return tupnatts == nkeyatts;
2444 
2445  /* Use generic heapkeyspace pivot tuple handling */
2446  }
2447  }
2448  else /* !P_ISLEAF(opaque) */
2449  {
2450  if (offnum == P_FIRSTDATAKEY(opaque))
2451  {
2452  /*
2453  * The first tuple on any internal page (possibly the first after
2454  * its high key) is its negative infinity tuple. Negative
2455  * infinity tuples are always truncated to zero attributes. They
2456  * are a particular kind of pivot tuple.
2457  */
2458  if (heapkeyspace)
2459  return tupnatts == 0;
2460 
2461  /*
2462  * The number of attributes won't be explicitly represented if the
2463  * negative infinity tuple was generated during a page split that
2464  * occurred with a version of Postgres before v11. There must be
2465  * a problem when there is an explicit representation that is
2466  * non-zero, or when there is no explicit representation and the
2467  * tuple is evidently not a pre-pg_upgrade tuple.
2468  *
2469  * Prior to v11, downlinks always had P_HIKEY as their offset. Use
2470  * that to decide if the tuple is a pre-v11 tuple.
2471  */
2472  return tupnatts == 0 ||
2473  ((itup->t_info & INDEX_ALT_TID_MASK) == 0 &&
 2474  ItemPointerGetOffsetNumber(&(itup)->t_tid) == P_HIKEY);
 2475  }
2476  else
2477  {
2478  /*
2479  * !heapkeyspace downlink tuple with separator key contains only
2480  * key attributes. Note that tupnatts will only have been
2481  * explicitly represented in !heapkeyspace indexes that happen to
2482  * have non-key attributes.
2483  */
2484  if (!heapkeyspace)
2485  return tupnatts == nkeyatts;
2486 
2487  /* Use generic heapkeyspace pivot tuple handling */
2488  }
2489 
2490  }
2491 
2492  /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
2493  Assert(heapkeyspace);
2494 
2495  /*
2496  * Explicit representation of the number of attributes is mandatory with
2497  * heapkeyspace index pivot tuples, regardless of whether or not there are
2498  * non-key attributes.
2499  */
2500  if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
2501  return false;
2502 
2503  /*
2504  * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
2505  * when any other key attribute is truncated
2506  */
2507  if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
2508  return false;
2509 
2510  /*
2511  * Pivot tuple must have at least one untruncated key attribute (minus
2512  * infinity pivot tuples are the only exception). Pivot tuples can never
2513  * represent that there is a value present for a key attribute that
2514  * exceeds pg_index.indnkeyatts for the index.
2515  */
2516  return tupnatts > 0 && tupnatts <= nkeyatts;
2517 }
signed short int16
Definition: c.h:345
#define P_IGNORE(opaque)
Definition: nbtree.h:194
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
ItemPointerData t_tid
Definition: itup.h:37
#define BTreeTupleGetHeapTID(itup)
Definition: nbtree.h:346
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:327
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:828
#define INDEX_ALT_TID_MASK
Definition: nbtree.h:293
#define FirstOffsetNumber
Definition: off.h:27
IndexTupleData * IndexTuple
Definition: itup.h:53
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:426
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:433
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
#define Assert(condition)
Definition: c.h:732
#define INDEX_MAX_KEYS
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define BT_N_KEYS_OFFSET_MASK
Definition: nbtree.h:297
#define ItemPointerGetOffsetNumber(pointer)
Definition: itemptr.h:117
#define P_HIKEY
Definition: nbtree.h:217
unsigned short t_info
Definition: itup.h:49
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_check_third_page()

void _bt_check_third_page ( Relation  rel,
Relation  heap,
bool  needheaptidspace,
Page  page,
IndexTuple  newtup 
)

Definition at line 2532 of file nbtutils.c.

References BTMaxItemSize, BTMaxItemSizeNoHeapTid, BTREE_NOVAC_VERSION, BTREE_VERSION, elog, ereport, errcode(), errdetail(), errhint(), errmsg(), ERROR, errtableconstraint(), IndexTupleSize, ItemPointerGetBlockNumber, ItemPointerGetOffsetNumber, MAXALIGN, P_ISLEAF, PageGetSpecialPointer, RelationGetRelationName, and IndexTupleData::t_tid.

Referenced by _bt_buildadd(), and _bt_findinsertloc().

/*
 * _bt_check_third_page -- enforce the btree "1/3 of a page" tuple size limit.
 *
 * NOTE(review): this is a Doxygen-rendered excerpt of nbtutils.c:2532, not
 * compilable source. Gaps in the embedded line numbering (2568, 2571, 2576
 * are absent below) show that hyperlinked lines were dropped during
 * extraction — consult the real nbtutils.c before relying on this body.
 *
 * Returns without error if newtup fits; otherwise raises ERROR. The limit
 * checked depends on needheaptidspace: v4 (heapkeyspace) indexes reserve
 * room for a heap TID in pivot tuples, so their leaf-tuple limit
 * (BTMaxItemSize) is slightly lower than the v2/v3 limit
 * (BTMaxItemSizeNoHeapTid).
 */
2534 {
2535  Size itemsz;
2536  BTPageOpaque opaque;
2537 
2538  itemsz = MAXALIGN(IndexTupleSize(newtup));
2539 
2540  /* Double check item size against limit */
2541  if (itemsz <= BTMaxItemSize(page))
2542  return;
2543 
2544  /*
2545  * Tuple is probably too large to fit on page, but it's possible that the
2546  * index uses version 2 or version 3, or that page is an internal page, in
2547  * which case a slightly higher limit applies.
2548  */
2549  if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page))
2550  return;
2551 
2552  /*
2553  * Internal page insertions cannot fail here, because that would mean that
2554  * an earlier leaf level insertion that should have failed didn't
2555  */
2556  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2557  if (!P_ISLEAF(opaque))
2558  elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
2559  itemsz, RelationGetRelationName(rel));
2560 
 /* NOTE(review): lines 2568, 2571 and 2576 (per the References list,
  * likely the RelationGetRelationName/errtableconstraint and
  * ItemPointerGetOffsetNumber arguments) were dropped by extraction. */
2561  ereport(ERROR,
2562  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2563  errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
2564  itemsz,
2565  needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
2566  needheaptidspace ? BTMaxItemSize(page) :
2567  BTMaxItemSizeNoHeapTid(page),
2569  errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
2570  ItemPointerGetBlockNumber(&newtup->t_tid),
2572  RelationGetRelationName(heap)),
2573  errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
2574  "Consider a function index of an MD5 hash of the value, "
2575  "or use full text indexing."),
2577 }
int errhint(const char *fmt,...)
Definition: elog.c:974
#define BTREE_VERSION
Definition: nbtree.h:133
ItemPointerData t_tid
Definition: itup.h:37
#define BTMaxItemSizeNoHeapTid(page)
Definition: nbtree.h:153
int errcode(int sqlerrcode)
Definition: elog.c:570
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
int errtableconstraint(Relation rel, const char *conname)
Definition: relcache.c:5243
#define ERROR
Definition: elog.h:43
int errdetail(const char *fmt,...)
Definition: elog.c:860
#define RelationGetRelationName(relation)
Definition: rel.h:448
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:135
#define ereport(elevel, rest)
Definition: elog.h:141
size_t Size
Definition: c.h:466
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define MAXALIGN(LEN)
Definition: c.h:685
#define ItemPointerGetOffsetNumber(pointer)
Definition: itemptr.h:117
#define BTMaxItemSize(page)
Definition: nbtree.h:147
int errmsg(const char *fmt,...)
Definition: elog.c:784
#define elog(elevel,...)
Definition: elog.h:226
#define ItemPointerGetBlockNumber(pointer)
Definition: itemptr.h:98
#define IndexTupleSize(itup)
Definition: itup.h:71
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_checkkeys()

bool _bt_checkkeys ( IndexScanDesc  scan,
IndexTuple  tuple,
int  tupnatts,
ScanDirection  dir,
bool continuescan 
)

Definition at line 1357 of file nbtutils.c.

References _bt_check_rowcompare(), Assert, BTreeTupleGetNAtts, DatumGetBool, FunctionCall2Coll(), index_getattr, IndexScanDescData::indexRelation, sort-test::key, BTScanOpaqueData::keyData, BTScanOpaqueData::numberOfKeys, IndexScanDescData::opaque, RelationGetDescr, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_NULLS_FIRST, SK_BT_REQBKWD, SK_BT_REQFWD, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, SK_ISNULL, SK_ROW_HEADER, SK_SEARCHNOTNULL, SK_SEARCHNULL, and test().

Referenced by _bt_readpage().

/*
 * _bt_checkkeys -- test whether an index tuple satisfies all scan keys.
 *
 * NOTE(review): Doxygen-rendered excerpt of nbtutils.c:1357; gaps in the
 * embedded numbering (1388, 1416, 1427, 1430, 1454, 1470, 1496, 1499) show
 * that hyperlinked lines — mostly the ScanDirectionIsForward/Backward
 * conditions paired with the dangling "&&" lines below — were dropped by
 * extraction. The real nbtutils.c is authoritative.
 *
 * Returns true if the tuple passes all quals. On a failed required qual,
 * *continuescan is set false to tell _bt_readpage the scan can stop.
 */
1359 {
1360  TupleDesc tupdesc;
1361  BTScanOpaque so;
1362  int keysz;
1363  int ikey;
1364  ScanKey key;
1365 
1366  Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
1367 
1368  *continuescan = true; /* default assumption */
1369 
1370  tupdesc = RelationGetDescr(scan->indexRelation);
1371  so = (BTScanOpaque) scan->opaque;
1372  keysz = so->numberOfKeys;
1373 
 /* Walk every scan key; any single failure rejects the tuple. */
1374  for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
1375  {
1376  Datum datum;
1377  bool isNull;
1378  Datum test;
1379 
1380  if (key->sk_attno > tupnatts)
1381  {
1382  /*
1383  * This attribute is truncated (must be high key). The value for
1384  * this attribute in the first non-pivot tuple on the page to the
1385  * right could be any possible value. Assume that truncated
1386  * attribute passes the qual.
1387  */
1389  continue;
1390  }
1391 
1392  /* row-comparison keys need special processing */
1393  if (key->sk_flags & SK_ROW_HEADER)
1394  {
1395  if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir,
1396  continuescan))
1397  continue;
1398  return false;
1399  }
1400 
1401  datum = index_getattr(tuple,
1402  key->sk_attno,
1403  tupdesc,
1404  &isNull);
1405 
1406  if (key->sk_flags & SK_ISNULL)
1407  {
1408  /* Handle IS NULL/NOT NULL tests */
1409  if (key->sk_flags & SK_SEARCHNULL)
1410  {
1411  if (isNull)
1412  continue; /* tuple satisfies this qual */
1413  }
1414  else
1415  {
 /* NOTE(review): line 1416 dropped by extraction — presumably the
  * SK_SEARCHNOTNULL branch marker/assert; confirm against source. */
1417  if (!isNull)
1418  continue; /* tuple satisfies this qual */
1419  }
1420 
1421  /*
1422  * Tuple fails this qual. If it's a required qual for the current
1423  * scan direction, then we can conclude no further tuples will
1424  * pass, either.
1425  */
 /* NOTE(review): lines 1427 and 1430 (the scan-direction tests that
  * complete these "&&" conditions) were dropped by extraction. */
1426  if ((key->sk_flags & SK_BT_REQFWD) &&
1428  *continuescan = false;
1429  else if ((key->sk_flags & SK_BT_REQBKWD) &&
1431  *continuescan = false;
1432 
1433  /*
1434  * In any case, this indextuple doesn't match the qual.
1435  */
1436  return false;
1437  }
1438 
1439  if (isNull)
1440  {
1441  if (key->sk_flags & SK_BT_NULLS_FIRST)
1442  {
1443  /*
1444  * Since NULLs are sorted before non-NULLs, we know we have
1445  * reached the lower limit of the range of values for this
1446  * index attr. On a backward scan, we can stop if this qual
1447  * is one of the "must match" subset. We can stop regardless
1448  * of whether the qual is > or <, so long as it's required,
1449  * because it's not possible for any future tuples to pass. On
1450  * a forward scan, however, we must keep going, because we may
1451  * have initially positioned to the start of the index.
1452  */
1453  if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
1455  *continuescan = false;
1456  }
1457  else
1458  {
1459  /*
1460  * Since NULLs are sorted after non-NULLs, we know we have
1461  * reached the upper limit of the range of values for this
1462  * index attr. On a forward scan, we can stop if this qual is
1463  * one of the "must match" subset. We can stop regardless of
1464  * whether the qual is > or <, so long as it's required,
1465  * because it's not possible for any future tuples to pass. On
1466  * a backward scan, however, we must keep going, because we
1467  * may have initially positioned to the end of the index.
1468  */
1469  if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
1471  *continuescan = false;
1472  }
1473 
1474  /*
1475  * In any case, this indextuple doesn't match the qual.
1476  */
1477  return false;
1478  }
1479 
 /* Ordinary comparison: invoke the key's support function. */
1480  test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
1481  datum, key->sk_argument);
1482 
1483  if (!DatumGetBool(test))
1484  {
1485  /*
1486  * Tuple fails this qual. If it's a required qual for the current
1487  * scan direction, then we can conclude no further tuples will
1488  * pass, either.
1489  *
1490  * Note: because we stop the scan as soon as any required equality
1491  * qual fails, it is critical that equality quals be used for the
1492  * initial positioning in _bt_first() when they are available. See
1493  * comments in _bt_first().
1494  */
1495  if ((key->sk_flags & SK_BT_REQFWD) &&
1497  *continuescan = false;
1498  else if ((key->sk_flags & SK_BT_REQBKWD) &&
1500  *continuescan = false;
1501 
1502  /*
1503  * In any case, this indextuple doesn't match the qual.
1504  */
1505  return false;
1506  }
1507  }
1508 
1509  /* If we get here, the tuple passes all index quals. */
1510  return true;
1511 }
static void test(void)
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
#define RelationGetDescr(relation)
Definition: rel.h:440
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1150
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:327
Relation indexRelation
Definition: relscan.h:103
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:671
FmgrInfo sk_func
Definition: skey.h:71
#define DatumGetBool(X)
Definition: postgres.h:393
#define SK_SEARCHNOTNULL
Definition: skey.h:122
#define SK_ISNULL
Definition: skey.h:115
#define SK_ROW_HEADER
Definition: skey.h:117
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:682
uintptr_t Datum
Definition: postgres.h:367
static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, ScanDirection dir, bool *continuescan)
Definition: nbtutils.c:1523
#define SK_BT_REQFWD
Definition: nbtree.h:678
int sk_flags
Definition: skey.h:66
#define Assert(condition)
Definition: c.h:732
int numberOfKeys
Definition: nbtree.h:633
#define index_getattr(tup, attnum, tupleDesc, isnull)
Definition: itup.h:100
ScanKey keyData
Definition: nbtree.h:634
Oid sk_collation
Definition: skey.h:70
#define SK_BT_REQBKWD
Definition: nbtree.h:679
Datum sk_argument
Definition: skey.h:72
#define SK_SEARCHNULL
Definition: skey.h:121
AttrNumber sk_attno
Definition: skey.h:67

◆ _bt_checkpage()

void _bt_checkpage ( Relation  rel,
Buffer  buf 
)

Definition at line 729 of file nbtpage.c.

References BufferGetBlockNumber(), BufferGetPage, ereport, errcode(), errhint(), errmsg(), ERROR, MAXALIGN, PageGetSpecialSize, PageIsNew, and RelationGetRelationName.

Referenced by _bt_doinsert(), _bt_getbuf(), _bt_relandgetbuf(), btvacuumpage(), btvacuumscan(), and palloc_btree_page().

/*
 * _bt_checkpage -- sanity-check a just-read btree page.
 *
 * NOTE(review): Doxygen-rendered excerpt of nbtpage.c:729. Lines 743-744
 * and 754-755 (per the References list, the RelationGetRelationName /
 * BufferGetBlockNumber arguments of the two errmsg calls) were dropped by
 * extraction, so the ereport argument lists below are incomplete.
 *
 * Raises ERROR (ERRCODE_INDEX_CORRUPTED) for an all-zero page or a page
 * whose special area is not exactly sizeof(BTPageOpaqueData).
 */
730 {
731  Page page = BufferGetPage(buf);
732 
733  /*
734  * ReadBuffer verifies that every newly-read page passes
735  * PageHeaderIsValid, which means it either contains a reasonably sane
736  * page header or is all-zero. We have to defend against the all-zero
737  * case, however.
738  */
739  if (PageIsNew(page))
740  ereport(ERROR,
741  (errcode(ERRCODE_INDEX_CORRUPTED),
742  errmsg("index \"%s\" contains unexpected zero page at block %u",
745  errhint("Please REINDEX it.")));
746 
747  /*
748  * Additionally check that the special area looks sane.
749  */
750  if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
751  ereport(ERROR,
752  (errcode(ERRCODE_INDEX_CORRUPTED),
753  errmsg("index \"%s\" contains corrupted page at block %u",
756  errhint("Please REINDEX it.")));
757 }
int errhint(const char *fmt,...)
Definition: elog.c:974
int errcode(int sqlerrcode)
Definition: elog.c:570
#define ERROR
Definition: elog.h:43
static char * buf
Definition: pg_test_fsync.c:68
#define RelationGetRelationName(relation)
Definition: rel.h:448
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define ereport(elevel, rest)
Definition: elog.h:141
#define MAXALIGN(LEN)
Definition: c.h:685
#define PageGetSpecialSize(page)
Definition: bufpage.h:300
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2613
#define PageIsNew(page)
Definition: bufpage.h:229
int errmsg(const char *fmt,...)
Definition: elog.c:784
Pointer Page
Definition: bufpage.h:78

◆ _bt_compare()

int32 _bt_compare ( Relation  rel,
BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)

Definition at line 559 of file nbtsearch.c.

References _bt_check_natts(), Assert, BTreeTupleGetHeapTID, BTreeTupleGetNAtts, DatumGetInt32, FunctionCall2Coll(), BTScanInsertData::heapkeyspace, i, index_getattr, IndexRelationGetNumberOfKeyAttributes, INVERT_COMPARE_RESULT, ItemPointerCompare(), BTScanInsertData::keysz, Min, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, BTScanInsertData::pivotsearch, RelationGetDescr, BTScanInsertData::scankeys, BTScanInsertData::scantid, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_binsrch_insert(), _bt_check_unique(), _bt_doinsert(), _bt_findinsertloc(), _bt_moveright(), bt_rootdescend(), invariant_g_offset(), invariant_l_nontarget_offset(), invariant_l_offset(), and invariant_leq_offset().

/*
 * _bt_compare -- compare an insertion scankey against a tuple on a page.
 *
 * NOTE(review): Doxygen-rendered excerpt of nbtsearch.c:559. Lines 565
 * (the `opaque` initialization that line 580 depends on), 573, 717 and 721
 * were dropped by extraction — consult the real nbtsearch.c.
 *
 * Result is <0 / 0 / >0 as the scankey is less than, equal to, or greater
 * than the indicated tuple. Truncated attributes (and a truncated heap
 * TID) compare as minus infinity, except in the pivotsearch case handled
 * in the long comment below.
 */
563 {
564  TupleDesc itupdesc = RelationGetDescr(rel);
566  IndexTuple itup;
567  ItemPointer heapTid;
568  ScanKey scankey;
569  int ncmpkey;
570  int ntupatts;
571 
572  Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
574  Assert(key->heapkeyspace || key->scantid == NULL);
575 
576  /*
577  * Force result ">" if target item is first data item on an internal page
578  * --- see NOTE above.
579  */
580  if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
581  return 1;
582 
583  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
584  ntupatts = BTreeTupleGetNAtts(itup, rel);
585 
586  /*
587  * The scan key is set up with the attribute number associated with each
588  * term in the key. It is important that, if the index is multi-key, the
589  * scan contain the first k key attributes, and that they be in order. If
590  * you think about how multi-key ordering works, you'll understand why
591  * this is.
592  *
593  * We don't test for violation of this condition here, however. The
594  * initial setup for the index scan had better have gotten it right (see
595  * _bt_first).
596  */
597 
598  ncmpkey = Min(ntupatts, key->keysz);
599  Assert(key->heapkeyspace || ncmpkey == key->keysz);
600  scankey = key->scankeys;
601  for (int i = 1; i <= ncmpkey; i++)
602  {
603  Datum datum;
604  bool isNull;
605  int32 result;
606 
607  datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
608 
609  /* see comments about NULLs handling in btbuild */
610  if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
611  {
612  if (isNull)
613  result = 0; /* NULL "=" NULL */
614  else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
615  result = -1; /* NULL "<" NOT_NULL */
616  else
617  result = 1; /* NULL ">" NOT_NULL */
618  }
619  else if (isNull) /* key is NOT_NULL and item is NULL */
620  {
621  if (scankey->sk_flags & SK_BT_NULLS_FIRST)
622  result = 1; /* NOT_NULL ">" NULL */
623  else
624  result = -1; /* NOT_NULL "<" NULL */
625  }
626  else
627  {
628  /*
629  * The sk_func needs to be passed the index value as left arg and
630  * the sk_argument as right arg (they might be of different
631  * types). Since it is convenient for callers to think of
632  * _bt_compare as comparing the scankey to the index item, we have
633  * to flip the sign of the comparison result. (Unless it's a DESC
634  * column, in which case we *don't* flip the sign.)
635  */
636  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
637  scankey->sk_collation,
638  datum,
639  scankey->sk_argument));
640 
641  if (!(scankey->sk_flags & SK_BT_DESC))
642  INVERT_COMPARE_RESULT(result);
643  }
644 
645  /* if the keys are unequal, return the difference */
646  if (result != 0)
647  return result;
648 
649  scankey++;
650  }
651 
652  /*
653  * All non-truncated attributes (other than heap TID) were found to be
654  * equal. Treat truncated attributes as minus infinity when scankey has a
655  * key attribute value that would otherwise be compared directly.
656  *
657  * Note: it doesn't matter if ntupatts includes non-key attributes;
658  * scankey won't, so explicitly excluding non-key attributes isn't
659  * necessary.
660  */
661  if (key->keysz > ntupatts)
662  return 1;
663 
664  /*
665  * Use the heap TID attribute and scantid to try to break the tie. The
666  * rules are the same as any other key attribute -- only the
667  * representation differs.
668  */
669  heapTid = BTreeTupleGetHeapTID(itup);
670  if (key->scantid == NULL)
671  {
672  /*
673  * Most searches have a scankey that is considered greater than a
674  * truncated pivot tuple if and when the scankey has equal values for
675  * attributes up to and including the least significant untruncated
676  * attribute in tuple.
677  *
678  * For example, if an index has the minimum two attributes (single
679  * user key attribute, plus heap TID attribute), and a page's high key
680  * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
681  * will not descend to the page to the left. The search will descend
682  * right instead. The truncated attribute in pivot tuple means that
683  * all non-pivot tuples on the page to the left are strictly < 'foo',
684  * so it isn't necessary to descend left. In other words, search
685  * doesn't have to descend left because it isn't interested in a match
686  * that has a heap TID value of -inf.
687  *
688  * However, some searches (pivotsearch searches) actually require that
689  * we descend left when this happens. -inf is treated as a possible
690  * match for omitted scankey attribute(s). This is needed by page
691  * deletion, which must re-find leaf pages that are targets for
692  * deletion using their high keys.
693  *
694  * Note: the heap TID part of the test ensures that scankey is being
695  * compared to a pivot tuple with one or more truncated key
696  * attributes.
697  *
698  * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
699  * left here, since they have no heap TID attribute (and cannot have
700  * any -inf key values in any case, since truncation can only remove
701  * non-key attributes). !heapkeyspace searches must always be
702  * prepared to deal with matches on both sides of the pivot once the
703  * leaf level is reached.
704  */
705  if (key->heapkeyspace && !key->pivotsearch &&
706  key->keysz == ntupatts && heapTid == NULL)
707  return 1;
708 
709  /* All provided scankey arguments found to be equal */
710  return 0;
711  }
712 
713  /*
714  * Treat truncated heap TID as minus infinity, since scankey has a key
715  * attribute value (scantid) that would otherwise be compared directly
716  */
 /* NOTE(review): lines 717 and 721 (likely Asserts) dropped by extraction. */
718  if (heapTid == NULL)
719  return 1;
720 
722  return ItemPointerCompare(key->scantid, heapTid);
723 }
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:52
#define DatumGetInt32(X)
Definition: postgres.h:472
#define RelationGetDescr(relation)
Definition: rel.h:440
ItemPointer scantid
Definition: nbtree.h:476
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
#define Min(x, y)
Definition: c.h:890
#define BTreeTupleGetHeapTID(itup)
Definition: nbtree.h:346
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1150
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:327
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
signed int int32
Definition: c.h:346
IndexTupleData * IndexTuple
Definition: itup.h:53
FmgrInfo sk_func
Definition: skey.h:71
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:433
#define SK_ISNULL
Definition: skey.h:115
bool pivotsearch
Definition: nbtree.h:475
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:682
uintptr_t Datum
Definition: postgres.h:367
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:2382
int sk_flags
Definition: skey.h:66
#define Assert(condition)
Definition: c.h:732
#define SK_BT_DESC
Definition: nbtree.h:681
bool heapkeyspace
Definition: nbtree.h:472
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define index_getattr(tup, attnum, tupleDesc, isnull)
Definition: itup.h:100
ScanKeyData scankeys[INDEX_MAX_KEYS]
Definition: nbtree.h:478
Oid sk_collation
Definition: skey.h:70
int i
Datum sk_argument
Definition: skey.h:72
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1030
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
AttrNumber sk_attno
Definition: skey.h:67
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_delitems_delete()

void _bt_delitems_delete ( Relation  rel,
Buffer  buf,
OffsetNumber itemnos,
int  nitems,
Relation  heapRel 
)

Definition at line 1096 of file nbtpage.c.

References Assert, BTP_HAS_GARBAGE, BTPageOpaqueData::btpo_flags, BufferGetPage, END_CRIT_SECTION, index_compute_xid_horizon_for_tuples(), InvalidTransactionId, xl_btree_delete::latestRemovedXid, MarkBufferDirty(), xl_btree_delete::nitems, PageGetSpecialPointer, PageIndexMultiDelete(), PageSetLSN, REGBUF_STANDARD, RelationNeedsWAL, SizeOfBtreeDelete, START_CRIT_SECTION, XLOG_BTREE_DELETE, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), XLogRegisterData(), and XLogStandbyInfoActive.

Referenced by _bt_vacuum_one_page().

/*
 * _bt_delitems_delete -- remove known-dead (LP_DEAD) items from a page,
 * WAL-logging the deletion. Called during opportunistic microvacuum
 * (_bt_vacuum_one_page), NOT by VACUUM proper.
 *
 * NOTE(review): Doxygen-rendered excerpt of nbtpage.c:1096. Lines 1107,
 * 1109, 1113, 1133 and 1145 were dropped by extraction — per the
 * References list these are the XLogStandbyInfoActive guard, the
 * index_compute_xid_horizon_for_tuples call head, START_CRIT_SECTION,
 * MarkBufferDirty, and XLogRegisterBuffer. Consult the real nbtpage.c.
 */
1099 {
1100  Page page = BufferGetPage(buf);
1101  BTPageOpaque opaque;
1102  TransactionId latestRemovedXid = InvalidTransactionId;
1103 
1104  /* Shouldn't be called unless there's something to do */
1105  Assert(nitems > 0);
1106 
 /* Compute latestRemovedXid for hot-standby conflict resolution. */
1108  latestRemovedXid =
1110  itemnos, nitems);
1111 
1112  /* No ereport(ERROR) until changes are logged */
1114 
1115  /* Fix the page */
1116  PageIndexMultiDelete(page, itemnos, nitems);
1117 
1118  /*
1119  * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID,
1120  * because this is not called by VACUUM.
1121  */
1122 
1123  /*
1124  * Mark the page as not containing any LP_DEAD items. This is not
1125  * certainly true (there might be some that have recently been marked, but
1126  * weren't included in our target-item list), but it will almost always be
1127  * true and it doesn't seem worth an additional page scan to check it.
1128  * Remember that BTP_HAS_GARBAGE is only a hint anyway.
1129  */
1130  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1131  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1132 
1134 
1135  /* XLOG stuff */
1136  if (RelationNeedsWAL(rel))
1137  {
1138  XLogRecPtr recptr;
1139  xl_btree_delete xlrec_delete;
1140 
1141  xlrec_delete.latestRemovedXid = latestRemovedXid;
1142  xlrec_delete.nitems = nitems;
1143 
1144  XLogBeginInsert();
1146  XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);
1147 
1148  /*
1149  * We need the target-offsets array whether or not we store the whole
1150  * buffer, to allow us to find the latestRemovedXid on a standby
1151  * server.
1152  */
1153  XLogRegisterData((char *) itemnos, nitems * sizeof(OffsetNumber));
1154 
1155  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
1156 
1157  PageSetLSN(page, recptr);
1158  }
1159 
1160  END_CRIT_SECTION();
1161 }
TransactionId latestRemovedXid
Definition: nbtxlog.h:129
uint32 TransactionId
Definition: c.h:507
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1458
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:213
#define END_CRIT_SECTION()
Definition: miscadmin.h:134
#define START_CRIT_SECTION()
Definition: miscadmin.h:132
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
uint16 OffsetNumber
Definition: off.h:24
static char * buf
Definition: pg_test_fsync.c:68
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define InvalidTransactionId
Definition: transam.h:31
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define XLOG_BTREE_DELETE
Definition: nbtxlog.h:32
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:323
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:415
#define XLogStandbyInfoActive()
Definition: xlog.h:195
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:732
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:835
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define RelationNeedsWAL(relation)
Definition: rel.h:516
#define SizeOfBtreeDelete
Definition: nbtxlog.h:135
void XLogBeginInsert(void)
Definition: xloginsert.c:120
uint16 btpo_flags
Definition: nbtree.h:64
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
#define BTP_HAS_GARBAGE
Definition: nbtree.h:77
Pointer Page
Definition: bufpage.h:78
TransactionId index_compute_xid_horizon_for_tuples(Relation irel, Relation hrel, Buffer ibuf, OffsetNumber *itemnos, int nitems)
Definition: genam.c:281

◆ _bt_delitems_vacuum()

void _bt_delitems_vacuum ( Relation  rel,
Buffer  buf,
OffsetNumber itemnos,
int  nitems,
BlockNumber  lastBlockVacuumed 
)

Definition at line 1023 of file nbtpage.c.

References BTP_HAS_GARBAGE, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BufferGetPage, END_CRIT_SECTION, xl_btree_vacuum::lastBlockVacuumed, MarkBufferDirty(), PageGetSpecialPointer, PageIndexMultiDelete(), PageSetLSN, REGBUF_STANDARD, RelationNeedsWAL, SizeOfBtreeVacuum, START_CRIT_SECTION, XLOG_BTREE_VACUUM, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by btvacuumpage(), and btvacuumscan().

/*
 * _bt_delitems_vacuum -- remove items from a page on behalf of VACUUM,
 * clear the vacuum cycle ID and the BTP_HAS_GARBAGE hint, and WAL-log
 * the change as XLOG_BTREE_VACUUM.
 *
 * NOTE(review): Doxygen-rendered excerpt of nbtpage.c:1023. Lines 1031,
 * 1053 and 1064 were dropped by extraction — per the References list
 * these are START_CRIT_SECTION, MarkBufferDirty, and XLogRegisterBuffer.
 * Consult the real nbtpage.c before relying on this body.
 */
1026 {
1027  Page page = BufferGetPage(buf);
1028  BTPageOpaque opaque;
1029 
1030  /* No ereport(ERROR) until changes are logged */
1032 
1033  /* Fix the page */
1034  if (nitems > 0)
1035  PageIndexMultiDelete(page, itemnos, nitems);
1036 
1037  /*
1038  * We can clear the vacuum cycle ID since this page has certainly been
1039  * processed by the current vacuum scan.
1040  */
1041  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1042  opaque->btpo_cycleid = 0;
1043 
1044  /*
1045  * Mark the page as not containing any LP_DEAD items. This is not
1046  * certainly true (there might be some that have recently been marked, but
1047  * weren't included in our target-item list), but it will almost always be
1048  * true and it doesn't seem worth an additional page scan to check it.
1049  * Remember that BTP_HAS_GARBAGE is only a hint anyway.
1050  */
1051  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1052 
1054 
1055  /* XLOG stuff */
1056  if (RelationNeedsWAL(rel))
1057  {
1058  XLogRecPtr recptr;
1059  xl_btree_vacuum xlrec_vacuum;
1060 
1061  xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed;
1062 
1063  XLogBeginInsert();
1065  XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
1066 
1067  /*
1068  * The target-offsets array is not in the buffer, but pretend that it
1069  * is. When XLogInsert stores the whole buffer, the offsets array
1070  * need not be stored too.
1071  */
1072  if (nitems > 0)
1073  XLogRegisterBufData(0, (char *) itemnos, nitems * sizeof(OffsetNumber));
1074 
1075  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
1076 
1077  PageSetLSN(page, recptr);
1078  }
1079 
1080  END_CRIT_SECTION();
1081 }
BlockNumber lastBlockVacuumed
Definition: nbtxlog.h:174
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:361
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1458
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:213
#define END_CRIT_SECTION()
Definition: miscadmin.h:134
#define START_CRIT_SECTION()
Definition: miscadmin.h:132
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
uint16 OffsetNumber
Definition: off.h:24
#define SizeOfBtreeVacuum
Definition: nbtxlog.h:179
BTCycleId btpo_cycleid
Definition: nbtree.h:65
static char * buf
Definition: pg_test_fsync.c:68
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define XLOG_BTREE_VACUUM
Definition: nbtxlog.h:37
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:323
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:415
uint64 XLogRecPtr
Definition: xlogdefs.h:21
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:835
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define RelationNeedsWAL(relation)
Definition: rel.h:516
void XLogBeginInsert(void)
Definition: xloginsert.c:120
uint16 btpo_flags
Definition: nbtree.h:64
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
#define BTP_HAS_GARBAGE
Definition: nbtree.h:77
Pointer Page
Definition: bufpage.h:78

◆ _bt_doinsert()

bool _bt_doinsert ( Relation  rel,
IndexTuple  itup,
IndexUniqueCheck  checkUnique,
Relation  heapRel 
)

Definition at line 79 of file nbtinsert.c.

References _bt_check_unique(), _bt_checkpage(), _bt_compare(), _bt_findinsertloc(), _bt_freestack(), _bt_insertonpg(), _bt_mkscankey(), _bt_relbuf(), _bt_search(), BTScanInsertData::anynullkeys, Assert, BTInsertStateData::bounds_valid, BT_WRITE, buf, BTInsertStateData::buf, BufferGetPage, CheckForSerializableConflictIn(), ConditionalLockBuffer(), BTScanInsertData::heapkeyspace, IndexTupleSize, InvalidBlockNumber, InvalidBuffer, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, MAXALIGN, P_FIRSTDATAKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_ISLEAF, P_RIGHTMOST, PageGetFreeSpace(), PageGetMaxOffsetNumber, PageGetSpecialPointer, pfree(), ReadBuffer(), RelationGetTargetBlock, RelationSetTargetBlock, ReleaseBuffer(), BTScanInsertData::scantid, SpeculativeInsertionWait(), IndexTupleData::t_tid, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_NO, XactLockTableWait(), and XLTW_InsertIndex.

Referenced by btinsert().

81 {
82  bool is_unique = false;
83  BTInsertStateData insertstate;
84  BTScanInsert itup_key;
85  BTStack stack = NULL;
86  Buffer buf;
87  bool fastpath;
88  bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
89 
90  /* we need an insertion scan key to do our search, so build one */
91  itup_key = _bt_mkscankey(rel, itup);
92 
93  if (checkingunique)
94  {
95  if (!itup_key->anynullkeys)
96  {
97  /* No (heapkeyspace) scantid until uniqueness established */
98  itup_key->scantid = NULL;
99  }
100  else
101  {
102  /*
103  * Scan key for new tuple contains NULL key values. Bypass
104  * checkingunique steps. They are unnecessary because core code
105  * considers NULL unequal to every value, including NULL.
106  *
107  * This optimization avoids O(N^2) behavior within the
108  * _bt_findinsertloc() heapkeyspace path when a unique index has a
109  * large number of "duplicates" with NULL key values.
110  */
111  checkingunique = false;
112  /* Tuple is unique in the sense that core code cares about */
113  Assert(checkUnique != UNIQUE_CHECK_EXISTING);
114  is_unique = true;
115  }
116  }
117 
118  /*
119  * Fill in the BTInsertState working area, to track the current page and
120  * position within the page to insert on
121  */
122  insertstate.itup = itup;
123  /* PageAddItem will MAXALIGN(), but be consistent */
124  insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
125  insertstate.itup_key = itup_key;
126  insertstate.bounds_valid = false;
127  insertstate.buf = InvalidBuffer;
128 
129  /*
130  * It's very common to have an index on an auto-incremented or
131  * monotonically increasing value. In such cases, every insertion happens
132  * towards the end of the index. We try to optimize that case by caching
133  * the right-most leaf of the index. If our cached block is still the
134  * rightmost leaf, has enough free space to accommodate a new entry and
135  * the insertion key is strictly greater than the first key in this page,
136  * then we can safely conclude that the new key will be inserted in the
137  * cached block. So we simply search within the cached block and insert
138  * the key at the appropriate location. We call it a fastpath.
139  *
140  * Testing has revealed, though, that the fastpath can result in increased
141  * contention on the exclusive-lock on the rightmost leaf page. So we
142  * conditionally check if the lock is available. If it's not available
143  * then we simply abandon the fastpath and take the regular path. This
144  * makes sense because unavailability of the lock also signals that some
145  * other backend might be concurrently inserting into the page, thus
146  * reducing our chances to finding an insertion place in this page.
147  */
148 top:
149  fastpath = false;
 150  if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
 151  {
152  Page page;
153  BTPageOpaque lpageop;
154 
155  /*
156  * Conditionally acquire exclusive lock on the buffer before doing any
157  * checks. If we don't get the lock, we simply follow slowpath. If we
158  * do get the lock, this ensures that the index state cannot change,
159  * as far as the rightmost part of the index is concerned.
160  */
161  buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
162 
163  if (ConditionalLockBuffer(buf))
164  {
165  _bt_checkpage(rel, buf);
166 
167  page = BufferGetPage(buf);
168 
169  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
170 
171  /*
172  * Check if the page is still the rightmost leaf page, has enough
173  * free space to accommodate the new tuple, and the insertion scan
174  * key is strictly greater than the first key on the page.
175  */
176  if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) &&
177  !P_IGNORE(lpageop) &&
178  (PageGetFreeSpace(page) > insertstate.itemsz) &&
179  PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) &&
180  _bt_compare(rel, itup_key, page, P_FIRSTDATAKEY(lpageop)) > 0)
181  {
182  /*
183  * The right-most block should never have an incomplete split.
184  * But be paranoid and check for it anyway.
185  */
186  Assert(!P_INCOMPLETE_SPLIT(lpageop));
187  fastpath = true;
188  }
189  else
190  {
191  _bt_relbuf(rel, buf);
192 
193  /*
194  * Something did not work out. Just forget about the cached
195  * block and follow the normal path. It might be set again if
196  * the conditions are favourable.
197  */
 198  RelationSetTargetBlock(rel, InvalidBlockNumber);
 199  }
200  }
201  else
202  {
203  ReleaseBuffer(buf);
204 
205  /*
206  * If someone's holding a lock, it's likely to change anyway, so
207  * don't try again until we get an updated rightmost leaf.
208  */
 209  RelationSetTargetBlock(rel, InvalidBlockNumber);
 210  }
211  }
212 
213  if (!fastpath)
214  {
215  /*
216  * Find the first page containing this key. Buffer returned by
217  * _bt_search() is locked in exclusive mode.
218  */
219  stack = _bt_search(rel, itup_key, &buf, BT_WRITE, NULL);
220  }
221 
222  insertstate.buf = buf;
223  buf = InvalidBuffer; /* insertstate.buf now owns the buffer */
224 
225  /*
226  * If we're not allowing duplicates, make sure the key isn't already in
227  * the index.
228  *
229  * NOTE: obviously, _bt_check_unique can only detect keys that are already
230  * in the index; so it cannot defend against concurrent insertions of the
231  * same key. We protect against that by means of holding a write lock on
232  * the first page the value could be on, with omitted/-inf value for the
233  * implicit heap TID tiebreaker attribute. Any other would-be inserter of
234  * the same key must acquire a write lock on the same page, so only one
235  * would-be inserter can be making the check at one time. Furthermore,
236  * once we are past the check we hold write locks continuously until we
237  * have performed our insertion, so no later inserter can fail to see our
238  * insertion. (This requires some care in _bt_findinsertloc.)
239  *
240  * If we must wait for another xact, we release the lock while waiting,
241  * and then must start over completely.
242  *
243  * For a partial uniqueness check, we don't wait for the other xact. Just
244  * let the tuple in and return false for possibly non-unique, or true for
245  * definitely unique.
246  */
247  if (checkingunique)
248  {
249  TransactionId xwait;
250  uint32 speculativeToken;
251 
252  xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
253  &is_unique, &speculativeToken);
254 
255  if (TransactionIdIsValid(xwait))
256  {
257  /* Have to wait for the other guy ... */
258  _bt_relbuf(rel, insertstate.buf);
259  insertstate.buf = InvalidBuffer;
260 
261  /*
262  * If it's a speculative insertion, wait for it to finish (ie. to
263  * go ahead with the insertion, or kill the tuple). Otherwise
264  * wait for the transaction to finish as usual.
265  */
266  if (speculativeToken)
267  SpeculativeInsertionWait(xwait, speculativeToken);
268  else
269  XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
270 
271  /* start over... */
272  if (stack)
273  _bt_freestack(stack);
274  goto top;
275  }
276 
277  /* Uniqueness is established -- restore heap tid as scantid */
278  if (itup_key->heapkeyspace)
279  itup_key->scantid = &itup->t_tid;
280  }
281 
282  if (checkUnique != UNIQUE_CHECK_EXISTING)
283  {
284  OffsetNumber newitemoff;
285 
286  /*
287  * The only conflict predicate locking cares about for indexes is when
288  * an index tuple insert conflicts with an existing lock. We don't
289  * know the actual page we're going to insert on for sure just yet in
290  * checkingunique and !heapkeyspace cases, but it's okay to use the
291  * first page the value could be on (with scantid omitted) instead.
292  */
293  CheckForSerializableConflictIn(rel, NULL, insertstate.buf);
294 
295  /*
296  * Do the insertion. Note that insertstate contains cached binary
297  * search bounds established within _bt_check_unique when insertion is
298  * checkingunique.
299  */
300  newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
301  stack, heapRel);
302  _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
303  itup, newitemoff, false);
304  }
305  else
306  {
307  /* just release the buffer */
308  _bt_relbuf(rel, insertstate.buf);
309  }
310 
311  /* be tidy */
312  if (stack)
313  _bt_freestack(stack);
314  pfree(itup_key);
315 
316  return is_unique;
317 }
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:163
static void _bt_insertonpg(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, bool split_only_page)
Definition: nbtinsert.c:932
#define P_IGNORE(opaque)
Definition: nbtree.h:194
bool bounds_valid
Definition: nbtree.h:508
uint32 TransactionId
Definition: c.h:507
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:85
ItemPointer scantid
Definition: nbtree.h:476
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
ItemPointerData t_tid
Definition: itup.h:37
static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, BTStack stack, Relation heapRel)
Definition: nbtinsert.c:683
#define InvalidBuffer
Definition: buf.h:25
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3353
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:196
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
#define RelationGetTargetBlock(relation)
Definition: rel.h:499
void CheckForSerializableConflictIn(Relation relation, HeapTuple tuple, Buffer buffer)
Definition: predicate.c:4442
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:581
uint16 OffsetNumber
Definition: off.h:24
void pfree(void *pointer)
Definition: mcxt.c:1031
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:729
void SpeculativeInsertionWait(TransactionId xid, uint32 token)
Definition: lmgr.c:781
static char * buf
Definition: pg_test_fsync.c:68
unsigned int uint32
Definition: c.h:358
IndexTuple itup
Definition: nbtree.h:496
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:559
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:3616
bool anynullkeys
Definition: nbtree.h:473
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:951
void XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, XLTW_Oper oper)
Definition: lmgr.c:624
BTScanInsert itup_key
Definition: nbtree.h:498
#define Assert(condition)
Definition: c.h:732
bool heapkeyspace
Definition: nbtree.h:472
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:596
#define RelationSetTargetBlock(relation, targblock)
Definition: rel.h:506
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define InvalidBlockNumber
Definition: block.h:33
#define MAXALIGN(LEN)
Definition: c.h:685
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition: nbtinsert.c:343
#define TransactionIdIsValid(xid)
Definition: transam.h:41
#define BT_WRITE
Definition: nbtree.h:403
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:92
Pointer Page
Definition: bufpage.h:78
#define IndexTupleSize(itup)
Definition: itup.h:71
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_end_vacuum()

void _bt_end_vacuum ( Relation  rel)

Definition at line 1950 of file nbtutils.c.

References LockRelId::dbId, i, LockInfoData::lockRelId, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTVacInfo::num_vacuums, RelationData::rd_lockInfo, LockRelId::relId, BTOneVacInfo::relid, and BTVacInfo::vacuums.

Referenced by _bt_end_vacuum_callback(), and btbulkdelete().

1951 {
1952  int i;
1953 
1954  LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
1955 
1956  /* Find the array entry */
1957  for (i = 0; i < btvacinfo->num_vacuums; i++)
1958  {
1959  BTOneVacInfo *vac = &btvacinfo->vacuums[i];
1960 
1961  if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
1962  vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
1963  {
1964  /* Remove it by shifting down the last entry */
1965  *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
 1966  btvacinfo->num_vacuums--;
 1967  break;
1968  }
1969  }
1970 
1971  LWLockRelease(BtreeVacuumLock);
1972 }
LockRelId lockRelId
Definition: rel.h:44
static BTVacInfo * btvacinfo
Definition: nbtutils.c:1846
LockRelId relid
Definition: nbtutils.c:1834
Oid dbId
Definition: rel.h:39
BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtutils.c:1843
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1726
LockInfoData rd_lockInfo
Definition: rel.h:87
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1122
int num_vacuums
Definition: nbtutils.c:1841
int i
Oid relId
Definition: rel.h:38

◆ _bt_end_vacuum_callback()

void _bt_end_vacuum_callback ( int  code,
Datum  arg 
)

Definition at line 1978 of file nbtutils.c.

References _bt_end_vacuum(), and DatumGetPointer.

Referenced by btbulkdelete().

1979 {
 1980  _bt_end_vacuum((Relation) DatumGetPointer(arg));
 1981 }
void _bt_end_vacuum(Relation rel)
Definition: nbtutils.c:1950
#define DatumGetPointer(X)
Definition: postgres.h:549
void * arg

◆ _bt_findsplitloc()

OffsetNumber _bt_findsplitloc ( Relation  rel,
Page  page,
OffsetNumber  newitemoff,
Size  newitemsz,
IndexTuple  newitem,
bool newitemonleft 
)

Definition at line 127 of file nbtsplitloc.c.

References _bt_afternewitemoff(), _bt_bestsplitloc(), _bt_deltasortsplits(), _bt_recsplitloc(), _bt_strategy(), Assert, BTREE_DEFAULT_FILLFACTOR, BTREE_NONLEAF_FILLFACTOR, BTREE_SINGLEVAL_FILLFACTOR, elog, ERROR, SplitPoint::firstoldonright, i, IndexRelationGetNumberOfKeyAttributes, FindSplitData::interval, FindSplitData::is_leaf, FindSplitData::is_rightmost, ItemIdGetLength, FindSplitData::leftspace, Max, MAX_INTERNAL_INTERVAL, MAX_LEAF_INTERVAL, MAXALIGN, FindSplitData::maxsplits, Min, FindSplitData::minfirstrightsz, FindSplitData::newitem, FindSplitData::newitemoff, SplitPoint::newitemonleft, FindSplitData::newitemsz, FindSplitData::nsplits, OffsetNumberNext, FindSplitData::olddataitemstotal, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_RIGHTMOST, FindSplitData::page, PageGetExactFreeSpace(), PageGetItemId, PageGetMaxOffsetNumber, PageGetPageSize, PageGetSpecialPointer, palloc(), pfree(), FindSplitData::rel, RelationGetFillFactor, RelationGetRelationName, FindSplitData::rightspace, SIZE_MAX, SizeOfPageHeaderData, SPLIT_DEFAULT, SPLIT_MANY_DUPLICATES, SPLIT_SINGLE_VALUE, and FindSplitData::splits.

Referenced by _bt_split().

133 {
134  BTPageOpaque opaque;
135  int leftspace,
136  rightspace,
137  olddataitemstotal,
138  olddataitemstoleft,
139  perfectpenalty,
140  leaffillfactor;
 141  FindSplitData state;
 142  FindSplitStrat strategy;
143  ItemId itemid;
144  OffsetNumber offnum,
145  maxoff,
146  foundfirstright;
147  double fillfactormult;
148  bool usemult;
149  SplitPoint leftpage,
150  rightpage;
151 
152  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
153  maxoff = PageGetMaxOffsetNumber(page);
154 
155  /* Total free space available on a btree page, after fixed overhead */
156  leftspace = rightspace =
 157  PageGetPageSize(page) - SizeOfPageHeaderData -
 158  MAXALIGN(sizeof(BTPageOpaqueData));
159 
160  /* The right page will have the same high key as the old page */
161  if (!P_RIGHTMOST(opaque))
162  {
163  itemid = PageGetItemId(page, P_HIKEY);
164  rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
165  sizeof(ItemIdData));
166  }
167 
168  /* Count up total space in data items before actually scanning 'em */
169  olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page);
170  leaffillfactor = RelationGetFillFactor(rel, BTREE_DEFAULT_FILLFACTOR);
171 
172  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
173  newitemsz += sizeof(ItemIdData);
174  state.rel = rel;
175  state.page = page;
176  state.newitem = newitem;
177  state.newitemsz = newitemsz;
178  state.is_leaf = P_ISLEAF(opaque);
179  state.is_rightmost = P_RIGHTMOST(opaque);
180  state.leftspace = leftspace;
181  state.rightspace = rightspace;
182  state.olddataitemstotal = olddataitemstotal;
183  state.minfirstrightsz = SIZE_MAX;
184  state.newitemoff = newitemoff;
185 
186  /*
187  * maxsplits should never exceed maxoff because there will be at most as
188  * many candidate split points as there are points _between_ tuples, once
189  * you imagine that the new item is already on the original page (the
190  * final number of splits may be slightly lower because not all points
191  * between tuples will be legal).
192  */
193  state.maxsplits = maxoff;
194  state.splits = palloc(sizeof(SplitPoint) * state.maxsplits);
195  state.nsplits = 0;
196 
197  /*
198  * Scan through the data items and calculate space usage for a split at
199  * each possible position
200  */
201  olddataitemstoleft = 0;
202 
203  for (offnum = P_FIRSTDATAKEY(opaque);
204  offnum <= maxoff;
205  offnum = OffsetNumberNext(offnum))
206  {
207  Size itemsz;
208 
209  itemid = PageGetItemId(page, offnum);
210  itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
211 
212  /*
213  * When item offset number is not newitemoff, neither side of the
214  * split can be newitem. Record a split after the previous data item
215  * from original page, but before the current data item from original
216  * page. (_bt_recsplitloc() will reject the split when there are no
217  * previous items, which we rely on.)
218  */
219  if (offnum < newitemoff)
220  _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
221  else if (offnum > newitemoff)
222  _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
223  else
224  {
225  /*
226  * Record a split after all "offnum < newitemoff" original page
227  * data items, but before newitem
228  */
229  _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
230 
231  /*
232  * Record a split after newitem, but before data item from
233  * original page at offset newitemoff/current offset
234  */
235  _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
236  }
237 
238  olddataitemstoleft += itemsz;
239  }
240 
241  /*
242  * Record a split after all original page data items, but before newitem.
243  * (Though only when it's possible that newitem will end up alone on new
244  * right page.)
245  */
246  Assert(olddataitemstoleft == olddataitemstotal);
247  if (newitemoff > maxoff)
248  _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0);
249 
250  /*
251  * I believe it is not possible to fail to find a feasible split, but just
252  * in case ...
253  */
254  if (state.nsplits == 0)
255  elog(ERROR, "could not find a feasible split point for index \"%s\"",
 256  RelationGetRelationName(rel));
 257 
258  /*
259  * Start search for a split point among list of legal split points. Give
260  * primary consideration to equalizing available free space in each half
261  * of the split initially (start with default strategy), while applying
262  * rightmost and split-after-new-item optimizations where appropriate.
263  * Either of the two other fallback strategies may be required for cases
264  * with a large number of duplicates around the original/space-optimal
265  * split point.
266  *
267  * Default strategy gives some weight to suffix truncation in deciding a
268  * split point on leaf pages. It attempts to select a split point where a
269  * distinguishing attribute appears earlier in the new high key for the
270  * left side of the split, in order to maximize the number of trailing
271  * attributes that can be truncated away. Only candidate split points
272  * that imply an acceptable balance of free space on each side are
273  * considered.
274  */
275  if (!state.is_leaf)
276  {
277  /* fillfactormult only used on rightmost page */
278  usemult = state.is_rightmost;
279  fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0;
280  }
281  else if (state.is_rightmost)
282  {
283  /* Rightmost leaf page -- fillfactormult always used */
284  usemult = true;
285  fillfactormult = leaffillfactor / 100.0;
286  }
287  else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
288  {
289  /*
290  * New item inserted at rightmost point among a localized grouping on
291  * a leaf page -- apply "split after new item" optimization, either by
292  * applying leaf fillfactor multiplier, or by choosing the exact split
293  * point that leaves the new item as last on the left. (usemult is set
294  * for us.)
295  */
296  if (usemult)
297  {
298  /* fillfactormult should be set based on leaf fillfactor */
299  fillfactormult = leaffillfactor / 100.0;
300  }
301  else
302  {
303  /* find precise split point after newitemoff */
304  for (int i = 0; i < state.nsplits; i++)
305  {
306  SplitPoint *split = state.splits + i;
307 
308  if (split->newitemonleft &&
309  newitemoff == split->firstoldonright)
310  {
311  pfree(state.splits);
312  *newitemonleft = true;
313  return newitemoff;
314  }
315  }
316 
317  /*
318  * Cannot legally split after newitemoff; proceed with split
319  * without using fillfactor multiplier. This is defensive, and
320  * should never be needed in practice.
321  */
322  fillfactormult = 0.50;
323  }
324  }
325  else
326  {
327  /* Other leaf page. 50:50 page split. */
328  usemult = false;
329  /* fillfactormult not used, but be tidy */
330  fillfactormult = 0.50;
331  }
332 
333  /*
334  * Set an initial limit on the split interval/number of candidate split
335  * points as appropriate. The "Prefix B-Trees" paper refers to this as
336  * sigma l for leaf splits and sigma b for internal ("branch") splits.
337  * It's hard to provide a theoretical justification for the initial size
338  * of the split interval, though it's clear that a small split interval
339  * makes suffix truncation much more effective without noticeably
340  * affecting space utilization over time.
341  */
342  state.interval = Min(Max(1, state.nsplits * 0.05),
343  state.is_leaf ? MAX_LEAF_INTERVAL :
 344  MAX_INTERNAL_INTERVAL);
 345 
346  /*
347  * Save leftmost and rightmost splits for page before original ordinal
348  * sort order is lost by delta/fillfactormult sort
349  */
350  leftpage = state.splits[0];
351  rightpage = state.splits[state.nsplits - 1];
352 
353  /* Give split points a fillfactormult-wise delta, and sort on deltas */
354  _bt_deltasortsplits(&state, fillfactormult, usemult);
355 
356  /*
357  * Determine if default strategy/split interval will produce a
358  * sufficiently distinguishing split, or if we should change strategies.
359  * Alternative strategies change the range of split points that are
360  * considered acceptable (split interval), and possibly change
361  * fillfactormult, in order to deal with pages with a large number of
362  * duplicates gracefully.
363  *
364  * Pass low and high splits for the entire page (actually, they're for an
365  * imaginary version of the page that includes newitem). These are used
366  * when the initial split interval encloses split points that are full of
367  * duplicates, and we need to consider if it's even possible to avoid
368  * appending a heap TID.
369  */
370  perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);
371 
372  if (strategy == SPLIT_DEFAULT)
373  {
374  /*
375  * Default strategy worked out (always works out with internal page).
376  * Original split interval still stands.
377  */
378  }
379 
380  /*
381  * Many duplicates strategy is used when a heap TID would otherwise be
382  * appended, but the page isn't completely full of logical duplicates.
383  *
384  * The split interval is widened to include all legal candidate split
385  * points. There might be a few as two distinct values in the whole-page
386  * split interval, though it's also possible that most of the values on
387  * the page are unique. The final split point will either be to the
388  * immediate left or to the immediate right of the group of duplicate
389  * tuples that enclose the first/delta-optimal split point (perfect
390  * penalty was set so that the lowest delta split point that avoids
391  * appending a heap TID will be chosen). Maximizing the number of
392  * attributes that can be truncated away is not a goal of the many
393  * duplicates strategy.
394  *
395  * Single value strategy is used when it is impossible to avoid appending
396  * a heap TID. It arranges to leave the left page very full. This
397  * maximizes space utilization in cases where tuples with the same
398  * attribute values span many pages. Newly inserted duplicates will tend
399  * to have higher heap TID values, so we'll end up splitting to the right
400  * consistently. (Single value strategy is harmless though not
401  * particularly useful with !heapkeyspace indexes.)
402  */
403  else if (strategy == SPLIT_MANY_DUPLICATES)
404  {
405  Assert(state.is_leaf);
406  /* Shouldn't try to truncate away extra user attributes */
407  Assert(perfectpenalty ==
 408  IndexRelationGetNumberOfKeyAttributes(rel));
 409  /* No need to resort splits -- no change in fillfactormult/deltas */
410  state.interval = state.nsplits;
411  }
412  else if (strategy == SPLIT_SINGLE_VALUE)
413  {
414  Assert(state.is_leaf);
415  /* Split near the end of the page */
416  usemult = true;
417  fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
418  /* Resort split points with new delta */
419  _bt_deltasortsplits(&state, fillfactormult, usemult);
420  /* Appending a heap TID is unavoidable, so interval of 1 is fine */
421  state.interval = 1;
422  }
423 
424  /*
425  * Search among acceptable split points (using final split interval) for
426  * the entry that has the lowest penalty, and is therefore expected to
427  * maximize fan-out. Sets *newitemonleft for us.
428  */
429  foundfirstright = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
430  strategy);
431  pfree(state.splits);
432 
433  return foundfirstright;
434 }
OffsetNumber firstoldonright
Definition: nbtsplitloc.c:40
bool is_rightmost
Definition: nbtsplitloc.c:53
Size minfirstrightsz
Definition: nbtsplitloc.c:58
#define MAX_LEAF_INTERVAL
Definition: nbtsplitloc.c:21
IndexTuple newitem
Definition: nbtsplitloc.c:50
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
#define Min(x, y)
Definition: c.h:890
bool newitemonleft
Definition: nbtsplitloc.c:41
#define SizeOfPageHeaderData
Definition: bufpage.h:216
static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage, SplitPoint *rightpage, FindSplitStrat *strategy)
Definition: nbtsplitloc.c:839
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
SplitPoint * splits
Definition: nbtsplitloc.c:63
uint16 OffsetNumber
Definition: off.h:24
void pfree(void *pointer)
Definition: mcxt.c:1031
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
#define BTREE_NONLEAF_FILLFACTOR
Definition: nbtree.h:170
#define ERROR
Definition: elog.h:43
static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft, FindSplitStrat strategy)
Definition: nbtsplitloc.c:764
#define PageGetPageSize(page)
Definition: bufpage.h:268
static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, int leaffillfactor, bool *usemult)
Definition: nbtsplitloc.c:607
#define RelationGetRelationName(relation)
Definition: rel.h:448
struct ItemIdData ItemIdData
Relation rel
Definition: nbtsplitloc.c:48
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:433
#define BTREE_DEFAULT_FILLFACTOR
Definition: nbtree.h:169
#define BTREE_SINGLEVAL_FILLFACTOR
Definition: nbtree.h:171
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
#define RelationGetFillFactor(relation, defaultff)
Definition: rel.h:290
FindSplitStrat
Definition: nbtsplitloc.c:24
#define SIZE_MAX
Definition: c.h:452
#define Max(x, y)
Definition: c.h:884
#define MAX_INTERNAL_INTERVAL
Definition: nbtsplitloc.c:22
int olddataitemstotal
Definition: nbtsplitloc.c:57
#define Assert(condition)
Definition: c.h:732
Definition: regguts.h:298
OffsetNumber newitemoff
Definition: nbtsplitloc.c:54
#define OffsetNumberNext(offsetNumber)
Definition: off.h:53
size_t Size
Definition: c.h:466
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define MAXALIGN(LEN)
Definition: c.h:685
static void _bt_recsplitloc(FindSplitData *state, OffsetNumber firstoldonright, bool newitemonleft, int olddataitemstoleft, Size firstoldonrightsz)
Definition: nbtsplitloc.c:453
Size PageGetExactFreeSpace(Page page)
Definition: bufpage.c:632
#define P_HIKEY
Definition: nbtree.h:217
void * palloc(Size size)
Definition: mcxt.c:924
#define elog(elevel,...)
Definition: elog.h:226
int i
static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult, bool usemult)
Definition: nbtsplitloc.c:538
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_finish_split()

void _bt_finish_split ( Relation  rel,
Buffer  bbuf,
BTStack  stack 
)

Definition at line 1854 of file nbtinsert.c.

References _bt_getbuf(), _bt_insert_parent(), _bt_relbuf(), Assert, BT_WRITE, BTMetaPageData::btm_root, BTPageGetMeta, BTPageOpaqueData::btpo_next, BTREE_METAPAGE, BufferGetBlockNumber(), BufferGetPage, DEBUG1, elog, P_INCOMPLETE_SPLIT, P_LEFTMOST, P_RIGHTMOST, and PageGetSpecialPointer.

Referenced by _bt_getstackbuf(), _bt_moveright(), and _bt_stepright().

1855 {
1856  Page lpage = BufferGetPage(lbuf);
1857  BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
1858  Buffer rbuf;
1859  Page rpage;
1860  BTPageOpaque rpageop;
1861  bool was_root;
1862  bool was_only;
1863 
1864  Assert(P_INCOMPLETE_SPLIT(lpageop));
1865 
1866  /* Lock right sibling, the one missing the downlink */
1867  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
1868  rpage = BufferGetPage(rbuf);
1869  rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
1870 
1871  /* Could this be a root split? */
1872  if (!stack)
1873  {
1874  Buffer metabuf;
1875  Page metapg;
1876  BTMetaPageData *metad;
1877 
1878  /* acquire lock on the metapage */
1879  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1880  metapg = BufferGetPage(metabuf);
1881  metad = BTPageGetMeta(metapg);
1882 
1883  was_root = (metad->btm_root == BufferGetBlockNumber(lbuf));
1884 
1885  _bt_relbuf(rel, metabuf);
1886  }
1887  else
1888  was_root = false;
1889 
1890  /* Was this the only page on the level before split? */
1891  was_only = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
1892 
1893  elog(DEBUG1, "finishing incomplete split of %u/%u",
 1894  BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
 1895 
1896  _bt_insert_parent(rel, lbuf, rbuf, stack, was_root, was_only);
1897 }
BlockNumber btpo_next
Definition: nbtree.h:58
#define DEBUG1
Definition: elog.h:25
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:796
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only)
Definition: nbtinsert.c:1742
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:196
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
#define BTPageGetMeta(p)
Definition: nbtree.h:112
#define P_LEFTMOST(opaque)
Definition: nbtree.h:187
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define BTREE_METAPAGE
Definition: nbtree.h:131
BlockNumber btm_root
Definition: nbtree.h:101
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:951
#define Assert(condition)
Definition: c.h:732
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2613
#define elog(elevel,...)
Definition: elog.h:226
#define BT_WRITE
Definition: nbtree.h:403
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
Pointer Page
Definition: bufpage.h:78

◆ _bt_first()

bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 746 of file nbtsearch.c.

References _bt_binsrch(), _bt_drop_lock_and_maybe_pin(), _bt_endpoint(), _bt_freestack(), _bt_heapkeyspace(), _bt_initialize_more_data(), _bt_parallel_done(), _bt_parallel_readpage(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readpage(), _bt_search(), _bt_steppage(), Assert, BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosInvalidate, BTScanPosIsValid, buf, BTScanPosData::buf, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferIsValid, cur, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, DatumGetPointer, elog, ERROR, get_opfamily_proc(), i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, InvalidBlockNumber, InvalidOid, InvalidStrategy, BTScanPosData::itemIndex, BTScanPosData::items, BTScanOpaqueData::keyData, LockBuffer(), BTScanOpaqueData::numberOfKeys, OffsetNumberPrev, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockPage(), PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, ScanKeyData::sk_subtype, status(), IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

747 {
748  Relation rel = scan->indexRelation;
749  BTScanOpaque so = (BTScanOpaque) scan->opaque;
750  Buffer buf;
751  BTStack stack;
752  OffsetNumber offnum;
753  StrategyNumber strat;
754  bool nextkey;
755  bool goback;
756  BTScanInsertData inskey;
757  ScanKey startKeys[INDEX_MAX_KEYS];
758  ScanKeyData notnullkeys[INDEX_MAX_KEYS];
759  int keysCount = 0;
760  int i;
761  bool status = true;
762  StrategyNumber strat_total;
763  BTScanPosItem *currItem;
764  BlockNumber blkno;
765 
766  Assert(!BTScanPosIsValid(so->currPos));
767 
768  pgstat_count_index_scan(rel);
769 
770  /*
771  * Examine the scan keys and eliminate any redundant keys; also mark the
772  * keys that must be matched to continue the scan.
773  */
774  _bt_preprocess_keys(scan);
775 
776  /*
777  * Quit now if _bt_preprocess_keys() discovered that the scan keys can
778  * never be satisfied (eg, x == 1 AND x > 2).
779  */
780  if (!so->qual_ok)
781  return false;
782 
783  /*
784  * For parallel scans, get the starting page from shared state. If the
785  * scan has not started, proceed to find out first leaf page in the usual
786  * way while keeping other participating processes waiting. If the scan
787  * has already begun, use the page number from the shared structure.
788  */
789  if (scan->parallel_scan != NULL)
790  {
791  status = _bt_parallel_seize(scan, &blkno);
792  if (!status)
793  return false;
794  else if (blkno == P_NONE)
795  {
796  _bt_parallel_done(scan);
797  return false;
798  }
799  else if (blkno != InvalidBlockNumber)
800  {
801  if (!_bt_parallel_readpage(scan, blkno, dir))
802  return false;
803  goto readcomplete;
804  }
805  }
806 
807  /*----------
808  * Examine the scan keys to discover where we need to start the scan.
809  *
810  * We want to identify the keys that can be used as starting boundaries;
811  * these are =, >, or >= keys for a forward scan or =, <, <= keys for
812  * a backwards scan. We can use keys for multiple attributes so long as
813  * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
814  * a > or < boundary or find an attribute with no boundary (which can be
815  * thought of as the same as "> -infinity"), we can't use keys for any
816  * attributes to its right, because it would break our simplistic notion
817  * of what initial positioning strategy to use.
818  *
819  * When the scan keys include cross-type operators, _bt_preprocess_keys
820  * may not be able to eliminate redundant keys; in such cases we will
821  * arbitrarily pick a usable one for each attribute. This is correct
822  * but possibly not optimal behavior. (For example, with keys like
823  * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
824  * x=5 would be more efficient.) Since the situation only arises given
825  * a poorly-worded query plus an incomplete opfamily, live with it.
826  *
827  * When both equality and inequality keys appear for a single attribute
828  * (again, only possible when cross-type operators appear), we *must*
829  * select one of the equality keys for the starting point, because
830  * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
831  * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
832  * start at x=4, we will fail and stop before reaching x=10. If multiple
833  * equality quals survive preprocessing, however, it doesn't matter which
834  * one we use --- by definition, they are either redundant or
835  * contradictory.
836  *
837  * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
838  * If the index stores nulls at the end of the index we'll be starting
839  * from, and we have no boundary key for the column (which means the key
840  * we deduced NOT NULL from is an inequality key that constrains the other
841  * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
842  * use as a boundary key. If we didn't do this, we might find ourselves
843  * traversing a lot of null entries at the start of the scan.
844  *
845  * In this loop, row-comparison keys are treated the same as keys on their
846  * first (leftmost) columns. We'll add on lower-order columns of the row
847  * comparison below, if possible.
848  *
849  * The selected scan keys (at most one per index column) are remembered by
850  * storing their addresses into the local startKeys[] array.
851  *----------
852  */
853  strat_total = BTEqualStrategyNumber;
854  if (so->numberOfKeys > 0)
855  {
856  AttrNumber curattr;
857  ScanKey chosen;
858  ScanKey impliesNN;
859  ScanKey cur;
860 
861  /*
862  * chosen is the so-far-chosen key for the current attribute, if any.
863  * We don't cast the decision in stone until we reach keys for the
864  * next attribute.
865  */
866  curattr = 1;
867  chosen = NULL;
868  /* Also remember any scankey that implies a NOT NULL constraint */
869  impliesNN = NULL;
870 
871  /*
872  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
873  * pass to handle after-last-key processing. Actual exit from the
874  * loop is at one of the "break" statements below.
875  */
876  for (cur = so->keyData, i = 0;; cur++, i++)
877  {
878  if (i >= so->numberOfKeys || cur->sk_attno != curattr)
879  {
880  /*
881  * Done looking at keys for curattr. If we didn't find a
882  * usable boundary key, see if we can deduce a NOT NULL key.
883  */
884  if (chosen == NULL && impliesNN != NULL &&
885  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
886  ScanDirectionIsForward(dir) :
887  ScanDirectionIsBackward(dir)))
888  {
889  /* Yes, so build the key in notnullkeys[keysCount] */
890  chosen = &notnullkeys[keysCount];
891  ScanKeyEntryInitialize(chosen,
892  (SK_SEARCHNOTNULL | SK_ISNULL |
893  (impliesNN->sk_flags &
894  (SK_BT_DESC | SK_BT_NULLS_FIRST))),
895  curattr,
896  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
897  BTGreaterStrategyNumber :
898  BTLessStrategyNumber),
899  InvalidOid,
900  InvalidOid,
901  InvalidOid,
902  (Datum) 0);
903  }
904 
905  /*
906  * If we still didn't find a usable boundary key, quit; else
907  * save the boundary key pointer in startKeys.
908  */
909  if (chosen == NULL)
910  break;
911  startKeys[keysCount++] = chosen;
912 
913  /*
914  * Adjust strat_total, and quit if we have stored a > or <
915  * key.
916  */
917  strat = chosen->sk_strategy;
918  if (strat != BTEqualStrategyNumber)
919  {
920  strat_total = strat;
921  if (strat == BTGreaterStrategyNumber ||
922  strat == BTLessStrategyNumber)
923  break;
924  }
925 
926  /*
927  * Done if that was the last attribute, or if next key is not
928  * in sequence (implying no boundary key is available for the
929  * next attribute).
930  */
931  if (i >= so->numberOfKeys ||
932  cur->sk_attno != curattr + 1)
933  break;
934 
935  /*
936  * Reset for next attr.
937  */
938  curattr = cur->sk_attno;
939  chosen = NULL;
940  impliesNN = NULL;
941  }
942 
943  /*
944  * Can we use this key as a starting boundary for this attr?
945  *
946  * If not, does it imply a NOT NULL constraint? (Because
947  * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
948  * *any* inequality key works for that; we need not test.)
949  */
950  switch (cur->sk_strategy)
951  {
952  case BTLessStrategyNumber:
953  case BTLessEqualStrategyNumber:
954  if (chosen == NULL)
955  {
956  if (ScanDirectionIsBackward(dir))
957  chosen = cur;
958  else
959  impliesNN = cur;
960  }
961  break;
962  case BTEqualStrategyNumber:
963  /* override any non-equality choice */
964  chosen = cur;
965  break;
966  case BTGreaterEqualStrategyNumber:
967  case BTGreaterStrategyNumber:
968  if (chosen == NULL)
969  {
970  if (ScanDirectionIsForward(dir))
971  chosen = cur;
972  else
973  impliesNN = cur;
974  }
975  break;
976  }
977  }
978  }
979 
980  /*
981  * If we found no usable boundary keys, we have to start from one end of
982  * the tree. Walk down that edge to the first or last key, and scan from
983  * there.
984  */
985  if (keysCount == 0)
986  {
987  bool match;
988 
989  match = _bt_endpoint(scan, dir);
990 
991  if (!match)
992  {
993  /* No match, so mark (parallel) scan finished */
994  _bt_parallel_done(scan);
995  }
996 
997  return match;
998  }
999 
1000  /*
1001  * We want to start the scan somewhere within the index. Set up an
1002  * insertion scankey we can use to search for the boundary point we
1003  * identified above. The insertion scankey is built using the keys
1004  * identified by startKeys[]. (Remaining insertion scankey fields are
1005  * initialized after initial-positioning strategy is finalized.)
1006  */
1007  Assert(keysCount <= INDEX_MAX_KEYS);
1008  for (i = 0; i < keysCount; i++)
1009  {
1010  ScanKey cur = startKeys[i];
1011 
1012  Assert(cur->sk_attno == i + 1);
1013 
1014  if (cur->sk_flags & SK_ROW_HEADER)
1015  {
1016  /*
1017  * Row comparison header: look to the first row member instead.
1018  *
1019  * The member scankeys are already in insertion format (ie, they
1020  * have sk_func = 3-way-comparison function), but we have to watch
1021  * out for nulls, which _bt_preprocess_keys didn't check. A null
1022  * in the first row member makes the condition unmatchable, just
1023  * like qual_ok = false.
1024  */
1025  ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1026 
1027  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1028  if (subkey->sk_flags & SK_ISNULL)
1029  {
1030  _bt_parallel_done(scan);
1031  return false;
1032  }
1033  memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1034 
1035  /*
1036  * If the row comparison is the last positioning key we accepted,
1037  * try to add additional keys from the lower-order row members.
1038  * (If we accepted independent conditions on additional index
1039  * columns, we use those instead --- doesn't seem worth trying to
1040  * determine which is more restrictive.) Note that this is OK
1041  * even if the row comparison is of ">" or "<" type, because the
1042  * condition applied to all but the last row member is effectively
1043  * ">=" or "<=", and so the extra keys don't break the positioning
1044  * scheme. But, by the same token, if we aren't able to use all
1045  * the row members, then the part of the row comparison that we
1046  * did use has to be treated as just a ">=" or "<=" condition, and
1047  * so we'd better adjust strat_total accordingly.
1048  */
1049  if (i == keysCount - 1)
1050  {
1051  bool used_all_subkeys = false;
1052 
1053  Assert(!(subkey->sk_flags & SK_ROW_END));
1054  for (;;)
1055  {
1056  subkey++;
1057  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1058  if (subkey->sk_attno != keysCount + 1)
1059  break; /* out-of-sequence, can't use it */
1060  if (subkey->sk_strategy != cur->sk_strategy)
1061  break; /* wrong direction, can't use it */
1062  if (subkey->sk_flags & SK_ISNULL)
1063  break; /* can't use null keys */
1064  Assert(keysCount < INDEX_MAX_KEYS);
1065  memcpy(inskey.scankeys + keysCount, subkey,
1066  sizeof(ScanKeyData));
1067  keysCount++;
1068  if (subkey->sk_flags & SK_ROW_END)
1069  {
1070  used_all_subkeys = true;
1071  break;
1072  }
1073  }
1074  if (!used_all_subkeys)
1075  {
1076  switch (strat_total)
1077  {
1078  case BTLessStrategyNumber:
1079  strat_total = BTLessEqualStrategyNumber;
1080  break;
1081  case BTGreaterStrategyNumber:
1082  strat_total = BTGreaterEqualStrategyNumber;
1083  break;
1084  }
1085  }
1086  break; /* done with outer loop */
1087  }
1088  }
1089  else
1090  {
1091  /*
1092  * Ordinary comparison key. Transform the search-style scan key
1093  * to an insertion scan key by replacing the sk_func with the
1094  * appropriate btree comparison function.
1095  *
1096  * If scankey operator is not a cross-type comparison, we can use
1097  * the cached comparison function; otherwise gotta look it up in
1098  * the catalogs. (That can't lead to infinite recursion, since no
1099  * indexscan initiated by syscache lookup will use cross-data-type
1100  * operators.)
1101  *
1102  * We support the convention that sk_subtype == InvalidOid means
1103  * the opclass input type; this is a hack to simplify life for
1104  * ScanKeyInit().
1105  */
1106  if (cur->sk_subtype == rel->rd_opcintype[i] ||
1107  cur->sk_subtype == InvalidOid)
1108  {
1109  FmgrInfo *procinfo;
1110 
1111  procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1112  ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1113  cur->sk_flags,
1114  cur->sk_attno,
1115  InvalidStrategy,
1116  cur->sk_subtype,
1117  cur->sk_collation,
1118  procinfo,
1119  cur->sk_argument);
1120  }
1121  else
1122  {
1123  RegProcedure cmp_proc;
1124 
1125  cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1126  rel->rd_opcintype[i],
1127  cur->sk_subtype,
1128  BTORDER_PROC);
1129  if (!RegProcedureIsValid(cmp_proc))
1130  elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1131  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1132  cur->sk_attno, RelationGetRelationName(rel));
1133  ScanKeyEntryInitialize(inskey.scankeys + i,
1134  cur->sk_flags,
1135  cur->sk_attno,
1136  InvalidStrategy,
1137  cur->sk_subtype,
1138  cur->sk_collation,
1139  cmp_proc,
1140  cur->sk_argument);
1141  }
1142  }
1143  }
1144 
1145  /*----------
1146  * Examine the selected initial-positioning strategy to determine exactly
1147  * where we need to start the scan, and set flag variables to control the
1148  * code below.
1149  *
1150  * If nextkey = false, _bt_search and _bt_binsrch will locate the first
1151  * item >= scan key. If nextkey = true, they will locate the first
1152  * item > scan key.
1153  *
1154  * If goback = true, we will then step back one item, while if
1155  * goback = false, we will start the scan on the located item.
1156  *----------
1157  */
1158  switch (strat_total)
1159  {
1160  case BTLessStrategyNumber:
1161 
1162  /*
1163  * Find first item >= scankey, then back up one to arrive at last
1164  * item < scankey. (Note: this positioning strategy is only used
1165  * for a backward scan, so that is always the correct starting
1166  * position.)
1167  */
1168  nextkey = false;
1169  goback = true;
1170  break;
1171 
1172  case BTLessEqualStrategyNumber:
1173 
1174  /*
1175  * Find first item > scankey, then back up one to arrive at last
1176  * item <= scankey. (Note: this positioning strategy is only used
1177  * for a backward scan, so that is always the correct starting
1178  * position.)
1179  */
1180  nextkey = true;
1181  goback = true;
1182  break;
1183 
1184  case BTEqualStrategyNumber:
1185 
1186  /*
1187  * If a backward scan was specified, need to start with last equal
1188  * item not first one.
1189  */
1190  if (ScanDirectionIsBackward(dir))
1191  {
1192  /*
1193  * This is the same as the <= strategy. We will check at the
1194  * end whether the found item is actually =.
1195  */
1196  nextkey = true;
1197  goback = true;
1198  }
1199  else
1200  {
1201  /*
1202  * This is the same as the >= strategy. We will check at the
1203  * end whether the found item is actually =.
1204  */
1205  nextkey = false;
1206  goback = false;
1207  }
1208  break;
1209 
1210  case BTGreaterEqualStrategyNumber:
1211 
1212  /*
1213  * Find first item >= scankey. (This is only used for forward
1214  * scans.)
1215  */
1216  nextkey = false;
1217  goback = false;
1218  break;
1219 
1220  case BTGreaterStrategyNumber:
1221 
1222  /*
1223  * Find first item > scankey. (This is only used for forward
1224  * scans.)
1225  */
1226  nextkey = true;
1227  goback = false;
1228  break;
1229 
1230  default:
1231  /* can't get here, but keep compiler quiet */
1232  elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1233  return false;
1234  }
1235 
1236  /* Initialize remaining insertion scan key fields */
1237  inskey.heapkeyspace = _bt_heapkeyspace(rel);
1238  inskey.anynullkeys = false; /* unused */
1239  inskey.nextkey = nextkey;
1240  inskey.pivotsearch = false;
1241  inskey.scantid = NULL;
1242  inskey.keysz = keysCount;
1243 
1244  /*
1245  * Use the manufactured insertion scan key to descend the tree and
1246  * position ourselves on the target leaf page.
1247  */
1248  stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);
1249 
1250  /* don't need to keep the stack around... */
1251  _bt_freestack(stack);
1252 
1253  if (!BufferIsValid(buf))
1254  {
1255  /*
1256  * We only get here if the index is completely empty. Lock relation
1257  * because nothing finer to lock exists.
1258  */
1259  PredicateLockRelation(rel, scan->xs_snapshot);
1260 
1261  /*
1262  * mark parallel scan as done, so that all the workers can finish
1263  * their scan
1264  */
1265  _bt_parallel_done(scan);
1266  BTScanPosInvalidate(so->currPos);
1267 
1268  return false;
1269  }
1270  else
1271  PredicateLockPage(rel, BufferGetBlockNumber(buf),
1272  scan->xs_snapshot);
1273 
1274  _bt_initialize_more_data(so, dir);
1275 
1276  /* position to the precise item on the page */
1277  offnum = _bt_binsrch(rel, &inskey, buf);
1278 
1279  /*
1280  * If nextkey = false, we are positioned at the first item >= scan key, or
1281  * possibly at the end of a page on which all the existing items are less
1282  * than the scan key and we know that everything on later pages is greater
1283  * than or equal to scan key.
1284  *
1285  * If nextkey = true, we are positioned at the first item > scan key, or
1286  * possibly at the end of a page on which all the existing items are less
1287  * than or equal to the scan key and we know that everything on later
1288  * pages is greater than scan key.
1289  *
1290  * The actually desired starting point is either this item or the prior
1291  * one, or in the end-of-page case it's the first item on the next page or
1292  * the last item on this page. Adjust the starting offset if needed. (If
1293  * this results in an offset before the first item or after the last one,
1294  * _bt_readpage will report no items found, and then we'll step to the
1295  * next page as needed.)
1296  */
1297  if (goback)
1298  offnum = OffsetNumberPrev(offnum);
1299 
1300  /* remember which buffer we have pinned, if any */
1301  Assert(!BTScanPosIsValid(so->currPos));
1302  so->currPos.buf = buf;
1303 
1304  /*
1305  * Now load data from the first page of the scan.
1306  */
1307  if (!_bt_readpage(scan, dir, offnum))
1308  {
1309  /*
1310  * There's no actually-matching data on this page. Try to advance to
1311  * the next page. Return false if there's no matching data at all.
1312  */
1313  LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1314  if (!_bt_steppage(scan, dir))
1315  return false;
1316  }
1317  else
1318  {
1319  /* Drop the lock, and maybe the pin, on the current page */
1320  _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1321  }
1322 
1323 readcomplete:
1324  /* OK, itemIndex says what to return */
1325  currItem = &so->currPos.items[so->currPos.itemIndex];
1326  scan->xs_heaptid = currItem->heapTid;
1327  if (scan->xs_want_itup)
1328  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1329 
1330  return true;
1331 }
#define InvalidStrategy
Definition: stratnum.h:24
Oid sk_subtype
Definition: skey.h:69
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:86
Definition: fmgr.h:56
#define SK_ROW_MEMBER
Definition: skey.h:118
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:163
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2526
#define BTORDER_PROC
Definition: nbtree.h:392
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:794
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
IndexTuple xs_itup
Definition: relscan.h:127
BTScanPosItem items[MaxIndexTuplesPerPage]
Definition: nbtree.h:582
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:722
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:1915
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:57
int itemIndex
Definition: nbtree.h:580
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2503
char * currTuples
Definition: nbtree.h:654
regproc RegProcedure
Definition: c.h:505
#define P_NONE
Definition: nbtree.h:181
struct cursor * cur
Definition: ecpg.c:28
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:604
struct SnapshotData * xs_snapshot
Definition: relscan.h:104
uint32 BlockNumber
Definition: block.h:31
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:346
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
Relation indexRelation
Definition: relscan.h:103
uint16 OffsetNumber
Definition: off.h:24
#define SK_ROW_END
Definition: skey.h:119
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
#define BT_READ
Definition: nbtree.h:402
#define ERROR
Definition: elog.h:43
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:671
StrategyNumber sk_strategy
Definition: skey.h:68
ItemPointerData xs_heaptid
Definition: relscan.h:132
static char * buf
Definition: pg_test_fsync.c:68
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
ScanKeyData * ScanKey
Definition: skey.h:75
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2146
#define RegProcedureIsValid(p)
Definition: c.h:640
IndexTupleData * IndexTuple
Definition: itup.h:53
#define RelationGetRelationName(relation)
Definition: rel.h:448
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:1374
#define SK_SEARCHNOTNULL
Definition: skey.h:122
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
Definition: nbtree.c:641
Oid * rd_opfamily
Definition: rel.h:166
#define SK_ISNULL
Definition: skey.h:115
#define SK_ROW_HEADER
Definition: skey.h:117
void _bt_preprocess_keys(IndexScanDesc scan)
Definition: nbtutils.c:744
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:682
static void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
Definition: nbtsearch.c:2236
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:151
uintptr_t Datum
Definition: postgres.h:367
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3590
bool _bt_heapkeyspace(Relation rel)
Definition: nbtpage.c:684
#define InvalidOid
Definition: postgres_ext.h:36
int sk_flags
Definition: skey.h:66
#define Assert(condition)
Definition: c.h:732
#define SK_BT_DESC
Definition: nbtree.h:681
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:744
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:610
#define INDEX_MAX_KEYS
#define InvalidBlockNumber
Definition: block.h:33
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:55
int numberOfKeys
Definition: nbtree.h:633
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
#define DatumGetPointer(X)
Definition: postgres.h:549
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2613
BTScanPosData currPos
Definition: nbtree.h:667
ScanKey keyData
Definition: nbtree.h:634
Oid sk_collation
Definition: skey.h:70
#define elog(elevel,...)
Definition: elog.h:226
int i
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1630
Buffer buf
Definition: nbtree.h:550
Oid * rd_opcintype
Definition: rel.h:167
static void static void status(const char *fmt,...) pg_attribute_printf(1
Definition: pg_regress.c:227
#define BTLessStrategyNumber
Definition: stratnum.h:29
int Buffer
Definition: buf.h:23
Datum sk_argument
Definition: skey.h:72
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Definition: nbtsearch.c:1403
int16 AttrNumber
Definition: attnum.h:21
BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:92
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
AttrNumber sk_attno
Definition: skey.h:67

◆ _bt_freestack()

void _bt_freestack ( BTStack  stack)

Definition at line 163 of file nbtutils.c.

References BTStackData::bts_parent, and pfree().

Referenced by _bt_doinsert(), _bt_first(), and bt_rootdescend().

164 {
165  BTStack ostack;
166 
167  while (stack != NULL)
168  {
169  ostack = stack;
170  stack = stack->bts_parent;
171  pfree(ostack);
172  }
173 }
void pfree(void *pointer)
Definition: mcxt.c:1031
struct BTStackData * bts_parent
Definition: nbtree.h:420

◆ _bt_get_endpoint()

Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost,
Snapshot  snapshot 
)

Definition at line 2064 of file nbtsearch.c.

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_next, BTreeInnerTupleGetDownLink, buf, BufferGetPage, BufferIsValid, elog, ERROR, InvalidBuffer, BTPageOpaqueData::level, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, RelationGetRelationName, and TestForOldSnapshot().

Referenced by _bt_endpoint(), and _bt_insert_parent().

2066 {
2067  Buffer buf;
2068  Page page;
2069  BTPageOpaque opaque;
2070  OffsetNumber offnum;
2071  BlockNumber blkno;
2072  IndexTuple itup;
2073 
2074  /*
2075  * If we are looking for a leaf page, okay to descend from fast root;
2076  * otherwise better descend from true root. (There is no point in being
2077  * smarter about intermediate levels.)
2078  */
2079  if (level == 0)
2080  buf = _bt_getroot(rel, BT_READ);
2081  else
2082  buf = _bt_gettrueroot(rel);
2083 
2084  if (!BufferIsValid(buf))
2085  return InvalidBuffer;
2086 
2087  page = BufferGetPage(buf);
2088  TestForOldSnapshot(snapshot, rel, page);
2089  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2090 
2091  for (;;)
2092  {
2093  /*
2094  * If we landed on a deleted page, step right to find a live page
2095  * (there must be one). Also, if we want the rightmost page, step
2096  * right if needed to get to it (this could happen if the page split
2097  * since we obtained a pointer to it).
2098  */
2099  while (P_IGNORE(opaque) ||
2100  (rightmost && !P_RIGHTMOST(opaque)))
2101  {
2102  blkno = opaque->btpo_next;
2103  if (blkno == P_NONE)
2104  elog(ERROR, "fell off the end of index \"%s\"",
2105  RelationGetRelationName(rel));
2106  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2107  page = BufferGetPage(buf);
2108  TestForOldSnapshot(snapshot, rel, page);
2109  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2110  }
2111 
2112  /* Done? */
2113  if (opaque->btpo.level == level)
2114  break;
2115  if (opaque->btpo.level < level)
2116  elog(ERROR, "btree level %u not found in index \"%s\"",
2117  level, RelationGetRelationName(rel));
2118 
2119  /* Descend to leftmost or rightmost child page */
2120  if (rightmost)
2121  offnum = PageGetMaxOffsetNumber(page);
2122  else
2123  offnum = P_FIRSTDATAKEY(opaque);
2124 
2125  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2126  blkno = BTreeInnerTupleGetDownLink(itup);
2127 
2128  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2129  page = BufferGetPage(buf);
2130  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2131  }
2132 
2133  return buf;
2134 }
BlockNumber btpo_next
Definition: nbtree.h:58
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:264
#define P_IGNORE(opaque)
Definition: nbtree.h:194
#define BTreeInnerTupleGetDownLink(itup)
Definition: nbtree.h:301
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:932
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
union BTPageOpaqueData::@46 btpo
#define P_NONE
Definition: nbtree.h:181
#define InvalidBuffer
Definition: buf.h:25
uint32 BlockNumber
Definition: block.h:31
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
uint16 OffsetNumber
Definition: off.h:24
#define BT_READ
Definition: nbtree.h:402
#define ERROR
Definition: elog.h:43
static char * buf
Definition: pg_test_fsync.c:68
IndexTupleData * IndexTuple
Definition: itup.h:53
#define RelationGetRelationName(relation)
Definition: rel.h:448
Buffer _bt_getroot(Relation rel, int access)
Definition: nbtpage.c:291
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
uint32 level
Definition: nbtree.h:61
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:539
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
#define elog(elevel,...)
Definition: elog.h:226
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78

◆ _bt_getbuf()

Buffer _bt_getbuf ( Relation  rel,
BlockNumber  blkno,
int  access 
)

Definition at line 796 of file nbtpage.c.

References _bt_checkpage(), _bt_log_reuse_page(), _bt_page_recyclable(), _bt_pageinit(), _bt_relbuf(), Assert, BT_WRITE, BTPageOpaqueData::btpo, buf, BufferGetPage, BufferGetPageSize, ConditionalLockBuffer(), DEBUG2, elog, ExclusiveLock, GetFreeIndexPage(), InvalidBlockNumber, LockBuffer(), LockRelationForExtension(), P_NEW, PageGetSpecialPointer, PageIsNew, ReadBuffer(), RELATION_IS_LOCAL, RelationNeedsWAL, ReleaseBuffer(), UnlockRelationForExtension(), BTPageOpaqueData::xact, and XLogStandbyInfoActive.

Referenced by _bt_finish_split(), _bt_getroot(), _bt_getrootheight(), _bt_getstackbuf(), _bt_gettrueroot(), _bt_heapkeyspace(), _bt_insertonpg(), _bt_is_page_halfdead(), _bt_killitems(), _bt_lock_branch_parent(), _bt_moveright(), _bt_newroot(), _bt_pagedel(), _bt_readnextpage(), _bt_split(), _bt_unlink_halfdead_page(), _bt_update_meta_cleanup_info(), _bt_vacuum_needs_cleanup(), and _bt_walk_left().

/*
 * _bt_getbuf() -- Get an index page, locked per 'access' (BT_READ/BT_WRITE).
 * blkno == P_NEW allocates a page: first from the FSM, else by extending the
 * relation.  Listing repaired: src lines 861, 892 and 906 were hyperlink text
 * dropped during extraction and are restored below.
 */
797 {
798  Buffer buf;
799 
800  if (blkno != P_NEW)
801  {
802  /* Read an existing block of the relation */
803  buf = ReadBuffer(rel, blkno);
804  LockBuffer(buf, access);
805  _bt_checkpage(rel, buf);
806  }
807  else
808  {
809  bool needLock;
810  Page page;
811 
812  Assert(access == BT_WRITE);
813 
814  /*
815  * First see if the FSM knows of any free pages.
816  *
817  * We can't trust the FSM's report unreservedly; we have to check that
818  * the page is still free. (For example, an already-free page could
819  * have been re-used between the time the last VACUUM scanned it and
820  * the time the VACUUM made its FSM updates.)
821  *
822  * In fact, it's worse than that: we can't even assume that it's safe
823  * to take a lock on the reported page. If somebody else has a lock
824  * on it, or even worse our own caller does, we could deadlock. (The
825  * own-caller scenario is actually not improbable. Consider an index
826  * on a serial or timestamp column. Nearly all splits will be at the
827  * rightmost page, so it's entirely likely that _bt_split will call us
828  * while holding a lock on the page most recently acquired from FSM. A
829  * VACUUM running concurrently with the previous split could well have
830  * placed that page back in FSM.)
831  *
832  * To get around that, we ask for only a conditional lock on the
833  * reported page. If we fail, then someone else is using the page,
834  * and we may reasonably assume it's not free. (If we happen to be
835  * wrong, the worst consequence is the page will be lost to use till
836  * the next VACUUM, which is no big problem.)
837  */
838  for (;;)
839  {
840  blkno = GetFreeIndexPage(rel);
841  if (blkno == InvalidBlockNumber)
842  break;
843  buf = ReadBuffer(rel, blkno);
844  if (ConditionalLockBuffer(buf))
845  {
846  page = BufferGetPage(buf);
847  if (_bt_page_recyclable(page))
848  {
849  /*
850  * If we are generating WAL for Hot Standby then create a
851  * WAL record that will allow us to conflict with queries
852  * running on standby, in case they have snapshots older
853  * than btpo.xact. This can only apply if the page does
854  * have a valid btpo.xact value, ie not if it's new. (We
855  * must check that because an all-zero page has no special
856  * space.)
857  */
858  if (XLogStandbyInfoActive() && RelationNeedsWAL(rel) &&
859  !PageIsNew(page))
860  {
861  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
862 
863  _bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
864  }
865 
866  /* Okay to use page. Re-initialize and return it */
867  _bt_pageinit(page, BufferGetPageSize(buf));
868  return buf;
869  }
870  elog(DEBUG2, "FSM returned nonrecyclable page");
871  _bt_relbuf(rel, buf);
872  }
873  else
874  {
875  elog(DEBUG2, "FSM returned nonlockable page");
876  /* couldn't get lock, so just drop pin */
877  ReleaseBuffer(buf);
878  }
879  }
880 
881  /*
882  * Extend the relation by one page.
883  *
884  * We have to use a lock to ensure no one else is extending the rel at
885  * the same time, else we will both try to initialize the same new
886  * page. We can skip locking for new or temp relations, however,
887  * since no one else could be accessing them.
888  */
889  needLock = !RELATION_IS_LOCAL(rel);
890 
891  if (needLock)
892  LockRelationForExtension(rel, ExclusiveLock);
893 
894  buf = ReadBuffer(rel, P_NEW);
895 
896  /* Acquire buffer lock on new page */
897  LockBuffer(buf, BT_WRITE);
898 
899  /*
900  * Release the file-extension lock; it's now OK for someone else to
901  * extend the relation some more. Note that we cannot release this
902  * lock before we have buffer lock on the new page, or we risk a race
903  * condition against btvacuumscan --- see comments therein.
904  */
905  if (needLock)
906  UnlockRelationForExtension(rel, ExclusiveLock);
907 
908  /* Initialize the new page before returning it */
909  page = BufferGetPage(buf);
910  Assert(PageIsNew(page));
911  _bt_pageinit(page, BufferGetPageSize(buf));
912  }
913 
914  /* ref count and lock type are correct */
915  return buf;
916 }
#define ExclusiveLock
Definition: lockdefs.h:44
#define RELATION_IS_LOCAL(relation)
Definition: rel.h:534
union BTPageOpaqueData::@46 btpo
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3353
#define P_NEW
Definition: bufmgr.h:81
bool _bt_page_recyclable(Page page)
Definition: nbtpage.c:978
TransactionId xact
Definition: nbtree.h:62
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:729
#define DEBUG2
Definition: elog.h:24
static char * buf
Definition: pg_test_fsync.c:68
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:3616
static void _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
Definition: nbtpage.c:763
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:402
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:452
#define BufferGetPageSize(buffer)
Definition: bufmgr.h:146
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3590
#define XLogStandbyInfoActive()
Definition: xlog.h:195
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:951
BlockNumber GetFreeIndexPage(Relation rel)
Definition: indexfsm.c:38
#define Assert(condition)
Definition: c.h:732
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:596
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define InvalidBlockNumber
Definition: block.h:33
#define RelationNeedsWAL(relation)
Definition: rel.h:516
#define PageIsNew(page)
Definition: bufpage.h:229
#define elog(elevel,...)
Definition: elog.h:226
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:963
#define BT_WRITE
Definition: nbtree.h:403
int Buffer
Definition: buf.h:23
Pointer Page
Definition: bufpage.h:78

◆ _bt_getroot()

Buffer _bt_getroot ( Relation  rel,
int  access 
)

Definition at line 291 of file nbtpage.c.

References _bt_cachemetadata(), _bt_getbuf(), _bt_getroot(), _bt_relandgetbuf(), _bt_relbuf(), _bt_upgrademetapage(), Assert, BT_READ, BT_WRITE, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_oldest_btpo_xact, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_LEAF, BTP_ROOT, BTPageGetMeta, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage, elog, END_CRIT_SECTION, ereport, errcode(), errmsg(), ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, InvalidBuffer, InvalidTransactionId, xl_btree_metadata::last_cleanup_num_heap_tuples, xl_btree_metadata::level, BTPageOpaqueData::level, xl_btree_newroot::level, LockBuffer(), MarkBufferDirty(), xl_btree_metadata::oldest_btpo_xact, P_IGNORE, P_ISMETA, P_LEFTMOST, P_NEW, P_NONE, P_RIGHTMOST, PageGetSpecialPointer, PageSetLSN, pfree(), RelationData::rd_amcache, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, xl_btree_metadata::root, xl_btree_newroot::rootblk, SizeOfBtreeNewroot, START_CRIT_SECTION, xl_btree_metadata::version, XLOG_BTREE_NEWROOT, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_get_endpoint(), _bt_getroot(), and _bt_search().

/*
 * _bt_getroot() -- Get the root page of the btree, with lock per 'access';
 * may create the root (as a leaf) when access == BT_WRITE and none exists.
 * Listing repaired: src lines 313, 366-367, 415, 425, 436, 439-440, 442,
 * 448-449, 464 and 506 were hyperlink text dropped during extraction and
 * are restored below.
 */
292 {
293  Buffer metabuf;
294  Page metapg;
295  BTPageOpaque metaopaque;
296  Buffer rootbuf;
297  Page rootpage;
298  BTPageOpaque rootopaque;
299  BlockNumber rootblkno;
300  uint32 rootlevel;
301  BTMetaPageData *metad;
302 
303  /*
304  * Try to use previously-cached metapage data to find the root. This
305  * normally saves one buffer access per index search, which is a very
306  * helpful savings in bufmgr traffic and hence contention.
307  */
308  if (rel->rd_amcache != NULL)
309  {
310  metad = (BTMetaPageData *) rel->rd_amcache;
311  /* We shouldn't have cached it if any of these fail */
312  Assert(metad->btm_magic == BTREE_MAGIC);
313  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
314  Assert(metad->btm_version <= BTREE_VERSION);
315  Assert(metad->btm_root != P_NONE);
316 
317  rootblkno = metad->btm_fastroot;
318  Assert(rootblkno != P_NONE);
319  rootlevel = metad->btm_fastlevel;
320 
321  rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
322  rootpage = BufferGetPage(rootbuf);
323  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
324 
325  /*
326  * Since the cache might be stale, we check the page more carefully
327  * here than normal. We *must* check that it's not deleted. If it's
328  * not alone on its level, then we reject too --- this may be overly
329  * paranoid but better safe than sorry. Note we don't check P_ISROOT,
330  * because that's not set in a "fast root".
331  */
332  if (!P_IGNORE(rootopaque) &&
333  rootopaque->btpo.level == rootlevel &&
334  P_LEFTMOST(rootopaque) &&
335  P_RIGHTMOST(rootopaque))
336  {
337  /* OK, accept cached page as the root */
338  return rootbuf;
339  }
340  _bt_relbuf(rel, rootbuf);
341  /* Cache is stale, throw it away */
342  if (rel->rd_amcache)
343  pfree(rel->rd_amcache);
344  rel->rd_amcache = NULL;
345  }
346 
347  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
348  metapg = BufferGetPage(metabuf);
349  metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
350  metad = BTPageGetMeta(metapg);
351 
352  /* sanity-check the metapage */
353  if (!P_ISMETA(metaopaque) ||
354  metad->btm_magic != BTREE_MAGIC)
355  ereport(ERROR,
356  (errcode(ERRCODE_INDEX_CORRUPTED),
357  errmsg("index \"%s\" is not a btree",
358  RelationGetRelationName(rel))));
359 
360  if (metad->btm_version < BTREE_MIN_VERSION ||
361  metad->btm_version > BTREE_VERSION)
362  ereport(ERROR,
363  (errcode(ERRCODE_INDEX_CORRUPTED),
364  errmsg("version mismatch in index \"%s\": file version %d, "
365  "current version %d, minimal supported version %d",
366  RelationGetRelationName(rel),
367  metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
368 
369  /* if no root page initialized yet, do it */
370  if (metad->btm_root == P_NONE)
371  {
372  /* If access = BT_READ, caller doesn't want us to create root yet */
373  if (access == BT_READ)
374  {
375  _bt_relbuf(rel, metabuf);
376  return InvalidBuffer;
377  }
378 
379  /* trade in our read lock for a write lock */
380  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
381  LockBuffer(metabuf, BT_WRITE);
382 
383  /*
384  * Race condition: if someone else initialized the metadata between
385  * the time we released the read lock and acquired the write lock, we
386  * must avoid doing it again.
387  */
388  if (metad->btm_root != P_NONE)
389  {
390  /*
391  * Metadata initialized by someone else. In order to guarantee no
392  * deadlocks, we have to release the metadata page and start all
393  * over again. (Is that really true? But it's hardly worth trying
394  * to optimize this case.)
395  */
396  _bt_relbuf(rel, metabuf);
397  return _bt_getroot(rel, access);
398  }
399 
400  /*
401  * Get, initialize, write, and leave a lock of the appropriate type on
402  * the new root page. Since this is the first page in the tree, it's
403  * a leaf as well as the root.
404  */
405  rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
406  rootblkno = BufferGetBlockNumber(rootbuf);
407  rootpage = BufferGetPage(rootbuf);
408  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
409  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
410  rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
411  rootopaque->btpo.level = 0;
412  rootopaque->btpo_cycleid = 0;
413 
414  /* NO ELOG(ERROR) till meta is updated */
415  START_CRIT_SECTION();
416 
417  /* upgrade metapage if needed */
418  if (metad->btm_version < BTREE_NOVAC_VERSION)
419  _bt_upgrademetapage(metapg);
420 
421  metad->btm_root = rootblkno;
422  metad->btm_level = 0;
423  metad->btm_fastroot = rootblkno;
424  metad->btm_fastlevel = 0;
425  metad->btm_oldest_btpo_xact = InvalidTransactionId;
426  metad->btm_last_cleanup_num_heap_tuples = -1.0;
427 
428  MarkBufferDirty(rootbuf);
429  MarkBufferDirty(metabuf);
430 
431  /* XLOG stuff */
432  if (RelationNeedsWAL(rel))
433  {
434  xl_btree_newroot xlrec;
435  XLogRecPtr recptr;
436  xl_btree_metadata md;
437 
438  XLogBeginInsert();
439  XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
440  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
441 
442  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
443  md.version = metad->btm_version;
444  md.root = rootblkno;
445  md.level = 0;
446  md.fastroot = rootblkno;
447  md.fastlevel = 0;
448  md.oldest_btpo_xact = InvalidTransactionId;
449  md.last_cleanup_num_heap_tuples = -1.0;
450 
451  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
452 
453  xlrec.rootblk = rootblkno;
454  xlrec.level = 0;
455 
456  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
457 
458  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
459 
460  PageSetLSN(rootpage, recptr);
461  PageSetLSN(metapg, recptr);
462  }
463 
464  END_CRIT_SECTION();
465 
466  /*
467  * swap root write lock for read lock. There is no danger of anyone
468  * else accessing the new root page while it's unlocked, since no one
469  * else knows where it is yet.
470  */
471  LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
472  LockBuffer(rootbuf, BT_READ);
473 
474  /* okay, metadata is correct, release lock on it */
475  _bt_relbuf(rel, metabuf);
476  }
477  else
478  {
479  rootblkno = metad->btm_fastroot;
480  Assert(rootblkno != P_NONE);
481  rootlevel = metad->btm_fastlevel;
482 
483  /*
484  * Cache the metapage data for next time
485  */
486  _bt_cachemetadata(rel, metad);
487 
488  /*
489  * We are done with the metapage; arrange to release it via first
490  * _bt_relandgetbuf call
491  */
492  rootbuf = metabuf;
493 
494  for (;;)
495  {
496  rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
497  rootpage = BufferGetPage(rootbuf);
498  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
499 
500  if (!P_IGNORE(rootopaque))
501  break;
502 
503  /* it's dead, Jim. step right one page */
504  if (P_RIGHTMOST(rootopaque))
505  elog(ERROR, "no live root page found in index \"%s\"",
506  RelationGetRelationName(rel));
507  rootblkno = rootopaque->btpo_next;
508  }
509 
510  /* Note: can't check btpo.level on deleted pages */
511  if (rootopaque->btpo.level != rootlevel)
512  elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
513  rootblkno, RelationGetRelationName(rel),
514  rootopaque->btpo.level, rootlevel);
515  }
516 
517  /*
518  * By here, we have a pin and read lock on the root page, and no lock set
519  * on the metadata page. Return the root page's buffer.
520  */
521  return rootbuf;
522 }
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:361
BlockNumber rootblk
Definition: nbtxlog.h:247
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:86
#define BTP_ROOT
Definition: nbtree.h:72
static void _bt_cachemetadata(Relation rel, BTMetaPageData *input)
Definition: nbtpage.c:116
BlockNumber btpo_next
Definition: nbtree.h:58
#define P_IGNORE(opaque)
Definition: nbtree.h:194
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:89
uint32 btm_version
Definition: nbtree.h:100
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:932
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:796
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1458
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:213
#define BTREE_VERSION
Definition: nbtree.h:133
uint32 btm_magic
Definition: nbtree.h:99
#define BTP_LEAF
Definition: nbtree.h:71
union BTPageOpaqueData::@46 btpo
#define END_CRIT_SECTION()
Definition: miscadmin.h:134
BlockNumber root
Definition: nbtxlog.h:50
#define P_NONE
Definition: nbtree.h:181
#define InvalidBuffer
Definition: buf.h:25
#define REGBUF_WILL_INIT
Definition: xloginsert.h:33
#define START_CRIT_SECTION()
Definition: miscadmin.h:132
int errcode(int sqlerrcode)
Definition: elog.c:570
uint32 level
Definition: nbtxlog.h:248
uint32 BlockNumber
Definition: block.h:31
#define P_NEW
Definition: bufmgr.h:81
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
#define P_ISMETA(opaque)
Definition: nbtree.h:192
#define BT_READ
Definition: nbtree.h:402
BlockNumber btm_fastroot
Definition: nbtree.h:103
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:35
void pfree(void *pointer)
Definition: mcxt.c:1031
#define BTREE_MAGIC
Definition: nbtree.h:132
#define ERROR
Definition: elog.h:43
float8 last_cleanup_num_heap_tuples
Definition: nbtxlog.h:55
BTCycleId btpo_cycleid
Definition: nbtree.h:65
TransactionId oldest_btpo_xact
Definition: nbtxlog.h:54
#define BTPageGetMeta(p)
Definition: nbtree.h:112
BlockNumber btpo_prev
Definition: nbtree.h:57
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define InvalidTransactionId
Definition: transam.h:31
#define RelationGetRelationName(relation)
Definition: rel.h:448
#define P_LEFTMOST(opaque)
Definition: nbtree.h:187
unsigned int uint32
Definition: c.h:358
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:135
Buffer _bt_getroot(Relation rel, int access)
Definition: nbtpage.c:291
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define ereport(elevel, rest)
Definition: elog.h:141
#define BTREE_METAPAGE
Definition: nbtree.h:131
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:251
uint32 version
Definition: nbtxlog.h:49
uint32 btm_fastlevel
Definition: nbtree.h:104
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:323
uint32 level
Definition: nbtree.h:61
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:415
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3590
BlockNumber btm_root
Definition: nbtree.h:101
#define BTREE_MIN_VERSION
Definition: nbtree.h:134
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:951
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:732
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:108
#define RelationNeedsWAL(relation)
Definition: rel.h:516
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2613
int errmsg(const char *fmt,...)
Definition: elog.c:784
uint32 fastlevel
Definition: nbtxlog.h:53
uint32 btm_level
Definition: nbtree.h:102
#define elog(elevel,...)
Definition: elog.h:226
uint32 level
Definition: nbtxlog.h:51
BlockNumber fastroot
Definition: nbtxlog.h:52
TransactionId btm_oldest_btpo_xact
Definition: nbtree.h:106
void * rd_amcache
Definition: rel.h:176
#define BT_WRITE
Definition: nbtree.h:403
void XLogBeginInsert(void)
Definition: xloginsert.c:120
uint16 btpo_flags
Definition: nbtree.h:64
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
Pointer Page
Definition: bufpage.h:78

◆ _bt_getrootheight()

int _bt_getrootheight ( Relation  rel)

Definition at line 635 of file nbtpage.c.

References _bt_cachemetadata(), _bt_getbuf(), _bt_getmeta(), _bt_relbuf(), Assert, BT_READ, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTREE_MAGIC, BTREE_METAPAGE, BTREE_NOVAC_VERSION, P_NONE, and RelationData::rd_amcache.

Referenced by _bt_insertonpg(), and get_relation_info().

/*
 * _bt_getrootheight() -- Return the (fast-root) height of the btree, caching
 * the metapage data on first use; returns 0 when no root exists yet.
 * Listing repaired: src line 663 (NOVAC-version Assert, present in the
 * sibling listings at 313/714) was hyperlink text dropped during extraction.
 */
636 {
637  BTMetaPageData *metad;
638 
639  if (rel->rd_amcache == NULL)
640  {
641  Buffer metabuf;
642 
643  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
644  metad = _bt_getmeta(rel, metabuf);
645 
646  /*
647  * If there's no root page yet, _bt_getroot() doesn't expect a cache
648  * to be made, so just stop here and report the index height is zero.
649  * (XXX perhaps _bt_getroot() should be changed to allow this case.)
650  */
651  if (metad->btm_root == P_NONE)
652  {
653  _bt_relbuf(rel, metabuf);
654  return 0;
655  }
656 
657  /*
658  * Cache the metapage data for next time
659  */
660  _bt_cachemetadata(rel, metad);
661  /* We shouldn't have cached it if any of these fail */
662  Assert(metad->btm_magic == BTREE_MAGIC);
663  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
664  Assert(metad->btm_fastroot != P_NONE);
665  _bt_relbuf(rel, metabuf);
666  }
667 
668  /* Get cached page */
669  metad = (BTMetaPageData *) rel->rd_amcache;
670 
671  return metad->btm_fastlevel;
672 }
static void _bt_cachemetadata(Relation rel, BTMetaPageData *input)
Definition: nbtpage.c:116
static BTMetaPageData * _bt_getmeta(Relation rel, Buffer metabuf)
Definition: nbtpage.c:156
uint32 btm_version
Definition: nbtree.h:100
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:796
uint32 btm_magic
Definition: nbtree.h:99
#define P_NONE
Definition: nbtree.h:181
#define BT_READ
Definition: nbtree.h:402
BlockNumber btm_fastroot
Definition: nbtree.h:103
#define BTREE_MAGIC
Definition: nbtree.h:132
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:135
#define BTREE_METAPAGE
Definition: nbtree.h:131
uint32 btm_fastlevel
Definition: nbtree.h:104
BlockNumber btm_root
Definition: nbtree.h:101
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:951
#define Assert(condition)
Definition: c.h:732
void * rd_amcache
Definition: rel.h:176
int Buffer
Definition: buf.h:23

◆ _bt_getstackbuf()

Buffer _bt_getstackbuf ( Relation  rel,
BTStack  stack 
)

Definition at line 1914 of file nbtinsert.c.

References _bt_finish_split(), _bt_getbuf(), _bt_relbuf(), BT_WRITE, BTPageOpaqueData::btpo_next, BTreeInnerTupleGetDownLink, BTStackData::bts_blkno, BTStackData::bts_btentry, BTStackData::bts_offset, BTStackData::bts_parent, buf, BufferGetPage, InvalidBuffer, InvalidOffsetNumber, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, and PageGetSpecialPointer.

Referenced by _bt_insert_parent(), and _bt_lock_branch_parent().

/*
 * _bt_getstackbuf() -- Re-locate, on the parent level, the pivot tuple whose
 * downlink equals stack->bts_btentry, starting from the remembered position
 * (stack->bts_blkno / bts_offset) and moving right as needed.  On success the
 * stack is updated to the tuple's current location and the write-locked
 * buffer is returned; if the search falls off the rightmost page,
 * InvalidBuffer is returned.
 */
1915 {
1916  BlockNumber blkno;
1917  OffsetNumber start;
1918 
1919  blkno = stack->bts_blkno;
1920  start = stack->bts_offset;
1921 
1922  for (;;)
1923  {
1924  Buffer buf;
1925  Page page;
1926  BTPageOpaque opaque;
1927 
1928  buf = _bt_getbuf(rel, blkno, BT_WRITE);
1929  page = BufferGetPage(buf);
1930  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1931 
1932  if (P_INCOMPLETE_SPLIT(opaque))
1933  {
1934  _bt_finish_split(rel, buf, stack->bts_parent);
1935  continue; /* NOTE(review): retries same blkno -- assumes _bt_finish_split released buf; confirm in nbtinsert.c */
1936  }
1937 
1938  if (!P_IGNORE(opaque))
1939  {
1940  OffsetNumber offnum,
1941  minoff,
1942  maxoff;
1943  ItemId itemid;
1944  IndexTuple item;
1945 
1946  minoff = P_FIRSTDATAKEY(opaque);
1947  maxoff = PageGetMaxOffsetNumber(page);
1948 
1949  /*
1950  * start = InvalidOffsetNumber means "search the whole page". We
1951  * need this test anyway due to possibility that page has a high
1952  * key now when it didn't before.
1953  */
1954  if (start < minoff)
1955  start = minoff;
1956 
1957  /*
1958  * Need this check too, to guard against possibility that page
1959  * split since we visited it originally.
1960  */
1961  if (start > maxoff)
1962  start = OffsetNumberNext(maxoff);
1963 
1964  /*
1965  * These loops will check every item on the page --- but in an
1966  * order that's attuned to the probability of where it actually
1967  * is. Scan to the right first, then to the left.
1968  */
1969  for (offnum = start;
1970  offnum <= maxoff;
1971  offnum = OffsetNumberNext(offnum))
1972  {
1973  itemid = PageGetItemId(page, offnum);
1974  item = (IndexTuple) PageGetItem(page, itemid);
1975 
1976  if (BTreeInnerTupleGetDownLink(item) == stack->bts_btentry)
1977  {
1978  /* Return accurate pointer to where link is now */
1979  stack->bts_blkno = blkno;
1980  stack->bts_offset = offnum;
1981  return buf;
1982  }
1983  }
1984 
1985  for (offnum = OffsetNumberPrev(start);
1986  offnum >= minoff;
1987  offnum = OffsetNumberPrev(offnum))
1988  {
1989  itemid = PageGetItemId(page, offnum);
1990  item = (IndexTuple) PageGetItem(page, itemid);
1991 
1992  if (BTreeInnerTupleGetDownLink(item) == stack->bts_btentry)
1993  {
1994  /* Return accurate pointer to where link is now */
1995  stack->bts_blkno = blkno;
1996  stack->bts_offset = offnum;
1997  return buf;
1998  }
1999  }
2000  }
2001 
2002  /*
2003  * The item we're looking for moved right at least one page.
2004  */
2005  if (P_RIGHTMOST(opaque))
2006  {
2007  _bt_relbuf(rel, buf);
2008  return InvalidBuffer;
2009  }
2010  blkno = opaque->btpo_next; /* follow right-sibling link and search again */
2011  start = InvalidOffsetNumber;
2012  _bt_relbuf(rel, buf);
2013  }
2014 }
BlockNumber btpo_next
Definition: nbtree.h:58
#define P_IGNORE(opaque)
Definition: nbtree.h:194
#define BTreeInnerTupleGetDownLink(itup)
Definition: nbtree.h:301
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:796
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
#define InvalidBuffer
Definition: buf.h:25
uint32 BlockNumber
Definition: block.h:31
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:196
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
uint16 OffsetNumber
Definition: off.h:24
void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:1854
OffsetNumber bts_offset
Definition: nbtree.h:418
static char * buf
Definition: pg_test_fsync.c:68
IndexTupleData * IndexTuple
Definition: itup.h:53
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
#define InvalidOffsetNumber
Definition: off.h:26
BlockNumber bts_blkno
Definition: nbtree.h:417
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:951
BlockNumber bts_btentry
Definition: nbtree.h:419
#define OffsetNumberNext(offsetNumber)
Definition: off.h:53
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:55
struct BTStackData * bts_parent
Definition: nbtree.h:420
#define BT_WRITE
Definition: nbtree.h:403
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78

◆ _bt_gettrueroot()

Buffer _bt_gettrueroot ( Relation  rel)

Definition at line 539 of file nbtpage.c.

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_next, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_VERSION, BufferGetPage, elog, ereport, errcode(), errmsg(), ERROR, InvalidBuffer, BTPageOpaqueData::level, P_IGNORE, P_ISMETA, P_NONE, P_RIGHTMOST, PageGetSpecialPointer, pfree(), RelationData::rd_amcache, and RelationGetRelationName.

Referenced by _bt_get_endpoint().

/*
 * _bt_gettrueroot() -- Get the true root page (not the fast root), read-locked,
 * flushing any cached metapage data first; returns InvalidBuffer when no root
 * has been initialized.  Listing repaired: src lines 579-580 (ereport
 * arguments) and 610 (elog argument) were hyperlink text dropped during
 * extraction and are restored below.
 */
540 {
541  Buffer metabuf;
542  Page metapg;
543  BTPageOpaque metaopaque;
544  Buffer rootbuf;
545  Page rootpage;
546  BTPageOpaque rootopaque;
547  BlockNumber rootblkno;
548  uint32 rootlevel;
549  BTMetaPageData *metad;
550 
551  /*
552  * We don't try to use cached metapage data here, since (a) this path is
553  * not performance-critical, and (b) if we are here it suggests our cache
554  * is out-of-date anyway. In light of point (b), it's probably safest to
555  * actively flush any cached metapage info.
556  */
557  if (rel->rd_amcache)
558  pfree(rel->rd_amcache);
559  rel->rd_amcache = NULL;
560 
561  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
562  metapg = BufferGetPage(metabuf);
563  metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
564  metad = BTPageGetMeta(metapg);
565 
566  if (!P_ISMETA(metaopaque) ||
567  metad->btm_magic != BTREE_MAGIC)
568  ereport(ERROR,
569  (errcode(ERRCODE_INDEX_CORRUPTED),
570  errmsg("index \"%s\" is not a btree",
571  RelationGetRelationName(rel))));
572 
573  if (metad->btm_version < BTREE_MIN_VERSION ||
574  metad->btm_version > BTREE_VERSION)
575  ereport(ERROR,
576  (errcode(ERRCODE_INDEX_CORRUPTED),
577  errmsg("version mismatch in index \"%s\": file version %d, "
578  "current version %d, minimal supported version %d",
579  RelationGetRelationName(rel),
580  metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
581 
582  /* if no root page initialized yet, fail */
583  if (metad->btm_root == P_NONE)
584  {
585  _bt_relbuf(rel, metabuf);
586  return InvalidBuffer;
587  }
588 
589  rootblkno = metad->btm_root;
590  rootlevel = metad->btm_level;
591 
592  /*
593  * We are done with the metapage; arrange to release it via first
594  * _bt_relandgetbuf call
595  */
596  rootbuf = metabuf;
597 
598  for (;;)
599  {
600  rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
601  rootpage = BufferGetPage(rootbuf);
602  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
603 
604  if (!P_IGNORE(rootopaque))
605  break;
606 
607  /* it's dead, Jim. step right one page */
608  if (P_RIGHTMOST(rootopaque))
609  elog(ERROR, "no live root page found in index \"%s\"",
610  RelationGetRelationName(rel));
611  rootblkno = rootopaque->btpo_next;
612  }
613 
614  /* Note: can't check btpo.level on deleted pages */
615  if (rootopaque->btpo.level != rootlevel)
616  elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
617  rootblkno, RelationGetRelationName(rel),
618  rootopaque->btpo.level, rootlevel);
619 
620  return rootbuf;
621 }
BlockNumber btpo_next
Definition: nbtree.h:58
#define P_IGNORE(opaque)
Definition: nbtree.h:194
uint32 btm_version
Definition: nbtree.h:100
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:932
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:796
#define BTREE_VERSION
Definition: nbtree.h:133
uint32 btm_magic
Definition: nbtree.h:99
union BTPageOpaqueData::@46 btpo
#define P_NONE
Definition: nbtree.h:181
#define InvalidBuffer
Definition: buf.h:25
int errcode(int sqlerrcode)
Definition: elog.c:570
uint32 BlockNumber
Definition: block.h:31
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
#define P_ISMETA(opaque)
Definition: nbtree.h:192
#define BT_READ
Definition: nbtree.h:402
void pfree(void *pointer)
Definition: mcxt.c:1031
#define BTREE_MAGIC
Definition: nbtree.h:132
#define ERROR
Definition: elog.h:43
#define BTPageGetMeta(p)
Definition: nbtree.h:112
#define RelationGetRelationName(relation)
Definition: rel.h:448
unsigned int uint32
Definition: c.h:358
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define ereport(elevel, rest)
Definition: elog.h:141
#define BTREE_METAPAGE
Definition: nbtree.h:131
uint32 level
Definition: nbtree.h:61
BlockNumber btm_root
Definition: nbtree.h:101
#define BTREE_MIN_VERSION
Definition: nbtree.h:134
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:951
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
int errmsg(const char *fmt,...)
Definition: elog.c:784
uint32 btm_level
Definition: nbtree.h:102
#define elog(elevel,...)
Definition: elog.h:226
void * rd_amcache
Definition: rel.h:176
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
Pointer Page
Definition: bufpage.h:78

◆ _bt_heapkeyspace()

bool _bt_heapkeyspace ( Relation  rel)

Definition at line 684 of file nbtpage.c.

References _bt_cachemetadata(), _bt_getbuf(), _bt_getmeta(), _bt_relbuf(), Assert, BT_READ, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTREE_MAGIC, BTREE_METAPAGE, BTREE_NOVAC_VERSION, P_NONE, and RelationData::rd_amcache.

Referenced by _bt_first(), _bt_mkscankey(), and bt_index_check_internal().

685 {
686  BTMetaPageData *metad;
687 
688  if (rel->rd_amcache == NULL)
689  {
690  Buffer metabuf;
691 
692  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
693  metad = _bt_getmeta(rel, metabuf);
694 
695  /*
696  * If there's no root page yet, _bt_getroot() doesn't expect a cache
697  * to be made, so just stop here. (XXX perhaps _bt_getroot() should
698  * be changed to allow this case.)
699  */
700  if (metad->btm_root == P_NONE)
701  {
702  uint32 btm_version = metad->btm_version;
703 
704  _bt_relbuf(rel, metabuf);
705  return btm_version > BTREE_NOVAC_VERSION;
706  }
707 
708  /*
709  * Cache the metapage data for next time
710  */
711  _bt_cachemetadata(rel, metad);
712  /* We shouldn't have cached it if any of these fail */
713  Assert(metad->btm_magic == BTREE_MAGIC);
714  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
715  Assert(metad->btm_fastroot != P_NONE);
716  _bt_relbuf(rel, metabuf);
717  }
718 
719  /* Get cached page */
720  metad = (BTMetaPageData *) rel->rd_amcache;
721 
722  return metad->btm_version > BTREE_NOVAC_VERSION;
723 }
static void _bt_cachemetadata(Relation rel, BTMetaPageData *input)
Definition: nbtpage.c:116
static BTMetaPageData * _bt_getmeta(Relation rel, Buffer metabuf)
Definition: nbtpage.c:156
uint32 btm_version
Definition: nbtree.h:100
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:796
uint32 btm_magic
Definition: nbtree.h:99
#define P_NONE
Definition: nbtree.h:181
#define BT_READ
Definition: nbtree.h:402
BlockNumber btm_fastroot
Definition: nbtree.h:103
#define BTREE_MAGIC
Definition: nbtree.h:132
unsigned int uint32
Definition: c.h:358
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:135
#define BTREE_METAPAGE
Definition: nbtree.h:131
BlockNumber btm_root
Definition: nbtree.h:101
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:951
#define Assert(condition)
Definition: c.h:732
void * rd_amcache
Definition: rel.h:176
int Buffer
Definition: buf.h:23

◆ _bt_initmetapage()

void _bt_initmetapage ( Page  page,
BlockNumber  rootbknum,
uint32  level 
)

Definition at line 51 of file nbtpage.c.

References _bt_pageinit(), BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_oldest_btpo_xact, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_META, BTPageGetMeta, BTPageOpaqueData::btpo_flags, BTREE_MAGIC, BTREE_VERSION, InvalidTransactionId, and PageGetSpecialPointer.

Referenced by _bt_uppershutdown(), and btbuildempty().

52 {
53  BTMetaPageData *metad;
54  BTPageOpaque metaopaque;
55 
56  _bt_pageinit(page, BLCKSZ);
57 
58  metad = BTPageGetMeta(page);
59  metad->btm_magic = BTREE_MAGIC;
60  metad->btm_version = BTREE_VERSION;
61  metad->btm_root = rootbknum;
62  metad->btm_level = level;
63  metad->btm_fastroot = rootbknum;
64  metad->btm_fastlevel = level;
65  metad->btm_oldest_btpo_xact = InvalidTransactionId;
66  metad->btm_last_cleanup_num_heap_tuples = -1.0;
67 
68  metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
69  metaopaque->btpo_flags = BTP_META;
70 
71  /*
72  * Set pd_lower just past the end of the metadata. This is essential,
73  * because without doing so, metadata will be lost if xlog.c compresses
74  * the page.
75  */
76  ((PageHeader) page)->pd_lower =
77  ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
78 }
uint32 btm_version
Definition: nbtree.h:100
#define BTREE_VERSION
Definition: nbtree.h:133
uint32 btm_magic
Definition: nbtree.h:99
#define BTP_META
Definition: nbtree.h:74
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
BlockNumber btm_fastroot
Definition: nbtree.h:103
#define BTREE_MAGIC
Definition: nbtree.h:132
#define BTPageGetMeta(p)
Definition: nbtree.h:112
#define InvalidTransactionId
Definition: transam.h:31
uint32 btm_fastlevel
Definition: nbtree.h:104
BlockNumber btm_root
Definition: nbtree.h:101
PageHeaderData * PageHeader
Definition: bufpage.h:166
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:108
uint32 btm_level
Definition: nbtree.h:102
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:963
TransactionId btm_oldest_btpo_xact
Definition: nbtree.h:106
uint16 btpo_flags
Definition: nbtree.h:64

◆ _bt_keep_natts_fast()

int _bt_keep_natts_fast ( Relation  rel,
IndexTuple  lastleft,
IndexTuple  firstright 
)

Definition at line 2335 of file nbtutils.c.

References attnum, datumIsEqual(), index_getattr, IndexRelationGetNumberOfKeyAttributes, RelationGetDescr, and TupleDescAttr.

Referenced by _bt_afternewitemoff(), _bt_split_penalty(), and _bt_strategy().

2336 {
2337  TupleDesc itupdesc = RelationGetDescr(rel);
2338  int keysz = IndexRelationGetNumberOfKeyAttributes(rel);
2339  int keepnatts;
2340 
2341  keepnatts = 1;
2342  for (int attnum = 1; attnum <= keysz; attnum++)
2343  {
2344  Datum datum1,
2345  datum2;
2346  bool isNull1,
2347  isNull2;
2348  Form_pg_attribute att;
2349 
2350  datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
2351  datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
2352  att = TupleDescAttr(itupdesc, attnum - 1);
2353 
2354  if (isNull1 != isNull2)
2355  break;
2356 
2357  if (!isNull1 &&
2358  !datumIsEqual(datum1, datum2, att->attbyval, att->attlen))
2359  break;
2360 
2361  keepnatts++;
2362  }
2363 
2364  return keepnatts;
2365 }
#define RelationGetDescr(relation)
Definition: rel.h:440
#define TupleDescAttr(tupdesc, i)
Definition: tupdesc.h:92
bool datumIsEqual(Datum value1, Datum value2, bool typByVal, int typLen)
Definition: datum.c:221
FormData_pg_attribute * Form_pg_attribute
Definition: pg_attribute.h:200
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:433
uintptr_t Datum
Definition: postgres.h:367
int16 attnum
Definition: pg_attribute.h:79
#define index_getattr(tup, attnum, tupleDesc, isnull)
Definition: itup.h:100