PostgreSQL Source Code  git master
nbtree.h File Reference
#include "access/amapi.h"
#include "access/itup.h"
#include "access/sdir.h"
#include "access/xlogreader.h"
#include "catalog/pg_index.h"
#include "lib/stringinfo.h"
#include "storage/bufmgr.h"
#include "storage/shm_toc.h"
Include dependency graph for nbtree.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures

struct  BTPageOpaqueData
 
struct  BTMetaPageData
 
struct  BTStackData
 
struct  BTScanInsertData
 
struct  BTInsertStateData
 
struct  BTScanPosItem
 
struct  BTScanPosData
 
struct  BTArrayKeyInfo
 
struct  BTScanOpaqueData
 

Macros

#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */
 
#define BTP_ROOT   (1 << 1) /* root page (has no parent) */
 
#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */
 
#define BTP_META   (1 << 3) /* meta-page */
 
#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */
 
#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */
 
#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples */
 
#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */
 
#define MAX_BT_CYCLE_ID   0xFF7F
 
#define BTPageGetMeta(p)   ((BTMetaPageData *) PageGetContents(p))
 
#define BTREE_METAPAGE   0 /* first page is meta */
 
#define BTREE_MAGIC   0x053162 /* magic number in metapage */
 
#define BTREE_VERSION   4 /* current version number */
 
#define BTREE_MIN_VERSION   2 /* minimal supported version number */
 
#define BTREE_NOVAC_VERSION   3 /* minimal version with all meta fields */
 
#define BTMaxItemSize(page)
 
#define BTMaxItemSizeNoHeapTid(page)
 
#define BTREE_MIN_FILLFACTOR   10
 
#define BTREE_DEFAULT_FILLFACTOR   90
 
#define BTREE_NONLEAF_FILLFACTOR   70
 
#define BTREE_SINGLEVAL_FILLFACTOR   96
 
#define P_NONE   0
 
#define P_LEFTMOST(opaque)   ((opaque)->btpo_prev == P_NONE)
 
#define P_RIGHTMOST(opaque)   ((opaque)->btpo_next == P_NONE)
 
#define P_ISLEAF(opaque)   (((opaque)->btpo_flags & BTP_LEAF) != 0)
 
#define P_ISROOT(opaque)   (((opaque)->btpo_flags & BTP_ROOT) != 0)
 
#define P_ISDELETED(opaque)   (((opaque)->btpo_flags & BTP_DELETED) != 0)
 
#define P_ISMETA(opaque)   (((opaque)->btpo_flags & BTP_META) != 0)
 
#define P_ISHALFDEAD(opaque)   (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)
 
#define P_IGNORE(opaque)   (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
 
#define P_HAS_GARBAGE(opaque)   (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
 
#define P_INCOMPLETE_SPLIT(opaque)   (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
 
#define P_HIKEY   ((OffsetNumber) 1)
 
#define P_FIRSTKEY   ((OffsetNumber) 2)
 
#define P_FIRSTDATAKEY(opaque)   (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
 
#define INDEX_ALT_TID_MASK   INDEX_AM_RESERVED_BIT
 
#define BT_RESERVED_OFFSET_MASK   0xF000
 
#define BT_N_KEYS_OFFSET_MASK   0x0FFF
 
#define BT_HEAP_TID_ATTR   0x1000
 
#define BTreeInnerTupleGetDownLink(itup)   ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))
 
#define BTreeInnerTupleSetDownLink(itup, blkno)   ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno))
 
#define BTreeTupleGetTopParent(itup)   ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))
 
#define BTreeTupleSetTopParent(itup, blkno)
 
#define BTreeTupleGetNAtts(itup, rel)
 
#define BTreeTupleSetNAtts(itup, n)
 
#define BTreeTupleGetHeapTID(itup)
 
#define BTreeTupleSetAltHeapTID(itup)
 
#define BTCommuteStrategyNumber(strat)   (BTMaxStrategyNumber + 1 - (strat))
 
#define BTORDER_PROC   1
 
#define BTSORTSUPPORT_PROC   2
 
#define BTINRANGE_PROC   3
 
#define BTNProcs   3
 
#define BT_READ   BUFFER_LOCK_SHARE
 
#define BT_WRITE   BUFFER_LOCK_EXCLUSIVE
 
#define BTScanPosIsPinned(scanpos)
 
#define BTScanPosUnpin(scanpos)
 
#define BTScanPosUnpinIfPinned(scanpos)
 
#define BTScanPosIsValid(scanpos)
 
#define BTScanPosInvalidate(scanpos)
 
#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */
 
#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */
 
#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */
 
#define SK_BT_DESC   (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
 
#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
 
#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4
 
#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5
 

Typedefs

typedef uint16 BTCycleId
 
typedef struct BTPageOpaqueData BTPageOpaqueData
 
typedef BTPageOpaqueData *BTPageOpaque
 
typedef struct BTMetaPageData BTMetaPageData
 
typedef struct BTStackData BTStackData
 
typedef BTStackData *BTStack
 
typedef struct BTScanInsertData BTScanInsertData
 
typedef BTScanInsertData *BTScanInsert
 
typedef struct BTInsertStateData BTInsertStateData
 
typedef BTInsertStateData *BTInsertState
 
typedef struct BTScanPosItem BTScanPosItem
 
typedef struct BTScanPosData BTScanPosData
 
typedef BTScanPosData *BTScanPos
 
typedef struct BTArrayKeyInfo BTArrayKeyInfo
 
typedef struct BTScanOpaqueData BTScanOpaqueData
 
typedef BTScanOpaqueData *BTScanOpaque
 

Functions

void btbuildempty (Relation index)
 
bool btinsert (Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, struct IndexInfo *indexInfo)
 
IndexScanDesc btbeginscan (Relation rel, int nkeys, int norderbys)
 
Size btestimateparallelscan (void)
 
void btinitparallelscan (void *target)
 
bool btgettuple (IndexScanDesc scan, ScanDirection dir)
 
int64 btgetbitmap (IndexScanDesc scan, TIDBitmap *tbm)
 
void btrescan (IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
 
void btparallelrescan (IndexScanDesc scan)
 
void btendscan (IndexScanDesc scan)
 
void btmarkpos (IndexScanDesc scan)
 
void btrestrpos (IndexScanDesc scan)
 
IndexBulkDeleteResult * btbulkdelete (IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
 
IndexBulkDeleteResult * btvacuumcleanup (IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
bool btcanreturn (Relation index, int attno)
 
bool _bt_parallel_seize (IndexScanDesc scan, BlockNumber *pageno)
 
void _bt_parallel_release (IndexScanDesc scan, BlockNumber scan_page)
 
void _bt_parallel_done (IndexScanDesc scan)
 
void _bt_parallel_advance_array_keys (IndexScanDesc scan)
 
bool _bt_doinsert (Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, Relation heapRel)
 
void _bt_finish_split (Relation rel, Buffer bbuf, BTStack stack)
 
Buffer _bt_getstackbuf (Relation rel, BTStack stack, BlockNumber child)
 
OffsetNumber _bt_findsplitloc (Relation rel, Page page, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
 
void _bt_initmetapage (Page page, BlockNumber rootbknum, uint32 level)
 
void _bt_update_meta_cleanup_info (Relation rel, TransactionId oldestBtpoXact, float8 numHeapTuples)
 
void _bt_upgrademetapage (Page page)
 
Buffer _bt_getroot (Relation rel, int access)
 
Buffer _bt_gettrueroot (Relation rel)
 
int _bt_getrootheight (Relation rel)
 
bool _bt_heapkeyspace (Relation rel)
 
void _bt_checkpage (Relation rel, Buffer buf)
 
Buffer _bt_getbuf (Relation rel, BlockNumber blkno, int access)
 
Buffer _bt_relandgetbuf (Relation rel, Buffer obuf, BlockNumber blkno, int access)
 
void _bt_relbuf (Relation rel, Buffer buf)
 
void _bt_pageinit (Page page, Size size)
 
bool _bt_page_recyclable (Page page)
 
void _bt_delitems_delete (Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, Relation heapRel)
 
void _bt_delitems_vacuum (Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, BlockNumber lastBlockVacuumed)
 
int _bt_pagedel (Relation rel, Buffer buf)
 
BTStack _bt_search (Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
 
Buffer _bt_moveright (Relation rel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access, Snapshot snapshot)
 
OffsetNumber _bt_binsrch_insert (Relation rel, BTInsertState insertstate)
 
int32 _bt_compare (Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost, Snapshot snapshot)
 
BTScanInsert _bt_mkscankey (Relation rel, IndexTuple itup)
 
void _bt_freestack (BTStack stack)
 
void _bt_preprocess_array_keys (IndexScanDesc scan)
 
void _bt_start_array_keys (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_advance_array_keys (IndexScanDesc scan, ScanDirection dir)
 
void _bt_mark_array_keys (IndexScanDesc scan)
 
void _bt_restore_array_keys (IndexScanDesc scan)
 
void _bt_preprocess_keys (IndexScanDesc scan)
 
bool _bt_checkkeys (IndexScanDesc scan, IndexTuple tuple, int tupnatts, ScanDirection dir, bool *continuescan)
 
void _bt_killitems (IndexScanDesc scan)
 
BTCycleId _bt_vacuum_cycleid (Relation rel)
 
BTCycleId _bt_start_vacuum (Relation rel)
 
void _bt_end_vacuum (Relation rel)
 
void _bt_end_vacuum_callback (int code, Datum arg)
 
Size BTreeShmemSize (void)
 
void BTreeShmemInit (void)
 
bytea * btoptions (Datum reloptions, bool validate)
 
bool btproperty (Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull)
 
char * btbuildphasename (int64 phasenum)
 
IndexTuple _bt_truncate (Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
 
int _bt_keep_natts_fast (Relation rel, IndexTuple lastleft, IndexTuple firstright)
 
bool _bt_check_natts (Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
 
void _bt_check_third_page (Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
 
bool btvalidate (Oid opclassoid)
 
IndexBuildResult * btbuild (Relation heap, Relation index, struct IndexInfo *indexInfo)
 
void _bt_parallel_build_main (dsm_segment *seg, shm_toc *toc)
 

Macro Definition Documentation

◆ BT_HEAP_TID_ATTR

#define BT_HEAP_TID_ATTR   0x1000

Definition at line 298 of file nbtree.h.

◆ BT_N_KEYS_OFFSET_MASK

#define BT_N_KEYS_OFFSET_MASK   0x0FFF

Definition at line 297 of file nbtree.h.

Referenced by _bt_check_natts().

◆ BT_READ

◆ BT_RESERVED_OFFSET_MASK

#define BT_RESERVED_OFFSET_MASK   0xF000

Definition at line 296 of file nbtree.h.

◆ BT_WRITE

◆ BTCommuteStrategyNumber

#define BTCommuteStrategyNumber (   strat)    (BTMaxStrategyNumber + 1 - (strat))

Definition at line 374 of file nbtree.h.

Referenced by _bt_compare_scankey_args(), and _bt_fix_scankey_strategy().

◆ BTINRANGE_PROC

#define BTINRANGE_PROC   3

Definition at line 394 of file nbtree.h.

Referenced by assignProcTypes(), btvalidate(), and transformFrameOffset().

◆ BTMaxItemSize

#define BTMaxItemSize (   page)
Value:
MAXALIGN_DOWN((PageGetPageSize(page) - \
MAXALIGN(SizeOfPageHeaderData + \
3*sizeof(ItemIdData) + \
3*sizeof(ItemPointerData)) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
#define SizeOfPageHeaderData
Definition: bufpage.h:216
#define PageGetPageSize(page)
Definition: bufpage.h:268
#define MAXALIGN(LEN)
Definition: c.h:685
#define MAXALIGN_DOWN(LEN)
Definition: c.h:697

Definition at line 147 of file nbtree.h.

Referenced by _bt_buildadd(), _bt_check_third_page(), _bt_findinsertloc(), and bt_target_page_check().

◆ BTMaxItemSizeNoHeapTid

#define BTMaxItemSizeNoHeapTid (   page)
Value:
MAXALIGN_DOWN((PageGetPageSize(page) - \
MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
#define SizeOfPageHeaderData
Definition: bufpage.h:216
#define PageGetPageSize(page)
Definition: bufpage.h:268
#define MAXALIGN(LEN)
Definition: c.h:685
#define MAXALIGN_DOWN(LEN)
Definition: c.h:697

Definition at line 153 of file nbtree.h.

Referenced by _bt_check_third_page(), and bt_target_page_check().

◆ BTNProcs

#define BTNProcs   3

Definition at line 395 of file nbtree.h.

Referenced by bthandler().

◆ BTORDER_PROC

◆ BTP_DELETED

#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */

Definition at line 73 of file nbtree.h.

Referenced by _bt_unlink_halfdead_page(), and btree_xlog_unlink_page().

◆ BTP_HALF_DEAD

#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */

◆ BTP_HAS_GARBAGE

#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples */

◆ BTP_INCOMPLETE_SPLIT

#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */

◆ BTP_LEAF

#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */

◆ BTP_META

#define BTP_META   (1 << 3) /* meta-page */

Definition at line 74 of file nbtree.h.

Referenced by _bt_initmetapage(), _bt_restore_meta(), and _bt_upgrademetapage().

◆ BTP_ROOT

#define BTP_ROOT   (1 << 1) /* root page (has no parent) */

Definition at line 72 of file nbtree.h.

Referenced by _bt_getroot(), _bt_newroot(), _bt_split(), _bt_uppershutdown(), and btree_xlog_newroot().

◆ BTP_SPLIT_END

#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */

Definition at line 76 of file nbtree.h.

Referenced by _bt_split(), btree_mask(), and btvacuumpage().

◆ BTPageGetMeta

◆ BTREE_DEFAULT_FILLFACTOR

#define BTREE_DEFAULT_FILLFACTOR   90

Definition at line 169 of file nbtree.h.

Referenced by _bt_findsplitloc(), and _bt_pagestate().

◆ BTREE_MAGIC

#define BTREE_MAGIC   0x053162 /* magic number in metapage */

◆ BTREE_METAPAGE

◆ BTREE_MIN_FILLFACTOR

#define BTREE_MIN_FILLFACTOR   10

Definition at line 168 of file nbtree.h.

◆ BTREE_MIN_VERSION

#define BTREE_MIN_VERSION   2 /* minimal supported version number */

◆ BTREE_NONLEAF_FILLFACTOR

#define BTREE_NONLEAF_FILLFACTOR   70

Definition at line 170 of file nbtree.h.

Referenced by _bt_findsplitloc(), and _bt_pagestate().

◆ BTREE_NOVAC_VERSION

◆ BTREE_SINGLEVAL_FILLFACTOR

#define BTREE_SINGLEVAL_FILLFACTOR   96

Definition at line 171 of file nbtree.h.

Referenced by _bt_findsplitloc().

◆ BTREE_VERSION

#define BTREE_VERSION   4 /* current version number */

◆ BTreeInnerTupleGetDownLink

◆ BTreeInnerTupleSetDownLink

#define BTreeInnerTupleSetDownLink (   itup,
  blkno 
)    ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno))

◆ BTreeTupleGetHeapTID

#define BTreeTupleGetHeapTID (   itup)
Value:
( \
(itup)->t_info & INDEX_ALT_TID_MASK && \
( \
(ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \
sizeof(ItemPointerData)) \
) \
: (itup)->t_info & INDEX_ALT_TID_MASK ? NULL : (ItemPointer) &((itup)->t_tid) \
)
#define ItemPointerGetOffsetNumberNoCheck(pointer)
Definition: itemptr.h:108
#define BT_HEAP_TID_ATTR
Definition: nbtree.h:298
#define INDEX_ALT_TID_MASK
Definition: nbtree.h:293
#define IndexTupleSize(itup)
Definition: itup.h:71

Definition at line 346 of file nbtree.h.

Referenced by _bt_check_natts(), _bt_compare(), _bt_mkscankey(), bt_target_page_check(), and BTreeTupleGetHeapTIDCareful().

◆ BTreeTupleGetNAtts

#define BTreeTupleGetNAtts (   itup,
  rel 
)
Value:
( \
(itup)->t_info & INDEX_ALT_TID_MASK ? \
( \
) \
: \
)
#define ItemPointerGetOffsetNumberNoCheck(pointer)
Definition: itemptr.h:108
#define INDEX_ALT_TID_MASK
Definition: nbtree.h:293
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:428
#define BT_N_KEYS_OFFSET_MASK
Definition: nbtree.h:297

Definition at line 327 of file nbtree.h.

Referenced by _bt_buildadd(), _bt_check_natts(), _bt_checkkeys(), _bt_compare(), _bt_insertonpg(), _bt_mkscankey(), _bt_newroot(), _bt_readpage(), _bt_split(), _bt_truncate(), _bt_uppershutdown(), and bt_target_page_check().

◆ BTreeTupleGetTopParent

#define BTreeTupleGetTopParent (   itup)    ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))

Definition at line 312 of file nbtree.h.

Referenced by _bt_unlink_halfdead_page(), and bt_downlink_missing_check().

◆ BTreeTupleSetAltHeapTID

#define BTreeTupleSetAltHeapTID (   itup)
Value:
do { \
Assert((itup)->t_info & INDEX_ALT_TID_MASK); \
ItemPointerSetOffsetNumber(&(itup)->t_tid, \
} while(0)
#define ItemPointerGetOffsetNumberNoCheck(pointer)
Definition: itemptr.h:108
#define BT_HEAP_TID_ATTR
Definition: nbtree.h:298
#define INDEX_ALT_TID_MASK
Definition: nbtree.h:293

Definition at line 360 of file nbtree.h.

Referenced by _bt_truncate().

◆ BTreeTupleSetNAtts

#define BTreeTupleSetNAtts (   itup,
 
)
Value:
do { \
(itup)->t_info |= INDEX_ALT_TID_MASK; \
ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \
} while(0)
#define INDEX_ALT_TID_MASK
Definition: nbtree.h:293
#define BT_N_KEYS_OFFSET_MASK
Definition: nbtree.h:297

Definition at line 336 of file nbtree.h.

Referenced by _bt_buildadd(), _bt_newroot(), _bt_pgaddtup(), _bt_sortaddtup(), and _bt_truncate().

◆ BTreeTupleSetTopParent

#define BTreeTupleSetTopParent (   itup,
  blkno 
)
Value:
do { \
ItemPointerSetBlockNumber(&((itup)->t_tid), (blkno)); \
BTreeTupleSetNAtts((itup), 0); \
} while(0)

Definition at line 314 of file nbtree.h.

Referenced by _bt_mark_page_halfdead(), _bt_unlink_halfdead_page(), btree_xlog_mark_page_halfdead(), and btree_xlog_unlink_page().

◆ BTScanPosInvalidate

#define BTScanPosInvalidate (   scanpos)
Value:
do { \
(scanpos).currPage = InvalidBlockNumber; \
(scanpos).nextPage = InvalidBlockNumber; \
(scanpos).buf = InvalidBuffer; \
(scanpos).lsn = InvalidXLogRecPtr; \
(scanpos).nextTupleOffset = 0; \
} while (0);
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
#define InvalidBuffer
Definition: buf.h:25
static char * buf
Definition: pg_test_fsync.c:68
#define InvalidBlockNumber
Definition: block.h:33

Definition at line 609 of file nbtree.h.

Referenced by _bt_endpoint(), _bt_first(), _bt_readnextpage(), _bt_steppage(), btbeginscan(), btmarkpos(), btrescan(), and btrestrpos().

◆ BTScanPosIsPinned

#define BTScanPosIsPinned (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BufferIsValid((scanpos).buf) \
)
static char * buf
Definition: pg_test_fsync.c:68
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113

Definition at line 586 of file nbtree.h.

Referenced by _bt_killitems(), _bt_readnextpage(), _bt_readpage(), _bt_steppage(), and btrestrpos().

◆ BTScanPosIsValid

#define BTScanPosIsValid (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BlockNumberIsValid((scanpos).currPage) \
)
static char * buf
Definition: pg_test_fsync.c:68
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113

Definition at line 603 of file nbtree.h.

Referenced by _bt_first(), _bt_killitems(), _bt_steppage(), btendscan(), btgettuple(), btmarkpos(), btrescan(), and btrestrpos().

◆ BTScanPosUnpin

#define BTScanPosUnpin (   scanpos)
Value:
do { \
ReleaseBuffer((scanpos).buf); \
(scanpos).buf = InvalidBuffer; \
} while (0)
#define InvalidBuffer
Definition: buf.h:25
static char * buf
Definition: pg_test_fsync.c:68

Definition at line 592 of file nbtree.h.

◆ BTScanPosUnpinIfPinned

#define BTScanPosUnpinIfPinned (   scanpos)
Value:
do { \
if (BTScanPosIsPinned(scanpos)) \
BTScanPosUnpin(scanpos); \
} while (0)
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:586

Definition at line 597 of file nbtree.h.

Referenced by _bt_readnextpage(), _bt_steppage(), btendscan(), btmarkpos(), btrescan(), and btrestrpos().

◆ BTSORTSUPPORT_PROC

#define BTSORTSUPPORT_PROC   2

Definition at line 393 of file nbtree.h.

Referenced by assignProcTypes(), btvalidate(), FinishSortSupportFunction(), and MJExamineQuals().

◆ INDEX_ALT_TID_MASK

#define INDEX_ALT_TID_MASK   INDEX_AM_RESERVED_BIT

Definition at line 293 of file nbtree.h.

Referenced by _bt_check_natts().

◆ MAX_BT_CYCLE_ID

#define MAX_BT_CYCLE_ID   0xFF7F

Definition at line 87 of file nbtree.h.

Referenced by _bt_start_vacuum().

◆ P_FIRSTDATAKEY

◆ P_FIRSTKEY

#define P_FIRSTKEY   ((OffsetNumber) 2)

◆ P_HAS_GARBAGE

#define P_HAS_GARBAGE (   opaque)    (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)

Definition at line 195 of file nbtree.h.

Referenced by _bt_findinsertloc(), and palloc_btree_page().

◆ P_HIKEY

◆ P_IGNORE

◆ P_INCOMPLETE_SPLIT

◆ P_ISDELETED

◆ P_ISHALFDEAD

#define P_ISHALFDEAD (   opaque)    (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)

◆ P_ISLEAF

◆ P_ISMETA

#define P_ISMETA (   opaque)    (((opaque)->btpo_flags & BTP_META) != 0)

Definition at line 192 of file nbtree.h.

Referenced by _bt_getmeta(), _bt_gettrueroot(), bt_page_items_bytea(), and palloc_btree_page().

◆ P_ISROOT

#define P_ISROOT (   opaque)    (((opaque)->btpo_flags & BTP_ROOT) != 0)

◆ P_LEFTMOST

#define P_LEFTMOST (   opaque)    ((opaque)->btpo_prev == P_NONE)

◆ P_NONE

◆ P_RIGHTMOST

◆ PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN

#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2

Definition at line 688 of file nbtree.h.

Referenced by _bt_spools_heapscan(), and btbuildphasename().

◆ PROGRESS_BTREE_PHASE_LEAF_LOAD

#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5

Definition at line 691 of file nbtree.h.

Referenced by _bt_leafbuild(), and btbuildphasename().

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_1

#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3

Definition at line 689 of file nbtree.h.

Referenced by _bt_leafbuild(), and btbuildphasename().

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_2

#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4

Definition at line 690 of file nbtree.h.

Referenced by _bt_leafbuild(), and btbuildphasename().

◆ SK_BT_DESC

◆ SK_BT_INDOPTION_SHIFT

#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */

Definition at line 679 of file nbtree.h.

Referenced by _bt_fix_scankey_strategy(), and _bt_mkscankey().

◆ SK_BT_NULLS_FIRST

◆ SK_BT_REQBKWD

#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */

Definition at line 678 of file nbtree.h.

Referenced by _bt_check_rowcompare(), _bt_checkkeys(), and _bt_mark_scankey_required().

◆ SK_BT_REQFWD

#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */

Definition at line 677 of file nbtree.h.

Referenced by _bt_check_rowcompare(), _bt_checkkeys(), and _bt_mark_scankey_required().

Typedef Documentation

◆ BTArrayKeyInfo

◆ BTCycleId

Definition at line 27 of file nbtree.h.

◆ BTInsertState

Definition at line 512 of file nbtree.h.

◆ BTInsertStateData

◆ BTMetaPageData

◆ BTPageOpaque

Definition at line 68 of file nbtree.h.

◆ BTPageOpaqueData

◆ BTScanInsert

Definition at line 480 of file nbtree.h.

◆ BTScanInsertData

◆ BTScanOpaque

Definition at line 670 of file nbtree.h.

◆ BTScanOpaqueData

◆ BTScanPos

Definition at line 584 of file nbtree.h.

◆ BTScanPosData

◆ BTScanPosItem

◆ BTStack

Definition at line 422 of file nbtree.h.

◆ BTStackData

Function Documentation

◆ _bt_advance_array_keys()

bool _bt_advance_array_keys ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 546 of file nbtutils.c.

References _bt_parallel_advance_array_keys(), BTScanOpaqueData::arrayKeyData, BTScanOpaqueData::arrayKeys, BTArrayKeyInfo::cur_elem, BTArrayKeyInfo::elem_values, i, BTArrayKeyInfo::num_elems, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, BTArrayKeyInfo::scan_key, ScanDirectionIsBackward, and ScanKeyData::sk_argument.

Referenced by btgetbitmap(), and btgettuple().

/*
 * Purpose: step the scan's array keys (from "= ANY(...)"/IN scan keys) to
 * their next combination, odometer-style, in the direction given by 'dir'.
 * Returns true if a new combination was produced, false once every array
 * key has wrapped (all combinations exhausted).
 */
547 {
548  BTScanOpaque so = (BTScanOpaque) scan->opaque;
549  bool found = false;
550  int i;
551 
552  /*
553  * We must advance the last array key most quickly, since it will
554  * correspond to the lowest-order index column among the available
555  * qualifications. This is necessary to ensure correct ordering of output
556  * when there are multiple array keys.
557  */
558  for (i = so->numArrayKeys - 1; i >= 0; i--)
559  {
560  BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
561  ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key];
562  int cur_elem = curArrayKey->cur_elem;
563  int num_elems = curArrayKey->num_elems;
564 
565  if (ScanDirectionIsBackward(dir))
566  {
567  if (--cur_elem < 0)
568  {
569  cur_elem = num_elems - 1;
570  found = false; /* need to advance next array key */
571  }
572  else
573  found = true;
574  }
575  else
576  {
577  if (++cur_elem >= num_elems)
578  {
579  cur_elem = 0;
580  found = false; /* need to advance next array key */
581  }
582  else
583  found = true;
584  }
585 
/* Element advanced (or wrapped): publish new position and datum into the
 * scan key so _bt_preprocess_keys/_bt_first see the updated qual value. */
586  curArrayKey->cur_elem = cur_elem;
587  skey->sk_argument = curArrayKey->elem_values[cur_elem];
588  if (found)
589  break;
590  }
591 
592  /* advance parallel scan */
593  if (scan->parallel_scan != NULL)
/* NOTE(review): extraction dropped source line 594 here — per the
 * "References" list above it is presumably the call
 * _bt_parallel_advance_array_keys(scan); — confirm against nbtutils.c. */
595 
596  return found;
597 }
Datum * elem_values
Definition: nbtree.h:625
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:670
int cur_elem
Definition: nbtree.h:622
int numArrayKeys
Definition: nbtree.h:637
ScanKey arrayKeyData
Definition: nbtree.h:636
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:151
int num_elems
Definition: nbtree.h:624
void _bt_parallel_advance_array_keys(IndexScanDesc scan)
Definition: nbtree.c:763
BTArrayKeyInfo * arrayKeys
Definition: nbtree.h:641
int i
Datum sk_argument
Definition: skey.h:72
int scan_key
Definition: nbtree.h:621

◆ _bt_binsrch_insert()

OffsetNumber _bt_binsrch_insert ( Relation  rel,
BTInsertState  insertstate 
)

Definition at line 440 of file nbtsearch.c.

References _bt_compare(), Assert, BTInsertStateData::bounds_valid, BTInsertStateData::buf, BufferGetPage, InvalidOffsetNumber, BTInsertStateData::itup_key, sort-test::key, BTInsertStateData::low, BTScanInsertData::nextkey, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber, PageGetSpecialPointer, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_check_unique(), _bt_findinsertloc(), and bt_rootdescend().

/*
 * Purpose: binary-search the leaf page pinned in insertstate->buf for the
 * first offset whose key is >= the insertion scan key, caching the
 * low/stricthigh bounds in insertstate so a repeat call on the same page
 * can resume instead of restarting.  Returns the insertion offset.
 */
441 {
442  BTScanInsert key = insertstate->itup_key;
443  Page page;
444  BTPageOpaque opaque;
445  OffsetNumber low,
446  high,
447  stricthigh;
448  int32 result,
449  cmpval;
450 
451  page = BufferGetPage(insertstate->buf);
452  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
453 
/* Only ever called for leaf pages, and inserts never use nextkey search */
454  Assert(P_ISLEAF(opaque));
455  Assert(!key->nextkey);
456 
457  if (!insertstate->bounds_valid)
458  {
459  /* Start new binary search */
460  low = P_FIRSTDATAKEY(opaque);
461  high = PageGetMaxOffsetNumber(page);
462  }
463  else
464  {
465  /* Restore result of previous binary search against same page */
466  low = insertstate->low;
467  high = insertstate->stricthigh;
468  }
469 
470  /* If there are no keys on the page, return the first available slot */
471  if (unlikely(high < low))
472  {
473  /* Caller can't reuse bounds */
474  insertstate->low = InvalidOffsetNumber;
475  insertstate->stricthigh = InvalidOffsetNumber;
476  insertstate->bounds_valid = false;
477  return low;
478  }
479 
480  /*
481  * Binary search to find the first key on the page >= scan key. (nextkey
482  * is always false when inserting).
483  *
484  * The loop invariant is: all slots before 'low' are < scan key, all slots
485  * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
486  * maintained to save additional search effort for caller.
487  *
488  * We can fall out when high == low.
489  */
490  if (!insertstate->bounds_valid)
491  high++; /* establish the loop invariant for high */
492  stricthigh = high; /* high initially strictly higher */
493 
494  cmpval = 1; /* !nextkey comparison value */
495 
496  while (high > low)
497  {
/* midpoint computed overflow-safely as low + half the gap */
498  OffsetNumber mid = low + ((high - low) / 2);
499 
500  /* We have low <= mid < high, so mid points at a real slot */
501 
502  result = _bt_compare(rel, key, page, mid);
503 
504  if (result >= cmpval)
505  low = mid + 1;
506  else
507  {
508  high = mid;
/* result < 0 means tuple at mid is strictly > scan key */
509  if (result != 0)
510  stricthigh = high;
511  }
512  }
513 
514  /*
515  * On a leaf page, a binary search always returns the first key >= scan
516  * key (at least in !nextkey case), which could be the last slot + 1. This
517  * is also the lower bound of cached search.
518  *
519  * stricthigh may also be the last slot + 1, which prevents caller from
520  * using bounds directly, but is still useful to us if we're called a
521  * second time with cached bounds (cached low will be < stricthigh when
522  * that happens).
523  */
524  insertstate->low = low;
525  insertstate->stricthigh = stricthigh;
526  insertstate->bounds_valid = true;
527 
528  return low;
529 }
bool bounds_valid
Definition: nbtree.h:507
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
OffsetNumber stricthigh
Definition: nbtree.h:509
OffsetNumber low
Definition: nbtree.h:508
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
signed int int32
Definition: c.h:346
uint16 OffsetNumber
Definition: off.h:24
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:554
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define InvalidOffsetNumber
Definition: off.h:26
BTScanInsert itup_key
Definition: nbtree.h:497
#define Assert(condition)
Definition: c.h:732
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define unlikely(x)
Definition: c.h:208
Pointer Page
Definition: bufpage.h:78
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_check_natts()

bool _bt_check_natts ( Relation  rel,
bool  heapkeyspace,
Page  page,
OffsetNumber  offnum 
)

Definition at line 2377 of file nbtutils.c.

References Assert, BT_N_KEYS_OFFSET_MASK, BTreeTupleGetHeapTID, BTreeTupleGetNAtts, FirstOffsetNumber, INDEX_ALT_TID_MASK, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, ItemPointerGetOffsetNumber, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_ISLEAF, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, StaticAssertStmt, IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_compare(), and bt_target_page_check().

/*
 * Purpose: sanity-check that the index tuple at 'offnum' on 'page' carries
 * a plausible number of attributes for its role (non-pivot leaf tuple,
 * high key, downlink, or negative-infinity item), in both heapkeyspace
 * (version 4) and older !heapkeyspace on-disk formats.  Returns true when
 * the attribute count is consistent; used by assertions and amcheck.
 */
2378 {
/* NOTE(review): extraction dropped source lines 2379-2381 here — per the
 * "References" list above these presumably declare 'natts' (via
 * IndexRelationGetNumberOfAttributes), 'nkeyatts' (via
 * IndexRelationGetNumberOfKeyAttributes) and 'opaque' (via
 * PageGetSpecialPointer), all used below — confirm against nbtutils.c. */
2382  IndexTuple itup;
2383  int tupnatts;
2384 
2385  /*
2386  * We cannot reliably test a deleted or half-dead page, since they have
2387  * dummy high keys
2388  */
2389  if (P_IGNORE(opaque))
2390  return true;
2391 
2392  Assert(offnum >= FirstOffsetNumber &&
2393  offnum <= PageGetMaxOffsetNumber(page));
2394 
2395  /*
2396  * Mask allocated for number of keys in index tuple must be able to fit
2397  * maximum possible number of index attributes
2398  */
/* NOTE(review): extraction dropped source line 2399 — presumably the
 * StaticAssertStmt(BT_N_KEYS_OFFSET_MASK >= INDEX_MAX_KEYS, opening of the
 * statement completed on the next line — confirm against nbtutils.c. */
2400  "BT_N_KEYS_OFFSET_MASK can't fit INDEX_MAX_KEYS");
2401 
2402  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2403  tupnatts = BTreeTupleGetNAtts(itup, rel);
2404 
2405  if (P_ISLEAF(opaque))
2406  {
2407  if (offnum >= P_FIRSTDATAKEY(opaque))
2408  {
2409  /*
2410  * Non-pivot tuples currently never use alternative heap TID
2411  * representation -- even those within heapkeyspace indexes
2412  */
2413  if ((itup->t_info & INDEX_ALT_TID_MASK) != 0)
2414  return false;
2415 
2416  /*
2417  * Leaf tuples that are not the page high key (non-pivot tuples)
2418  * should never be truncated. (Note that tupnatts must have been
2419  * inferred, rather than coming from an explicit on-disk
2420  * representation.)
2421  */
2422  return tupnatts == natts;
2423  }
2424  else
2425  {
2426  /*
2427  * Rightmost page doesn't contain a page high key, so tuple was
2428  * checked above as ordinary leaf tuple
2429  */
2430  Assert(!P_RIGHTMOST(opaque));
2431 
2432  /*
2433  * !heapkeyspace high key tuple contains only key attributes. Note
2434  * that tupnatts will only have been explicitly represented in
2435  * !heapkeyspace indexes that happen to have non-key attributes.
2436  */
2437  if (!heapkeyspace)
2438  return tupnatts == nkeyatts;
2439 
2440  /* Use generic heapkeyspace pivot tuple handling */
2441  }
2442  }
2443  else /* !P_ISLEAF(opaque) */
2444  {
2445  if (offnum == P_FIRSTDATAKEY(opaque))
2446  {
2447  /*
2448  * The first tuple on any internal page (possibly the first after
2449  * its high key) is its negative infinity tuple. Negative
2450  * infinity tuples are always truncated to zero attributes. They
2451  * are a particular kind of pivot tuple.
2452  */
2453  if (heapkeyspace)
2454  return tupnatts == 0;
2455 
2456  /*
2457  * The number of attributes won't be explicitly represented if the
2458  * negative infinity tuple was generated during a page split that
2459  * occurred with a version of Postgres before v11. There must be
2460  * a problem when there is an explicit representation that is
2461  * non-zero, or when there is no explicit representation and the
2462  * tuple is evidently not a pre-pg_upgrade tuple.
2463  *
2464  * Prior to v11, downlinks always had P_HIKEY as their offset. Use
2465  * that to decide if the tuple is a pre-v11 tuple.
2466  */
2467  return tupnatts == 0 ||
2468  ((itup->t_info & INDEX_ALT_TID_MASK) == 0 &&
/* NOTE(review): extraction dropped source line 2469 — per the "References"
 * list it presumably compares ItemPointerGetOffsetNumber(&(itup)->t_tid)
 * against P_HIKEY to complete this condition — confirm against nbtutils.c. */
2470  }
2471  else
2472  {
2473  /*
2474  * !heapkeyspace downlink tuple with separator key contains only
2475  * key attributes. Note that tupnatts will only have been
2476  * explicitly represented in !heapkeyspace indexes that happen to
2477  * have non-key attributes.
2478  */
2479  if (!heapkeyspace)
2480  return tupnatts == nkeyatts;
2481 
2482  /* Use generic heapkeyspace pivot tuple handling */
2483  }
2484 
2485  }
2486 
2487  /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
2488  Assert(heapkeyspace);
2489 
2490  /*
2491  * Explicit representation of the number of attributes is mandatory with
2492  * heapkeyspace index pivot tuples, regardless of whether or not there are
2493  * non-key attributes.
2494  */
2495  if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
2496  return false;
2497 
2498  /*
2499  * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
2500  * when any other key attribute is truncated
2501  */
2502  if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
2503  return false;
2504 
2505  /*
2506  * Pivot tuple must have at least one untruncated key attribute (minus
2507  * infinity pivot tuples are the only exception). Pivot tuples can never
2508  * represent that there is a value present for a key attribute that
2509  * exceeds pg_index.indnkeyatts for the index.
2510  */
2511  return tupnatts > 0 && tupnatts <= nkeyatts;
2512 }
signed short int16
Definition: c.h:345
#define P_IGNORE(opaque)
Definition: nbtree.h:194
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
ItemPointerData t_tid
Definition: itup.h:37
#define BTreeTupleGetHeapTID(itup)
Definition: nbtree.h:346
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:327
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:842
#define INDEX_ALT_TID_MASK
Definition: nbtree.h:293
#define FirstOffsetNumber
Definition: off.h:27
IndexTupleData * IndexTuple
Definition: itup.h:53
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:428
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:435
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
#define Assert(condition)
Definition: c.h:732
#define INDEX_MAX_KEYS
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define BT_N_KEYS_OFFSET_MASK
Definition: nbtree.h:297
#define ItemPointerGetOffsetNumber(pointer)
Definition: itemptr.h:117
#define P_HIKEY
Definition: nbtree.h:217
unsigned short t_info
Definition: itup.h:49
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_check_third_page()

void _bt_check_third_page ( Relation  rel,
Relation  heap,
bool  needheaptidspace,
Page  page,
IndexTuple  newtup 
)

Definition at line 2527 of file nbtutils.c.

References BTMaxItemSize, BTMaxItemSizeNoHeapTid, BTREE_NOVAC_VERSION, BTREE_VERSION, elog, ereport, errcode(), errdetail(), errhint(), errmsg(), ERROR, errtableconstraint(), IndexTupleSize, ItemPointerGetBlockNumber, ItemPointerGetOffsetNumber, MAXALIGN, P_ISLEAF, PageGetSpecialPointer, RelationGetRelationName, and IndexTupleData::t_tid.

Referenced by _bt_buildadd(), and _bt_findinsertloc().

2529 {
2530  Size itemsz;
2531  BTPageOpaque opaque;
2532 
2533  itemsz = MAXALIGN(IndexTupleSize(newtup));
2534 
2535  /* Double check item size against limit */
2536  if (itemsz <= BTMaxItemSize(page))
2537  return;
2538 
2539  /*
2540  * Tuple is probably too large to fit on page, but it's possible that the
2541  * index uses version 2 or version 3, or that page is an internal page, in
2542  * which case a slightly higher limit applies.
2543  */
2544  if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page))
2545  return;
2546 
2547  /*
2548  * Internal page insertions cannot fail here, because that would mean that
2549  * an earlier leaf level insertion that should have failed didn't
2550  */
2551  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2552  if (!P_ISLEAF(opaque))
2553  elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
2554  itemsz, RelationGetRelationName(rel));
2555 
2556  ereport(ERROR,
2557  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2558  errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
2559  itemsz,
2560  needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
2561  needheaptidspace ? BTMaxItemSize(page) :
2562  BTMaxItemSizeNoHeapTid(page),
2563  RelationGetRelationName(rel)),
2564  errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
2565  ItemPointerGetBlockNumber(&newtup->t_tid),
2566  ItemPointerGetOffsetNumber(&newtup->t_tid),
2567  RelationGetRelationName(heap)),
2568  errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
2569  "Consider a function index of an MD5 hash of the value, "
2570  "or use full text indexing."),
2571  errtableconstraint(rel, RelationGetRelationName(rel))));
2572 }
int errhint(const char *fmt,...)
Definition: elog.c:974
#define BTREE_VERSION
Definition: nbtree.h:133
ItemPointerData t_tid
Definition: itup.h:37
#define BTMaxItemSizeNoHeapTid(page)
Definition: nbtree.h:153
int errcode(int sqlerrcode)
Definition: elog.c:570
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
int errtableconstraint(Relation rel, const char *conname)
Definition: relcache.c:5222
#define ERROR
Definition: elog.h:43
int errdetail(const char *fmt,...)
Definition: elog.c:860
#define RelationGetRelationName(relation)
Definition: rel.h:450
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:135
#define ereport(elevel, rest)
Definition: elog.h:141
size_t Size
Definition: c.h:466
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define MAXALIGN(LEN)
Definition: c.h:685
#define ItemPointerGetOffsetNumber(pointer)
Definition: itemptr.h:117
#define BTMaxItemSize(page)
Definition: nbtree.h:147
int errmsg(const char *fmt,...)
Definition: elog.c:784
#define elog(elevel,...)
Definition: elog.h:226
#define ItemPointerGetBlockNumber(pointer)
Definition: itemptr.h:98
#define IndexTupleSize(itup)
Definition: itup.h:71
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_checkkeys()

bool _bt_checkkeys ( IndexScanDesc  scan,
IndexTuple  tuple,
int  tupnatts,
ScanDirection  dir,
bool continuescan 
)

Definition at line 1357 of file nbtutils.c.

References _bt_check_rowcompare(), Assert, BTreeTupleGetNAtts, DatumGetBool, FunctionCall2Coll(), index_getattr, IndexScanDescData::indexRelation, sort-test::key, BTScanOpaqueData::keyData, BTScanOpaqueData::numberOfKeys, IndexScanDescData::opaque, RelationGetDescr, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_NULLS_FIRST, SK_BT_REQBKWD, SK_BT_REQFWD, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, SK_ISNULL, SK_ROW_HEADER, SK_SEARCHNOTNULL, SK_SEARCHNULL, and test().

Referenced by _bt_readpage().

1359 {
1360  TupleDesc tupdesc;
1361  BTScanOpaque so;
1362  int keysz;
1363  int ikey;
1364  ScanKey key;
1365 
1366  Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
1367 
1368  *continuescan = true; /* default assumption */
1369 
1370  tupdesc = RelationGetDescr(scan->indexRelation);
1371  so = (BTScanOpaque) scan->opaque;
1372  keysz = so->numberOfKeys;
1373 
1374  for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
1375  {
1376  Datum datum;
1377  bool isNull;
1378  Datum test;
1379 
1380  if (key->sk_attno > tupnatts)
1381  {
1382  /*
1383  * This attribute is truncated (must be high key). The value for
1384  * this attribute in the first non-pivot tuple on the page to the
1385  * right could be any possible value. Assume that truncated
1386  * attribute passes the qual.
1387  */
1388  Assert(ScanDirectionIsForward(dir));
1389  continue;
1390  }
1391 
1392  /* row-comparison keys need special processing */
1393  if (key->sk_flags & SK_ROW_HEADER)
1394  {
1395  if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir,
1396  continuescan))
1397  continue;
1398  return false;
1399  }
1400 
1401  datum = index_getattr(tuple,
1402  key->sk_attno,
1403  tupdesc,
1404  &isNull);
1405 
1406  if (key->sk_flags & SK_ISNULL)
1407  {
1408  /* Handle IS NULL/NOT NULL tests */
1409  if (key->sk_flags & SK_SEARCHNULL)
1410  {
1411  if (isNull)
1412  continue; /* tuple satisfies this qual */
1413  }
1414  else
1415  {
1416  Assert(key->sk_flags & SK_SEARCHNOTNULL);
1417  if (!isNull)
1418  continue; /* tuple satisfies this qual */
1419  }
1420 
1421  /*
1422  * Tuple fails this qual. If it's a required qual for the current
1423  * scan direction, then we can conclude no further tuples will
1424  * pass, either.
1425  */
1426  if ((key->sk_flags & SK_BT_REQFWD) &&
1427  ScanDirectionIsForward(dir))
1428  *continuescan = false;
1429  else if ((key->sk_flags & SK_BT_REQBKWD) &&
1430  ScanDirectionIsBackward(dir))
1431  *continuescan = false;
1432 
1433  /*
1434  * In any case, this indextuple doesn't match the qual.
1435  */
1436  return false;
1437  }
1438 
1439  if (isNull)
1440  {
1441  if (key->sk_flags & SK_BT_NULLS_FIRST)
1442  {
1443  /*
1444  * Since NULLs are sorted before non-NULLs, we know we have
1445  * reached the lower limit of the range of values for this
1446  * index attr. On a backward scan, we can stop if this qual
1447  * is one of the "must match" subset. We can stop regardless
1448  * of whether the qual is > or <, so long as it's required,
1449  * because it's not possible for any future tuples to pass. On
1450  * a forward scan, however, we must keep going, because we may
1451  * have initially positioned to the start of the index.
1452  */
1453  if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
1454  ScanDirectionIsBackward(dir))
1455  *continuescan = false;
1456  }
1457  else
1458  {
1459  /*
1460  * Since NULLs are sorted after non-NULLs, we know we have
1461  * reached the upper limit of the range of values for this
1462  * index attr. On a forward scan, we can stop if this qual is
1463  * one of the "must match" subset. We can stop regardless of
1464  * whether the qual is > or <, so long as it's required,
1465  * because it's not possible for any future tuples to pass. On
1466  * a backward scan, however, we must keep going, because we
1467  * may have initially positioned to the end of the index.
1468  */
1469  if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
1470  ScanDirectionIsForward(dir))
1471  *continuescan = false;
1472  }
1473 
1474  /*
1475  * In any case, this indextuple doesn't match the qual.
1476  */
1477  return false;
1478  }
1479 
1480  test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
1481  datum, key->sk_argument);
1482 
1483  if (!DatumGetBool(test))
1484  {
1485  /*
1486  * Tuple fails this qual. If it's a required qual for the current
1487  * scan direction, then we can conclude no further tuples will
1488  * pass, either.
1489  *
1490  * Note: because we stop the scan as soon as any required equality
1491  * qual fails, it is critical that equality quals be used for the
1492  * initial positioning in _bt_first() when they are available. See
1493  * comments in _bt_first().
1494  */
1495  if ((key->sk_flags & SK_BT_REQFWD) &&
1496  ScanDirectionIsForward(dir))
1497  *continuescan = false;
1498  else if ((key->sk_flags & SK_BT_REQBKWD) &&
1499  ScanDirectionIsBackward(dir))
1500  *continuescan = false;
1501 
1502  /*
1503  * In any case, this indextuple doesn't match the qual.
1504  */
1505  return false;
1506  }
1507  }
1508 
1509  /* If we get here, the tuple passes all index quals. */
1510  return true;
1511 }
static void test(void)
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
#define RelationGetDescr(relation)
Definition: rel.h:442
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1150
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:327
Relation indexRelation
Definition: relscan.h:103
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:670
FmgrInfo sk_func
Definition: skey.h:71
#define DatumGetBool(X)
Definition: postgres.h:393
#define SK_SEARCHNOTNULL
Definition: skey.h:122
#define SK_ISNULL
Definition: skey.h:115
#define SK_ROW_HEADER
Definition: skey.h:117
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:681
uintptr_t Datum
Definition: postgres.h:367
static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, ScanDirection dir, bool *continuescan)
Definition: nbtutils.c:1523
#define SK_BT_REQFWD
Definition: nbtree.h:677
int sk_flags
Definition: skey.h:66
#define Assert(condition)
Definition: c.h:732
int numberOfKeys
Definition: nbtree.h:632
#define index_getattr(tup, attnum, tupleDesc, isnull)
Definition: itup.h:100
ScanKey keyData
Definition: nbtree.h:633
Oid sk_collation
Definition: skey.h:70
#define SK_BT_REQBKWD
Definition: nbtree.h:678
Datum sk_argument
Definition: skey.h:72
#define SK_SEARCHNULL
Definition: skey.h:121
AttrNumber sk_attno
Definition: skey.h:67

◆ _bt_checkpage()

void _bt_checkpage ( Relation  rel,
Buffer  buf 
)

Definition at line 690 of file nbtpage.c.

References BufferGetBlockNumber(), BufferGetPage, ereport, errcode(), errhint(), errmsg(), ERROR, MAXALIGN, PageGetSpecialSize, PageIsNew, and RelationGetRelationName.

Referenced by _bt_doinsert(), _bt_getbuf(), _bt_relandgetbuf(), btvacuumpage(), btvacuumscan(), and palloc_btree_page().

691 {
692  Page page = BufferGetPage(buf);
693 
694  /*
695  * ReadBuffer verifies that every newly-read page passes
696  * PageHeaderIsValid, which means it either contains a reasonably sane
697  * page header or is all-zero. We have to defend against the all-zero
698  * case, however.
699  */
700  if (PageIsNew(page))
701  ereport(ERROR,
702  (errcode(ERRCODE_INDEX_CORRUPTED),
703  errmsg("index \"%s\" contains unexpected zero page at block %u",
704  RelationGetRelationName(rel),
705  BufferGetBlockNumber(buf)),
706  errhint("Please REINDEX it.")));
707 
708  /*
709  * Additionally check that the special area looks sane.
710  */
711  if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
712  ereport(ERROR,
713  (errcode(ERRCODE_INDEX_CORRUPTED),
714  errmsg("index \"%s\" contains corrupted page at block %u",
715  RelationGetRelationName(rel),
716  BufferGetBlockNumber(buf)),
717  errhint("Please REINDEX it.")));
718 }
int errhint(const char *fmt,...)
Definition: elog.c:974
int errcode(int sqlerrcode)
Definition: elog.c:570
#define ERROR
Definition: elog.h:43
static char * buf
Definition: pg_test_fsync.c:68
#define RelationGetRelationName(relation)
Definition: rel.h:450
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define ereport(elevel, rest)
Definition: elog.h:141
#define MAXALIGN(LEN)
Definition: c.h:685
#define PageGetSpecialSize(page)
Definition: bufpage.h:300
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2613
#define PageIsNew(page)
Definition: bufpage.h:229
int errmsg(const char *fmt,...)
Definition: elog.c:784
Pointer Page
Definition: bufpage.h:78

◆ _bt_compare()

int32 _bt_compare ( Relation  rel,
BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)

Definition at line 554 of file nbtsearch.c.

References _bt_check_natts(), Assert, BTreeTupleGetHeapTID, BTreeTupleGetNAtts, DatumGetInt32, FunctionCall2Coll(), BTScanInsertData::heapkeyspace, i, index_getattr, IndexRelationGetNumberOfKeyAttributes, INVERT_COMPARE_RESULT, ItemPointerCompare(), BTScanInsertData::keysz, Min, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, BTScanInsertData::pivotsearch, RelationGetDescr, BTScanInsertData::scankeys, BTScanInsertData::scantid, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_binsrch_insert(), _bt_check_unique(), _bt_doinsert(), _bt_findinsertloc(), _bt_moveright(), bt_rootdescend(), invariant_g_offset(), invariant_l_nontarget_offset(), invariant_l_offset(), and invariant_leq_offset().

558 {
559  TupleDesc itupdesc = RelationGetDescr(rel);
560  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
561  IndexTuple itup;
562  ItemPointer heapTid;
563  ScanKey scankey;
564  int ncmpkey;
565  int ntupatts;
566 
567  Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
568  Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
569  Assert(key->heapkeyspace || key->scantid == NULL);
570 
571  /*
572  * Force result ">" if target item is first data item on an internal page
573  * --- see NOTE above.
574  */
575  if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
576  return 1;
577 
578  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
579  ntupatts = BTreeTupleGetNAtts(itup, rel);
580 
581  /*
582  * The scan key is set up with the attribute number associated with each
583  * term in the key. It is important that, if the index is multi-key, the
584  * scan contain the first k key attributes, and that they be in order. If
585  * you think about how multi-key ordering works, you'll understand why
586  * this is.
587  *
588  * We don't test for violation of this condition here, however. The
589  * initial setup for the index scan had better have gotten it right (see
590  * _bt_first).
591  */
592 
593  ncmpkey = Min(ntupatts, key->keysz);
594  Assert(key->heapkeyspace || ncmpkey == key->keysz);
595  scankey = key->scankeys;
596  for (int i = 1; i <= ncmpkey; i++)
597  {
598  Datum datum;
599  bool isNull;
600  int32 result;
601 
602  datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
603 
604  /* see comments about NULLs handling in btbuild */
605  if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
606  {
607  if (isNull)
608  result = 0; /* NULL "=" NULL */
609  else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
610  result = -1; /* NULL "<" NOT_NULL */
611  else
612  result = 1; /* NULL ">" NOT_NULL */
613  }
614  else if (isNull) /* key is NOT_NULL and item is NULL */
615  {
616  if (scankey->sk_flags & SK_BT_NULLS_FIRST)
617  result = 1; /* NOT_NULL ">" NULL */
618  else
619  result = -1; /* NOT_NULL "<" NULL */
620  }
621  else
622  {
623  /*
624  * The sk_func needs to be passed the index value as left arg and
625  * the sk_argument as right arg (they might be of different
626  * types). Since it is convenient for callers to think of
627  * _bt_compare as comparing the scankey to the index item, we have
628  * to flip the sign of the comparison result. (Unless it's a DESC
629  * column, in which case we *don't* flip the sign.)
630  */
631  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
632  scankey->sk_collation,
633  datum,
634  scankey->sk_argument));
635 
636  if (!(scankey->sk_flags & SK_BT_DESC))
637  INVERT_COMPARE_RESULT(result);
638  }
639 
640  /* if the keys are unequal, return the difference */
641  if (result != 0)
642  return result;
643 
644  scankey++;
645  }
646 
647  /*
648  * All non-truncated attributes (other than heap TID) were found to be
649  * equal. Treat truncated attributes as minus infinity when scankey has a
650  * key attribute value that would otherwise be compared directly.
651  *
652  * Note: it doesn't matter if ntupatts includes non-key attributes;
653  * scankey won't, so explicitly excluding non-key attributes isn't
654  * necessary.
655  */
656  if (key->keysz > ntupatts)
657  return 1;
658 
659  /*
660  * Use the heap TID attribute and scantid to try to break the tie. The
661  * rules are the same as any other key attribute -- only the
662  * representation differs.
663  */
664  heapTid = BTreeTupleGetHeapTID(itup);
665  if (key->scantid == NULL)
666  {
667  /*
668  * Most searches have a scankey that is considered greater than a
669  * truncated pivot tuple if and when the scankey has equal values for
670  * attributes up to and including the least significant untruncated
671  * attribute in tuple.
672  *
673  * For example, if an index has the minimum two attributes (single
674  * user key attribute, plus heap TID attribute), and a page's high key
675  * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
676  * will not descend to the page to the left. The search will descend
677  * right instead. The truncated attribute in pivot tuple means that
678  * all non-pivot tuples on the page to the left are strictly < 'foo',
679  * so it isn't necessary to descend left. In other words, search
680  * doesn't have to descend left because it isn't interested in a match
681  * that has a heap TID value of -inf.
682  *
683  * However, some searches (pivotsearch searches) actually require that
684  * we descend left when this happens. -inf is treated as a possible
685  * match for omitted scankey attribute(s). This is needed by page
686  * deletion, which must re-find leaf pages that are targets for
687  * deletion using their high keys.
688  *
689  * Note: the heap TID part of the test ensures that scankey is being
690  * compared to a pivot tuple with one or more truncated key
691  * attributes.
692  *
693  * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
694  * left here, since they have no heap TID attribute (and cannot have
695  * any -inf key values in any case, since truncation can only remove
696  * non-key attributes). !heapkeyspace searches must always be
697  * prepared to deal with matches on both sides of the pivot once the
698  * leaf level is reached.
699  */
700  if (key->heapkeyspace && !key->pivotsearch &&
701  key->keysz == ntupatts && heapTid == NULL)
702  return 1;
703 
704  /* All provided scankey arguments found to be equal */
705  return 0;
706  }
707 
708  /*
709  * Treat truncated heap TID as minus infinity, since scankey has a key
710  * attribute value (scantid) that would otherwise be compared directly
711  */
712  Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
713  if (heapTid == NULL)
714  return 1;
715 
716  Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
717  return ItemPointerCompare(key->scantid, heapTid);
718 }
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:52
#define DatumGetInt32(X)
Definition: postgres.h:472
#define RelationGetDescr(relation)
Definition: rel.h:442
ItemPointer scantid
Definition: nbtree.h:475
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
#define Min(x, y)
Definition: c.h:904
#define BTreeTupleGetHeapTID(itup)
Definition: nbtree.h:346
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1150
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:327
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
signed int int32
Definition: c.h:346
IndexTupleData * IndexTuple
Definition: itup.h:53
FmgrInfo sk_func
Definition: skey.h:71
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:435
#define SK_ISNULL
Definition: skey.h:115
bool pivotsearch
Definition: nbtree.h:474
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:681
uintptr_t Datum
Definition: postgres.h:367
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:2377
int sk_flags
Definition: skey.h:66
#define Assert(condition)
Definition: c.h:732
#define SK_BT_DESC
Definition: nbtree.h:680
bool heapkeyspace
Definition: nbtree.h:471
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define index_getattr(tup, attnum, tupleDesc, isnull)
Definition: itup.h:100
ScanKeyData scankeys[INDEX_MAX_KEYS]
Definition: nbtree.h:477
Oid sk_collation
Definition: skey.h:70
int i
Datum sk_argument
Definition: skey.h:72
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1044
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
AttrNumber sk_attno
Definition: skey.h:67
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_delitems_delete()

void _bt_delitems_delete ( Relation  rel,
Buffer  buf,
OffsetNumber itemnos,
int  nitems,
Relation  heapRel 
)

Definition at line 1057 of file nbtpage.c.

References Assert, BTP_HAS_GARBAGE, BTPageOpaqueData::btpo_flags, BufferGetPage, END_CRIT_SECTION, index_compute_xid_horizon_for_tuples(), InvalidTransactionId, xl_btree_delete::latestRemovedXid, MarkBufferDirty(), xl_btree_delete::nitems, PageGetSpecialPointer, PageIndexMultiDelete(), PageSetLSN, REGBUF_STANDARD, RelationNeedsWAL, SizeOfBtreeDelete, START_CRIT_SECTION, XLOG_BTREE_DELETE, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), XLogRegisterData(), and XLogStandbyInfoActive.

Referenced by _bt_vacuum_one_page().

1060 {
1061  Page page = BufferGetPage(buf);
1062  BTPageOpaque opaque;
1063  TransactionId latestRemovedXid = InvalidTransactionId;
1064 
1065  /* Shouldn't be called unless there's something to do */
1066  Assert(nitems > 0);
1067 
1068  if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
1069  latestRemovedXid =
1070  index_compute_xid_horizon_for_tuples(rel, heapRel, buf,
1071  itemnos, nitems);
1072 
1073  /* No ereport(ERROR) until changes are logged */
1074  START_CRIT_SECTION();
1075 
1076  /* Fix the page */
1077  PageIndexMultiDelete(page, itemnos, nitems);
1078 
1079  /*
1080  * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID,
1081  * because this is not called by VACUUM.
1082  */
1083 
1084  /*
1085  * Mark the page as not containing any LP_DEAD items. This is not
1086  * certainly true (there might be some that have recently been marked, but
1087  * weren't included in our target-item list), but it will almost always be
1088  * true and it doesn't seem worth an additional page scan to check it.
1089  * Remember that BTP_HAS_GARBAGE is only a hint anyway.
1090  */
1091  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1092  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1093 
1094  MarkBufferDirty(buf);
1095 
1096  /* XLOG stuff */
1097  if (RelationNeedsWAL(rel))
1098  {
1099  XLogRecPtr recptr;
1100  xl_btree_delete xlrec_delete;
1101 
1102  xlrec_delete.latestRemovedXid = latestRemovedXid;
1103  xlrec_delete.nitems = nitems;
1104 
1105  XLogBeginInsert();
1106  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1107  XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);
1108 
1109  /*
1110  * We need the target-offsets array whether or not we store the whole
1111  * buffer, to allow us to find the latestRemovedXid on a standby
1112  * server.
1113  */
1114  XLogRegisterData((char *) itemnos, nitems * sizeof(OffsetNumber));
1115 
1116  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
1117 
1118  PageSetLSN(page, recptr);
1119  }
1120 
1121  END_CRIT_SECTION();
1122 }
TransactionId latestRemovedXid
Definition: nbtxlog.h:128
uint32 TransactionId
Definition: c.h:507
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1458
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:213
#define END_CRIT_SECTION()
Definition: miscadmin.h:134
#define START_CRIT_SECTION()
Definition: miscadmin.h:132
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
uint16 OffsetNumber
Definition: off.h:24
static char * buf
Definition: pg_test_fsync.c:68
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define InvalidTransactionId
Definition: transam.h:31
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define XLOG_BTREE_DELETE
Definition: nbtxlog.h:32
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:323
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:415
#define XLogStandbyInfoActive()
Definition: xlog.h:195
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:732
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:835
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define RelationNeedsWAL(relation)
Definition: rel.h:518
#define SizeOfBtreeDelete
Definition: nbtxlog.h:134
void XLogBeginInsert(void)
Definition: xloginsert.c:120
uint16 btpo_flags
Definition: nbtree.h:64
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
#define BTP_HAS_GARBAGE
Definition: nbtree.h:77
Pointer Page
Definition: bufpage.h:78
TransactionId index_compute_xid_horizon_for_tuples(Relation irel, Relation hrel, Buffer ibuf, OffsetNumber *itemnos, int nitems)
Definition: genam.c:281

◆ _bt_delitems_vacuum()

void _bt_delitems_vacuum ( Relation  rel,
Buffer  buf,
OffsetNumber itemnos,
int  nitems,
BlockNumber  lastBlockVacuumed 
)

Definition at line 984 of file nbtpage.c.

References BTP_HAS_GARBAGE, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BufferGetPage, END_CRIT_SECTION, xl_btree_vacuum::lastBlockVacuumed, MarkBufferDirty(), PageGetSpecialPointer, PageIndexMultiDelete(), PageSetLSN, REGBUF_STANDARD, RelationNeedsWAL, SizeOfBtreeVacuum, START_CRIT_SECTION, XLOG_BTREE_VACUUM, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by btvacuumpage(), and btvacuumscan().

987 {
988  Page page = BufferGetPage(buf);
989  BTPageOpaque opaque;
990 
991  /* No ereport(ERROR) until changes are logged */
992  START_CRIT_SECTION();
993 
994  /* Fix the page */
995  if (nitems > 0)
996  PageIndexMultiDelete(page, itemnos, nitems);
997 
998  /*
999  * We can clear the vacuum cycle ID since this page has certainly been
1000  * processed by the current vacuum scan.
1001  */
1002  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1003  opaque->btpo_cycleid = 0;
1004 
1005  /*
1006  * Mark the page as not containing any LP_DEAD items. This is not
1007  * certainly true (there might be some that have recently been marked, but
1008  * weren't included in our target-item list), but it will almost always be
1009  * true and it doesn't seem worth an additional page scan to check it.
1010  * Remember that BTP_HAS_GARBAGE is only a hint anyway.
1011  */
1012  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1013 
1014  MarkBufferDirty(buf);
1015 
1016  /* XLOG stuff */
1017  if (RelationNeedsWAL(rel))
1018  {
1019  XLogRecPtr recptr;
1020  xl_btree_vacuum xlrec_vacuum;
1021 
1022  xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed;
1023 
1024  XLogBeginInsert();
1025  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1026  XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
1027 
1028  /*
1029  * The target-offsets array is not in the buffer, but pretend that it
1030  * is. When XLogInsert stores the whole buffer, the offsets array
1031  * need not be stored too.
1032  */
1033  if (nitems > 0)
1034  XLogRegisterBufData(0, (char *) itemnos, nitems * sizeof(OffsetNumber));
1035 
1036  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
1037 
1038  PageSetLSN(page, recptr);
1039  }
1040 
1041  END_CRIT_SECTION();
1042 }
BlockNumber lastBlockVacuumed
Definition: nbtxlog.h:173
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:361
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1458
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:213
#define END_CRIT_SECTION()
Definition: miscadmin.h:134
#define START_CRIT_SECTION()
Definition: miscadmin.h:132
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
uint16 OffsetNumber
Definition: off.h:24
#define SizeOfBtreeVacuum
Definition: nbtxlog.h:178
BTCycleId btpo_cycleid
Definition: nbtree.h:65
static char * buf
Definition: pg_test_fsync.c:68
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define XLOG_BTREE_VACUUM
Definition: nbtxlog.h:37
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:323
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:415
uint64 XLogRecPtr
Definition: xlogdefs.h:21
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:835
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define RelationNeedsWAL(relation)
Definition: rel.h:518
void XLogBeginInsert(void)
Definition: xloginsert.c:120
uint16 btpo_flags
Definition: nbtree.h:64
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
#define BTP_HAS_GARBAGE
Definition: nbtree.h:77
Pointer Page
Definition: bufpage.h:78

◆ _bt_doinsert()

bool _bt_doinsert ( Relation  rel,
IndexTuple  itup,
IndexUniqueCheck  checkUnique,
Relation  heapRel 
)

Definition at line 79 of file nbtinsert.c.

References _bt_check_unique(), _bt_checkpage(), _bt_compare(), _bt_findinsertloc(), _bt_freestack(), _bt_insertonpg(), _bt_mkscankey(), _bt_relbuf(), _bt_search(), BTScanInsertData::anynullkeys, Assert, BTInsertStateData::bounds_valid, BT_WRITE, buf, BTInsertStateData::buf, BufferGetPage, CheckForSerializableConflictIn(), ConditionalLockBuffer(), BTScanInsertData::heapkeyspace, IndexTupleSize, InvalidBlockNumber, InvalidBuffer, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, MAXALIGN, P_FIRSTDATAKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_ISLEAF, P_RIGHTMOST, PageGetFreeSpace(), PageGetMaxOffsetNumber, PageGetSpecialPointer, pfree(), ReadBuffer(), RelationGetTargetBlock, RelationSetTargetBlock, ReleaseBuffer(), BTScanInsertData::scantid, SpeculativeInsertionWait(), IndexTupleData::t_tid, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_NO, XactLockTableWait(), and XLTW_InsertIndex.

Referenced by btinsert().

81 {
82  bool is_unique = false;
83  BTInsertStateData insertstate;
84  BTScanInsert itup_key;
85  BTStack stack = NULL;
86  Buffer buf;
87  bool fastpath;
88  bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
89 
90  /* we need an insertion scan key to do our search, so build one */
91  itup_key = _bt_mkscankey(rel, itup);
92 
93  if (checkingunique)
94  {
95  if (!itup_key->anynullkeys)
96  {
97  /* No (heapkeyspace) scantid until uniqueness established */
98  itup_key->scantid = NULL;
99  }
100  else
101  {
102  /*
103  * Scan key for new tuple contains NULL key values. Bypass
104  * checkingunique steps. They are unnecessary because core code
105  * considers NULL unequal to every value, including NULL.
106  *
107  * This optimization avoids O(N^2) behavior within the
108  * _bt_findinsertloc() heapkeyspace path when a unique index has a
109  * large number of "duplicates" with NULL key values.
110  */
111  checkingunique = false;
112  /* Tuple is unique in the sense that core code cares about */
113  Assert(checkUnique != UNIQUE_CHECK_EXISTING);
114  is_unique = true;
115  }
116  }
117 
118  /*
119  * Fill in the BTInsertState working area, to track the current page and
120  * position within the page to insert on
121  */
122  insertstate.itup = itup;
123  /* PageAddItem will MAXALIGN(), but be consistent */
124  insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
125  insertstate.itup_key = itup_key;
126  insertstate.bounds_valid = false;
127  insertstate.buf = InvalidBuffer;
128 
129  /*
130  * It's very common to have an index on an auto-incremented or
131  * monotonically increasing value. In such cases, every insertion happens
132  * towards the end of the index. We try to optimize that case by caching
133  * the right-most leaf of the index. If our cached block is still the
134  * rightmost leaf, has enough free space to accommodate a new entry and
135  * the insertion key is strictly greater than the first key in this page,
136  * then we can safely conclude that the new key will be inserted in the
137  * cached block. So we simply search within the cached block and insert
138  * the key at the appropriate location. We call it a fastpath.
139  *
140  * Testing has revealed, though, that the fastpath can result in increased
141  * contention on the exclusive-lock on the rightmost leaf page. So we
142  * conditionally check if the lock is available. If it's not available
143  * then we simply abandon the fastpath and take the regular path. This
144  * makes sense because unavailability of the lock also signals that some
145  * other backend might be concurrently inserting into the page, thus
146  * reducing our chances to finding an insertion place in this page.
147  */
148 top:
149  fastpath = false;
150  if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
151  {
152  Page page;
153  BTPageOpaque lpageop;
154 
155  /*
156  * Conditionally acquire exclusive lock on the buffer before doing any
157  * checks. If we don't get the lock, we simply follow slowpath. If we
158  * do get the lock, this ensures that the index state cannot change,
159  * as far as the rightmost part of the index is concerned.
160  */
161  buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
162 
163  if (ConditionalLockBuffer(buf))
164  {
165  _bt_checkpage(rel, buf);
166 
167  page = BufferGetPage(buf);
168 
169  lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
170 
171  /*
172  * Check if the page is still the rightmost leaf page, has enough
173  * free space to accommodate the new tuple, and the insertion scan
174  * key is strictly greater than the first key on the page.
175  */
176  if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) &&
177  !P_IGNORE(lpageop) &&
178  (PageGetFreeSpace(page) > insertstate.itemsz) &&
179  PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) &&
180  _bt_compare(rel, itup_key, page, P_FIRSTDATAKEY(lpageop)) > 0)
181  {
182  /*
183  * The right-most block should never have an incomplete split.
184  * But be paranoid and check for it anyway.
185  */
186  Assert(!P_INCOMPLETE_SPLIT(lpageop));
187  fastpath = true;
188  }
189  else
190  {
191  _bt_relbuf(rel, buf);
192 
193  /*
194  * Something did not work out. Just forget about the cached
195  * block and follow the normal path. It might be set again if
196  * the conditions are favourable.
197  */
198  RelationSetTargetBlock(rel, InvalidBlockNumber);
199  }
200  }
201  else
202  {
203  ReleaseBuffer(buf);
204 
205  /*
206  * If someone's holding a lock, it's likely to change anyway, so
207  * don't try again until we get an updated rightmost leaf.
208  */
209  RelationSetTargetBlock(rel, InvalidBlockNumber);
210  }
211  }
212 
213  if (!fastpath)
214  {
215  /*
216  * Find the first page containing this key. Buffer returned by
217  * _bt_search() is locked in exclusive mode.
218  */
219  stack = _bt_search(rel, itup_key, &buf, BT_WRITE, NULL);
220  }
221 
222  insertstate.buf = buf;
223  buf = InvalidBuffer; /* insertstate.buf now owns the buffer */
224 
225  /*
226  * If we're not allowing duplicates, make sure the key isn't already in
227  * the index.
228  *
229  * NOTE: obviously, _bt_check_unique can only detect keys that are already
230  * in the index; so it cannot defend against concurrent insertions of the
231  * same key. We protect against that by means of holding a write lock on
232  * the first page the value could be on, with omitted/-inf value for the
233  * implicit heap TID tiebreaker attribute. Any other would-be inserter of
234  * the same key must acquire a write lock on the same page, so only one
235  * would-be inserter can be making the check at one time. Furthermore,
236  * once we are past the check we hold write locks continuously until we
237  * have performed our insertion, so no later inserter can fail to see our
238  * insertion. (This requires some care in _bt_findinsertloc.)
239  *
240  * If we must wait for another xact, we release the lock while waiting,
241  * and then must start over completely.
242  *
243  * For a partial uniqueness check, we don't wait for the other xact. Just
244  * let the tuple in and return false for possibly non-unique, or true for
245  * definitely unique.
246  */
247  if (checkingunique)
248  {
249  TransactionId xwait;
250  uint32 speculativeToken;
251 
252  xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
253  &is_unique, &speculativeToken);
254 
255  if (TransactionIdIsValid(xwait))
256  {
257  /* Have to wait for the other guy ... */
258  _bt_relbuf(rel, insertstate.buf);
259  insertstate.buf = InvalidBuffer;
260 
261  /*
262  * If it's a speculative insertion, wait for it to finish (ie. to
263  * go ahead with the insertion, or kill the tuple). Otherwise
264  * wait for the transaction to finish as usual.
265  */
266  if (speculativeToken)
267  SpeculativeInsertionWait(xwait, speculativeToken);
268  else
269  XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
270 
271  /* start over... */
272  if (stack)
273  _bt_freestack(stack);
274  goto top;
275  }
276 
277  /* Uniqueness is established -- restore heap tid as scantid */
278  if (itup_key->heapkeyspace)
279  itup_key->scantid = &itup->t_tid;
280  }
281 
282  if (checkUnique != UNIQUE_CHECK_EXISTING)
283  {
284  OffsetNumber newitemoff;
285 
286  /*
287  * The only conflict predicate locking cares about for indexes is when
288  * an index tuple insert conflicts with an existing lock. We don't
289  * know the actual page we're going to insert on for sure just yet in
290  * checkingunique and !heapkeyspace cases, but it's okay to use the
291  * first page the value could be on (with scantid omitted) instead.
292  */
293  CheckForSerializableConflictIn(rel, NULL, insertstate.buf);
294 
295  /*
296  * Do the insertion. Note that insertstate contains cached binary
297  * search bounds established within _bt_check_unique when insertion is
298  * checkingunique.
299  */
300  newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
301  stack, heapRel);
302  _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
303  itup, newitemoff, false);
304  }
305  else
306  {
307  /* just release the buffer */
308  _bt_relbuf(rel, insertstate.buf);
309  }
310 
311  /* be tidy */
312  if (stack)
313  _bt_freestack(stack);
314  pfree(itup_key);
315 
316  return is_unique;
317 }
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:163
static void _bt_insertonpg(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, bool split_only_page)
Definition: nbtinsert.c:932
#define P_IGNORE(opaque)
Definition: nbtree.h:194
bool bounds_valid
Definition: nbtree.h:507
uint32 TransactionId
Definition: c.h:507
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:85
ItemPointer scantid
Definition: nbtree.h:475
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
ItemPointerData t_tid
Definition: itup.h:37
static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, BTStack stack, Relation heapRel)
Definition: nbtinsert.c:683
#define InvalidBuffer
Definition: buf.h:25
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3353
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:196
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
#define RelationGetTargetBlock(relation)
Definition: rel.h:501
void CheckForSerializableConflictIn(Relation relation, HeapTuple tuple, Buffer buffer)
Definition: predicate.c:4442
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:581
uint16 OffsetNumber
Definition: off.h:24
void pfree(void *pointer)
Definition: mcxt.c:1031
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:690
void SpeculativeInsertionWait(TransactionId xid, uint32 token)
Definition: lmgr.c:781
static char * buf
Definition: pg_test_fsync.c:68
unsigned int uint32
Definition: c.h:358
IndexTuple itup
Definition: nbtree.h:495
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:554
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:3616
bool anynullkeys
Definition: nbtree.h:472
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:912
void XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, XLTW_Oper oper)
Definition: lmgr.c:624
BTScanInsert itup_key
Definition: nbtree.h:497
#define Assert(condition)
Definition: c.h:732
bool heapkeyspace
Definition: nbtree.h:471
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:596
#define RelationSetTargetBlock(relation, targblock)
Definition: rel.h:508
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define InvalidBlockNumber
Definition: block.h:33
#define MAXALIGN(LEN)
Definition: c.h:685
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition: nbtinsert.c:343
#define TransactionIdIsValid(xid)
Definition: transam.h:41
#define BT_WRITE
Definition: nbtree.h:403
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:92
Pointer Page
Definition: bufpage.h:78
#define IndexTupleSize(itup)
Definition: itup.h:71
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_end_vacuum()

void _bt_end_vacuum ( Relation  rel)

Definition at line 1950 of file nbtutils.c.

References LockRelId::dbId, i, LockInfoData::lockRelId, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTVacInfo::num_vacuums, RelationData::rd_lockInfo, LockRelId::relId, BTOneVacInfo::relid, and BTVacInfo::vacuums.

Referenced by _bt_end_vacuum_callback(), and btbulkdelete().

1951 {
1952  int i;
1953 
1954  LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
1955 
1956  /* Find the array entry */
1957  for (i = 0; i < btvacinfo->num_vacuums; i++)
1958  {
1959  BTOneVacInfo *vac = &btvacinfo->vacuums[i];
1960 
1961  if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
1962  vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
1963  {
1964  /* Remove it by shifting down the last entry */
1965  *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
1966  btvacinfo->num_vacuums--;
1967  break;
1968  }
1969  }
1970 
1971  LWLockRelease(BtreeVacuumLock);
1972 }
LockRelId lockRelId
Definition: rel.h:43
static BTVacInfo * btvacinfo
Definition: nbtutils.c:1846
LockRelId relid
Definition: nbtutils.c:1834
Oid dbId
Definition: rel.h:38
BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtutils.c:1843
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1726
LockInfoData rd_lockInfo
Definition: rel.h:86
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1122
int num_vacuums
Definition: nbtutils.c:1841
int i
Oid relId
Definition: rel.h:37

◆ _bt_end_vacuum_callback()

void _bt_end_vacuum_callback ( int  code,
Datum  arg 
)

Definition at line 1978 of file nbtutils.c.

References _bt_end_vacuum(), and DatumGetPointer.

Referenced by btbulkdelete().

1979 {
1980  _bt_end_vacuum((Relation) DatumGetPointer(arg));
1981 }
void _bt_end_vacuum(Relation rel)
Definition: nbtutils.c:1950
#define DatumGetPointer(X)
Definition: postgres.h:549
void * arg

◆ _bt_findsplitloc()

OffsetNumber _bt_findsplitloc ( Relation  rel,
Page  page,
OffsetNumber  newitemoff,
Size  newitemsz,
IndexTuple  newitem,
bool newitemonleft 
)

Definition at line 127 of file nbtsplitloc.c.

References _bt_afternewitemoff(), _bt_bestsplitloc(), _bt_deltasortsplits(), _bt_recsplitloc(), _bt_strategy(), Assert, BTREE_DEFAULT_FILLFACTOR, BTREE_NONLEAF_FILLFACTOR, BTREE_SINGLEVAL_FILLFACTOR, elog, ERROR, SplitPoint::firstoldonright, i, IndexRelationGetNumberOfKeyAttributes, FindSplitData::interval, FindSplitData::is_leaf, FindSplitData::is_rightmost, ItemIdGetLength, FindSplitData::leftspace, Max, MAX_INTERNAL_INTERVAL, MAX_LEAF_INTERVAL, MAXALIGN, FindSplitData::maxsplits, Min, FindSplitData::minfirstrightsz, FindSplitData::newitem, FindSplitData::newitemoff, SplitPoint::newitemonleft, FindSplitData::newitemsz, FindSplitData::nsplits, OffsetNumberNext, FindSplitData::olddataitemstotal, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_RIGHTMOST, FindSplitData::page, PageGetExactFreeSpace(), PageGetItemId, PageGetMaxOffsetNumber, PageGetPageSize, PageGetSpecialPointer, palloc(), pfree(), FindSplitData::rel, RelationGetFillFactor, RelationGetRelationName, FindSplitData::rightspace, SIZE_MAX, SizeOfPageHeaderData, SPLIT_DEFAULT, SPLIT_MANY_DUPLICATES, SPLIT_SINGLE_VALUE, and FindSplitData::splits.

Referenced by _bt_split().

133 {
134  BTPageOpaque opaque;
135  int leftspace,
136  rightspace,
137  olddataitemstotal,
138  olddataitemstoleft,
139  perfectpenalty,
140  leaffillfactor;
141  FindSplitData state;
142  FindSplitStrat strategy;
143  ItemId itemid;
144  OffsetNumber offnum,
145  maxoff,
146  foundfirstright;
147  double fillfactormult;
148  bool usemult;
149  SplitPoint leftpage,
150  rightpage;
151 
152  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
153  maxoff = PageGetMaxOffsetNumber(page);
154 
155  /* Total free space available on a btree page, after fixed overhead */
156  leftspace = rightspace =
157  PageGetPageSize(page) - SizeOfPageHeaderData -
158  MAXALIGN(sizeof(BTPageOpaqueData));
159 
160  /* The right page will have the same high key as the old page */
161  if (!P_RIGHTMOST(opaque))
162  {
163  itemid = PageGetItemId(page, P_HIKEY);
164  rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
165  sizeof(ItemIdData));
166  }
167 
168  /* Count up total space in data items before actually scanning 'em */
169  olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page);
170  leaffillfactor = RelationGetFillFactor(rel, BTREE_DEFAULT_FILLFACTOR);
171 
172  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
173  newitemsz += sizeof(ItemIdData);
174  state.rel = rel;
175  state.page = page;
176  state.newitem = newitem;
177  state.newitemsz = newitemsz;
178  state.is_leaf = P_ISLEAF(opaque);
179  state.is_rightmost = P_RIGHTMOST(opaque);
180  state.leftspace = leftspace;
181  state.rightspace = rightspace;
182  state.olddataitemstotal = olddataitemstotal;
183  state.minfirstrightsz = SIZE_MAX;
184  state.newitemoff = newitemoff;
185 
186  /*
187  * maxsplits should never exceed maxoff because there will be at most as
188  * many candidate split points as there are points _between_ tuples, once
189  * you imagine that the new item is already on the original page (the
190  * final number of splits may be slightly lower because not all points
191  * between tuples will be legal).
192  */
193  state.maxsplits = maxoff;
194  state.splits = palloc(sizeof(SplitPoint) * state.maxsplits);
195  state.nsplits = 0;
196 
197  /*
198  * Scan through the data items and calculate space usage for a split at
199  * each possible position
200  */
201  olddataitemstoleft = 0;
202 
203  for (offnum = P_FIRSTDATAKEY(opaque);
204  offnum <= maxoff;
205  offnum = OffsetNumberNext(offnum))
206  {
207  Size itemsz;
208 
209  itemid = PageGetItemId(page, offnum);
210  itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
211 
212  /*
213  * When item offset number is not newitemoff, neither side of the
214  * split can be newitem. Record a split after the previous data item
215  * from original page, but before the current data item from original
216  * page. (_bt_recsplitloc() will reject the split when there are no
217  * previous items, which we rely on.)
218  */
219  if (offnum < newitemoff)
220  _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
221  else if (offnum > newitemoff)
222  _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
223  else
224  {
225  /*
226  * Record a split after all "offnum < newitemoff" original page
227  * data items, but before newitem
228  */
229  _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
230 
231  /*
232  * Record a split after newitem, but before data item from
233  * original page at offset newitemoff/current offset
234  */
235  _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
236  }
237 
238  olddataitemstoleft += itemsz;
239  }
240 
241  /*
242  * Record a split after all original page data items, but before newitem.
243  * (Though only when it's possible that newitem will end up alone on new
244  * right page.)
245  */
246  Assert(olddataitemstoleft == olddataitemstotal);
247  if (newitemoff > maxoff)
248  _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0);
249 
250  /*
251  * I believe it is not possible to fail to find a feasible split, but just
252  * in case ...
253  */
254  if (state.nsplits == 0)
255  elog(ERROR, "could not find a feasible split point for index \"%s\"",
256  RelationGetRelationName(rel));
257 
258  /*
259  * Start search for a split point among list of legal split points. Give
260  * primary consideration to equalizing available free space in each half
261  * of the split initially (start with default strategy), while applying
262  * rightmost and split-after-new-item optimizations where appropriate.
263  * Either of the two other fallback strategies may be required for cases
264  * with a large number of duplicates around the original/space-optimal
265  * split point.
266  *
267  * Default strategy gives some weight to suffix truncation in deciding a
268  * split point on leaf pages. It attempts to select a split point where a
269  * distinguishing attribute appears earlier in the new high key for the
270  * left side of the split, in order to maximize the number of trailing
271  * attributes that can be truncated away. Only candidate split points
272  * that imply an acceptable balance of free space on each side are
273  * considered.
274  */
275  if (!state.is_leaf)
276  {
277  /* fillfactormult only used on rightmost page */
278  usemult = state.is_rightmost;
279  fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0;
280  }
281  else if (state.is_rightmost)
282  {
283  /* Rightmost leaf page -- fillfactormult always used */
284  usemult = true;
285  fillfactormult = leaffillfactor / 100.0;
286  }
287  else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
288  {
289  /*
290  * New item inserted at rightmost point among a localized grouping on
291  * a leaf page -- apply "split after new item" optimization, either by
292  * applying leaf fillfactor multiplier, or by choosing the exact split
293  * point that leaves the new item as last on the left. (usemult is set
294  * for us.)
295  */
296  if (usemult)
297  {
298  /* fillfactormult should be set based on leaf fillfactor */
299  fillfactormult = leaffillfactor / 100.0;
300  }
301  else
302  {
303  /* find precise split point after newitemoff */
304  for (int i = 0; i < state.nsplits; i++)
305  {
306  SplitPoint *split = state.splits + i;
307 
308  if (split->newitemonleft &&
309  newitemoff == split->firstoldonright)
310  {
311  pfree(state.splits);
312  *newitemonleft = true;
313  return newitemoff;
314  }
315  }
316 
317  /*
318  * Cannot legally split after newitemoff; proceed with split
319  * without using fillfactor multiplier. This is defensive, and
320  * should never be needed in practice.
321  */
322  fillfactormult = 0.50;
323  }
324  }
325  else
326  {
327  /* Other leaf page. 50:50 page split. */
328  usemult = false;
329  /* fillfactormult not used, but be tidy */
330  fillfactormult = 0.50;
331  }
332 
333  /*
334  * Set an initial limit on the split interval/number of candidate split
335  * points as appropriate. The "Prefix B-Trees" paper refers to this as
336  * sigma l for leaf splits and sigma b for internal ("branch") splits.
337  * It's hard to provide a theoretical justification for the initial size
338  * of the split interval, though it's clear that a small split interval
339  * makes suffix truncation much more effective without noticeably
340  * affecting space utilization over time.
341  */
342  state.interval = Min(Max(1, state.nsplits * 0.05),
343  state.is_leaf ? MAX_LEAF_INTERVAL :
344  MAX_INTERNAL_INTERVAL);
345 
346  /*
347  * Save leftmost and rightmost splits for page before original ordinal
348  * sort order is lost by delta/fillfactormult sort
349  */
350  leftpage = state.splits[0];
351  rightpage = state.splits[state.nsplits - 1];
352 
353  /* Give split points a fillfactormult-wise delta, and sort on deltas */
354  _bt_deltasortsplits(&state, fillfactormult, usemult);
355 
356  /*
357  * Determine if default strategy/split interval will produce a
358  * sufficiently distinguishing split, or if we should change strategies.
359  * Alternative strategies change the range of split points that are
360  * considered acceptable (split interval), and possibly change
361  * fillfactormult, in order to deal with pages with a large number of
362  * duplicates gracefully.
363  *
364  * Pass low and high splits for the entire page (actually, they're for an
365  * imaginary version of the page that includes newitem). These are used
366  * when the initial split interval encloses split points that are full of
367  * duplicates, and we need to consider if it's even possible to avoid
368  * appending a heap TID.
369  */
370  perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);
371 
372  if (strategy == SPLIT_DEFAULT)
373  {
374  /*
375  * Default strategy worked out (always works out with internal page).
376  * Original split interval still stands.
377  */
378  }
379 
380  /*
381  * Many duplicates strategy is used when a heap TID would otherwise be
382  * appended, but the page isn't completely full of logical duplicates.
383  *
384  * The split interval is widened to include all legal candidate split
385  * points. There might be a few as two distinct values in the whole-page
386  * split interval, though it's also possible that most of the values on
387  * the page are unique. The final split point will either be to the
388  * immediate left or to the immediate right of the group of duplicate
389  * tuples that enclose the first/delta-optimal split point (perfect
390  * penalty was set so that the lowest delta split point that avoids
391  * appending a heap TID will be chosen). Maximizing the number of
392  * attributes that can be truncated away is not a goal of the many
393  * duplicates strategy.
394  *
395  * Single value strategy is used when it is impossible to avoid appending
396  * a heap TID. It arranges to leave the left page very full. This
397  * maximizes space utilization in cases where tuples with the same
398  * attribute values span many pages. Newly inserted duplicates will tend
399  * to have higher heap TID values, so we'll end up splitting to the right
400  * consistently. (Single value strategy is harmless though not
401  * particularly useful with !heapkeyspace indexes.)
402  */
403  else if (strategy == SPLIT_MANY_DUPLICATES)
404  {
405  Assert(state.is_leaf);
406  /* Shouldn't try to truncate away extra user attributes */
407  Assert(perfectpenalty ==
408  IndexRelationGetNumberOfKeyAttributes(state.rel));
409  /* No need to resort splits -- no change in fillfactormult/deltas */
410  state.interval = state.nsplits;
411  }
412  else if (strategy == SPLIT_SINGLE_VALUE)
413  {
414  Assert(state.is_leaf);
415  /* Split near the end of the page */
416  usemult = true;
417  fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
418  /* Resort split points with new delta */
419  _bt_deltasortsplits(&state, fillfactormult, usemult);
420  /* Appending a heap TID is unavoidable, so interval of 1 is fine */
421  state.interval = 1;
422  }
423 
424  /*
425  * Search among acceptable split points (using final split interval) for
426  * the entry that has the lowest penalty, and is therefore expected to
427  * maximize fan-out. Sets *newitemonleft for us.
428  */
429  foundfirstright = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
430  strategy);
431  pfree(state.splits);
432 
433  return foundfirstright;
434 }
OffsetNumber firstoldonright
Definition: nbtsplitloc.c:40
bool is_rightmost
Definition: nbtsplitloc.c:53
Size minfirstrightsz
Definition: nbtsplitloc.c:58
#define MAX_LEAF_INTERVAL
Definition: nbtsplitloc.c:21
IndexTuple newitem
Definition: nbtsplitloc.c:50
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
#define Min(x, y)
Definition: c.h:904
bool newitemonleft
Definition: nbtsplitloc.c:41
#define SizeOfPageHeaderData
Definition: bufpage.h:216
static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage, SplitPoint *rightpage, FindSplitStrat *strategy)
Definition: nbtsplitloc.c:839
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
SplitPoint * splits
Definition: nbtsplitloc.c:63
uint16 OffsetNumber
Definition: off.h:24
void pfree(void *pointer)
Definition: mcxt.c:1031
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
#define BTREE_NONLEAF_FILLFACTOR
Definition: nbtree.h:170
#define ERROR
Definition: elog.h:43
static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft, FindSplitStrat strategy)
Definition: nbtsplitloc.c:764
#define PageGetPageSize(page)
Definition: bufpage.h:268
static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, int leaffillfactor, bool *usemult)
Definition: nbtsplitloc.c:607
#define RelationGetRelationName(relation)
Definition: rel.h:450
struct ItemIdData ItemIdData
Relation rel
Definition: nbtsplitloc.c:48
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:435
#define BTREE_DEFAULT_FILLFACTOR
Definition: nbtree.h:169
#define BTREE_SINGLEVAL_FILLFACTOR
Definition: nbtree.h:171
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
#define RelationGetFillFactor(relation, defaultff)
Definition: rel.h:292
FindSplitStrat
Definition: nbtsplitloc.c:24
#define SIZE_MAX
Definition: c.h:452
#define Max(x, y)
Definition: c.h:898
#define MAX_INTERNAL_INTERVAL
Definition: nbtsplitloc.c:22
int olddataitemstotal
Definition: nbtsplitloc.c:57
#define Assert(condition)
Definition: c.h:732
Definition: regguts.h:298
OffsetNumber newitemoff
Definition: nbtsplitloc.c:54
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
size_t Size
Definition: c.h:466
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define MAXALIGN(LEN)
Definition: c.h:685
static void _bt_recsplitloc(FindSplitData *state, OffsetNumber firstoldonright, bool newitemonleft, int olddataitemstoleft, Size firstoldonrightsz)
Definition: nbtsplitloc.c:453
Size PageGetExactFreeSpace(Page page)
Definition: bufpage.c:632
#define P_HIKEY
Definition: nbtree.h:217
void * palloc(Size size)
Definition: mcxt.c:924
#define elog(elevel,...)
Definition: elog.h:226
int i
static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult, bool usemult)
Definition: nbtsplitloc.c:538
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
#define P_ISLEAF(opaque)
Definition: nbtree.h:189

◆ _bt_finish_split()

void _bt_finish_split ( Relation  rel,
Buffer  bbuf,
BTStack  stack 
)

Definition at line 1856 of file nbtinsert.c.

References _bt_getbuf(), _bt_insert_parent(), _bt_relbuf(), Assert, BT_WRITE, BTMetaPageData::btm_root, BTPageGetMeta, BTPageOpaqueData::btpo_next, BTREE_METAPAGE, BufferGetBlockNumber(), BufferGetPage, DEBUG1, elog, P_INCOMPLETE_SPLIT, P_LEFTMOST, P_RIGHTMOST, and PageGetSpecialPointer.

Referenced by _bt_getstackbuf(), _bt_moveright(), and _bt_stepright().

1857 {
1858  Page lpage = BufferGetPage(lbuf);
1859  BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
1860  Buffer rbuf;
1861  Page rpage;
1862  BTPageOpaque rpageop;
1863  bool was_root;
1864  bool was_only;
1865 
1866  Assert(P_INCOMPLETE_SPLIT(lpageop));
1867 
1868  /* Lock right sibling, the one missing the downlink */
1869  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
1870  rpage = BufferGetPage(rbuf);
1871  rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
1872 
1873  /* Could this be a root split? */
1874  if (!stack)
1875  {
1876  Buffer metabuf;
1877  Page metapg;
1878  BTMetaPageData *metad;
1879 
1880  /* acquire lock on the metapage */
1881  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1882  metapg = BufferGetPage(metabuf);
1883  metad = BTPageGetMeta(metapg);
1884 
1885  was_root = (metad->btm_root == BufferGetBlockNumber(lbuf));
1886 
1887  _bt_relbuf(rel, metabuf);
1888  }
1889  else
1890  was_root = false;
1891 
1892  /* Was this the only page on the level before split? */
1893  was_only = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
1894 
1895  elog(DEBUG1, "finishing incomplete split of %u/%u",
1896  BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
1897 
1898  _bt_insert_parent(rel, lbuf, rbuf, stack, was_root, was_only);
1899 }
BlockNumber btpo_next
Definition: nbtree.h:58
#define DEBUG1
Definition: elog.h:25
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:757
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only)
Definition: nbtinsert.c:1744
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:196
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
#define BTPageGetMeta(p)
Definition: nbtree.h:112
#define P_LEFTMOST(opaque)
Definition: nbtree.h:187
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define BTREE_METAPAGE
Definition: nbtree.h:131
BlockNumber btm_root
Definition: nbtree.h:101
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:912
#define Assert(condition)
Definition: c.h:732
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2613
#define elog(elevel,...)
Definition: elog.h:226
#define BT_WRITE
Definition: nbtree.h:403
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
Pointer Page
Definition: bufpage.h:78

◆ _bt_first()

bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 741 of file nbtsearch.c.

References _bt_binsrch(), _bt_drop_lock_and_maybe_pin(), _bt_endpoint(), _bt_freestack(), _bt_heapkeyspace(), _bt_initialize_more_data(), _bt_parallel_done(), _bt_parallel_readpage(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readpage(), _bt_search(), _bt_steppage(), Assert, BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosInvalidate, BTScanPosIsValid, buf, BTScanPosData::buf, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferIsValid, cur, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, DatumGetPointer, elog, ERROR, get_opfamily_proc(), i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, InvalidBlockNumber, InvalidOid, InvalidStrategy, BTScanPosData::itemIndex, BTScanPosData::items, BTScanOpaqueData::keyData, LockBuffer(), BTScanOpaqueData::numberOfKeys, OffsetNumberPrev, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockPage(), PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, ScanKeyData::sk_subtype, status(), IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

742 {
743  Relation rel = scan->indexRelation;
744  BTScanOpaque so = (BTScanOpaque) scan->opaque;
745  Buffer buf;
746  BTStack stack;
747  OffsetNumber offnum;
748  StrategyNumber strat;
749  bool nextkey;
750  bool goback;
751  BTScanInsertData inskey;
752  ScanKey startKeys[INDEX_MAX_KEYS];
753  ScanKeyData notnullkeys[INDEX_MAX_KEYS];
754  int keysCount = 0;
755  int i;
756  bool status = true;
757  StrategyNumber strat_total;
758  BTScanPosItem *currItem;
759  BlockNumber blkno;
760 
 761  Assert(!BTScanPosIsValid(so->currPos));
 762 
 763  pgstat_count_index_scan(rel);
 764 
765  /*
766  * Examine the scan keys and eliminate any redundant keys; also mark the
767  * keys that must be matched to continue the scan.
768  */
769  _bt_preprocess_keys(scan);
770 
771  /*
772  * Quit now if _bt_preprocess_keys() discovered that the scan keys can
773  * never be satisfied (eg, x == 1 AND x > 2).
774  */
775  if (!so->qual_ok)
776  return false;
777 
778  /*
779  * For parallel scans, get the starting page from shared state. If the
780  * scan has not started, proceed to find out first leaf page in the usual
781  * way while keeping other participating processes waiting. If the scan
782  * has already begun, use the page number from the shared structure.
783  */
784  if (scan->parallel_scan != NULL)
785  {
786  status = _bt_parallel_seize(scan, &blkno);
787  if (!status)
788  return false;
789  else if (blkno == P_NONE)
790  {
791  _bt_parallel_done(scan);
792  return false;
793  }
794  else if (blkno != InvalidBlockNumber)
795  {
796  if (!_bt_parallel_readpage(scan, blkno, dir))
797  return false;
798  goto readcomplete;
799  }
800  }
801 
802  /*----------
803  * Examine the scan keys to discover where we need to start the scan.
804  *
805  * We want to identify the keys that can be used as starting boundaries;
806  * these are =, >, or >= keys for a forward scan or =, <, <= keys for
807  * a backwards scan. We can use keys for multiple attributes so long as
808  * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
809  * a > or < boundary or find an attribute with no boundary (which can be
810  * thought of as the same as "> -infinity"), we can't use keys for any
811  * attributes to its right, because it would break our simplistic notion
812  * of what initial positioning strategy to use.
813  *
814  * When the scan keys include cross-type operators, _bt_preprocess_keys
815  * may not be able to eliminate redundant keys; in such cases we will
816  * arbitrarily pick a usable one for each attribute. This is correct
817  * but possibly not optimal behavior. (For example, with keys like
818  * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
819  * x=5 would be more efficient.) Since the situation only arises given
820  * a poorly-worded query plus an incomplete opfamily, live with it.
821  *
822  * When both equality and inequality keys appear for a single attribute
823  * (again, only possible when cross-type operators appear), we *must*
824  * select one of the equality keys for the starting point, because
825  * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
826  * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
827  * start at x=4, we will fail and stop before reaching x=10. If multiple
828  * equality quals survive preprocessing, however, it doesn't matter which
829  * one we use --- by definition, they are either redundant or
830  * contradictory.
831  *
832  * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
833  * If the index stores nulls at the end of the index we'll be starting
834  * from, and we have no boundary key for the column (which means the key
835  * we deduced NOT NULL from is an inequality key that constrains the other
836  * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
837  * use as a boundary key. If we didn't do this, we might find ourselves
838  * traversing a lot of null entries at the start of the scan.
839  *
840  * In this loop, row-comparison keys are treated the same as keys on their
841  * first (leftmost) columns. We'll add on lower-order columns of the row
842  * comparison below, if possible.
843  *
844  * The selected scan keys (at most one per index column) are remembered by
845  * storing their addresses into the local startKeys[] array.
846  *----------
847  */
848  strat_total = BTEqualStrategyNumber;
849  if (so->numberOfKeys > 0)
850  {
851  AttrNumber curattr;
852  ScanKey chosen;
853  ScanKey impliesNN;
854  ScanKey cur;
855 
856  /*
857  * chosen is the so-far-chosen key for the current attribute, if any.
858  * We don't cast the decision in stone until we reach keys for the
859  * next attribute.
860  */
861  curattr = 1;
862  chosen = NULL;
863  /* Also remember any scankey that implies a NOT NULL constraint */
864  impliesNN = NULL;
865 
866  /*
867  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
868  * pass to handle after-last-key processing. Actual exit from the
869  * loop is at one of the "break" statements below.
870  */
871  for (cur = so->keyData, i = 0;; cur++, i++)
872  {
873  if (i >= so->numberOfKeys || cur->sk_attno != curattr)
874  {
875  /*
876  * Done looking at keys for curattr. If we didn't find a
877  * usable boundary key, see if we can deduce a NOT NULL key.
878  */
 879  if (chosen == NULL && impliesNN != NULL &&
 880  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
 881  ScanDirectionIsForward(dir) :
 882  ScanDirectionIsBackward(dir)))
 883  {
884  /* Yes, so build the key in notnullkeys[keysCount] */
885  chosen = &notnullkeys[keysCount];
 886  ScanKeyEntryInitialize(chosen,
 887  (SK_SEARCHNOTNULL | SK_ISNULL |
 888  (impliesNN->sk_flags &
 889  (SK_BT_DESC | SK_BT_NULLS_FIRST))),
 890  curattr,
 891  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
 892  BTGreaterStrategyNumber :
 893  BTLessStrategyNumber),
 894  InvalidOid,
 895  InvalidOid,
 896  InvalidOid,
 897  (Datum) 0);
898  }
899 
900  /*
901  * If we still didn't find a usable boundary key, quit; else
902  * save the boundary key pointer in startKeys.
903  */
904  if (chosen == NULL)
905  break;
906  startKeys[keysCount++] = chosen;
907 
908  /*
909  * Adjust strat_total, and quit if we have stored a > or <
910  * key.
911  */
912  strat = chosen->sk_strategy;
913  if (strat != BTEqualStrategyNumber)
914  {
915  strat_total = strat;
916  if (strat == BTGreaterStrategyNumber ||
917  strat == BTLessStrategyNumber)
918  break;
919  }
920 
921  /*
922  * Done if that was the last attribute, or if next key is not
923  * in sequence (implying no boundary key is available for the
924  * next attribute).
925  */
926  if (i >= so->numberOfKeys ||
927  cur->sk_attno != curattr + 1)
928  break;
929 
930  /*
931  * Reset for next attr.
932  */
933  curattr = cur->sk_attno;
934  chosen = NULL;
935  impliesNN = NULL;
936  }
937 
938  /*
939  * Can we use this key as a starting boundary for this attr?
940  *
941  * If not, does it imply a NOT NULL constraint? (Because
942  * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
943  * *any* inequality key works for that; we need not test.)
944  */
945  switch (cur->sk_strategy)
 946  {
 947  case BTLessStrategyNumber:
 948  case BTLessEqualStrategyNumber:
 949  if (chosen == NULL)
950  {
951  if (ScanDirectionIsBackward(dir))
952  chosen = cur;
953  else
954  impliesNN = cur;
955  }
 956  break;
 957  case BTEqualStrategyNumber:
 958  /* override any non-equality choice */
959  chosen = cur;
 960  break;
 961  case BTGreaterEqualStrategyNumber:
 962  case BTGreaterStrategyNumber:
 963  if (chosen == NULL)
964  {
965  if (ScanDirectionIsForward(dir))
966  chosen = cur;
967  else
968  impliesNN = cur;
969  }
970  break;
971  }
972  }
973  }
974 
975  /*
976  * If we found no usable boundary keys, we have to start from one end of
977  * the tree. Walk down that edge to the first or last key, and scan from
978  * there.
979  */
980  if (keysCount == 0)
981  {
982  bool match;
983 
984  match = _bt_endpoint(scan, dir);
985 
986  if (!match)
987  {
988  /* No match, so mark (parallel) scan finished */
989  _bt_parallel_done(scan);
990  }
991 
992  return match;
993  }
994 
995  /*
996  * We want to start the scan somewhere within the index. Set up an
997  * insertion scankey we can use to search for the boundary point we
998  * identified above. The insertion scankey is built using the keys
999  * identified by startKeys[]. (Remaining insertion scankey fields are
1000  * initialized after initial-positioning strategy is finalized.)
1001  */
1002  Assert(keysCount <= INDEX_MAX_KEYS);
1003  for (i = 0; i < keysCount; i++)
1004  {
1005  ScanKey cur = startKeys[i];
1006 
1007  Assert(cur->sk_attno == i + 1);
1008 
1009  if (cur->sk_flags & SK_ROW_HEADER)
1010  {
1011  /*
1012  * Row comparison header: look to the first row member instead.
1013  *
1014  * The member scankeys are already in insertion format (ie, they
1015  * have sk_func = 3-way-comparison function), but we have to watch
1016  * out for nulls, which _bt_preprocess_keys didn't check. A null
1017  * in the first row member makes the condition unmatchable, just
1018  * like qual_ok = false.
1019  */
1020  ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1021 
1022  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1023  if (subkey->sk_flags & SK_ISNULL)
1024  {
1025  _bt_parallel_done(scan);
1026  return false;
1027  }
1028  memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1029 
1030  /*
1031  * If the row comparison is the last positioning key we accepted,
1032  * try to add additional keys from the lower-order row members.
1033  * (If we accepted independent conditions on additional index
1034  * columns, we use those instead --- doesn't seem worth trying to
1035  * determine which is more restrictive.) Note that this is OK
1036  * even if the row comparison is of ">" or "<" type, because the
1037  * condition applied to all but the last row member is effectively
1038  * ">=" or "<=", and so the extra keys don't break the positioning
1039  * scheme. But, by the same token, if we aren't able to use all
1040  * the row members, then the part of the row comparison that we
1041  * did use has to be treated as just a ">=" or "<=" condition, and
1042  * so we'd better adjust strat_total accordingly.
1043  */
1044  if (i == keysCount - 1)
1045  {
1046  bool used_all_subkeys = false;
1047 
1048  Assert(!(subkey->sk_flags & SK_ROW_END));
1049  for (;;)
1050  {
1051  subkey++;
1052  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1053  if (subkey->sk_attno != keysCount + 1)
1054  break; /* out-of-sequence, can't use it */
1055  if (subkey->sk_strategy != cur->sk_strategy)
1056  break; /* wrong direction, can't use it */
1057  if (subkey->sk_flags & SK_ISNULL)
1058  break; /* can't use null keys */
1059  Assert(keysCount < INDEX_MAX_KEYS);
1060  memcpy(inskey.scankeys + keysCount, subkey,
1061  sizeof(ScanKeyData));
1062  keysCount++;
1063  if (subkey->sk_flags & SK_ROW_END)
1064  {
1065  used_all_subkeys = true;
1066  break;
1067  }
1068  }
1069  if (!used_all_subkeys)
1070  {
1071  switch (strat_total)
1072  {
1073  case BTLessStrategyNumber:
1074  strat_total = BTLessEqualStrategyNumber;
1075  break;
1077  strat_total = BTGreaterEqualStrategyNumber;
1078  break;
1079  }
1080  }
1081  break; /* done with outer loop */
1082  }
1083  }
1084  else
1085  {
1086  /*
1087  * Ordinary comparison key. Transform the search-style scan key
1088  * to an insertion scan key by replacing the sk_func with the
1089  * appropriate btree comparison function.
1090  *
1091  * If scankey operator is not a cross-type comparison, we can use
1092  * the cached comparison function; otherwise gotta look it up in
1093  * the catalogs. (That can't lead to infinite recursion, since no
1094  * indexscan initiated by syscache lookup will use cross-data-type
1095  * operators.)
1096  *
1097  * We support the convention that sk_subtype == InvalidOid means
1098  * the opclass input type; this is a hack to simplify life for
1099  * ScanKeyInit().
1100  */
1101  if (cur->sk_subtype == rel->rd_opcintype[i] ||
1102  cur->sk_subtype == InvalidOid)
1103  {
1104  FmgrInfo *procinfo;
1105 
1106  procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1107  ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1108  cur->sk_flags,
1109  cur->sk_attno,
1110  InvalidStrategy,
1111  cur->sk_subtype,
1112  cur->sk_collation,
1113  procinfo,
1114  cur->sk_argument);
1115  }
1116  else
1117  {
1118  RegProcedure cmp_proc;
1119 
1120  cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1121  rel->rd_opcintype[i],
1122  cur->sk_subtype,
1123  BTORDER_PROC);
1124  if (!RegProcedureIsValid(cmp_proc))
1125  elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1126  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1127  cur->sk_attno, RelationGetRelationName(rel));
1128  ScanKeyEntryInitialize(inskey.scankeys + i,
1129  cur->sk_flags,
1130  cur->sk_attno,
1131  InvalidStrategy,
1132  cur->sk_subtype,
1133  cur->sk_collation,
1134  cmp_proc,
1135  cur->sk_argument);
1136  }
1137  }
1138  }
1139 
1140  /*----------
1141  * Examine the selected initial-positioning strategy to determine exactly
1142  * where we need to start the scan, and set flag variables to control the
1143  * code below.
1144  *
1145  * If nextkey = false, _bt_search and _bt_binsrch will locate the first
1146  * item >= scan key. If nextkey = true, they will locate the first
1147  * item > scan key.
1148  *
1149  * If goback = true, we will then step back one item, while if
1150  * goback = false, we will start the scan on the located item.
1151  *----------
1152  */
1153  switch (strat_total)
1154  {
1155  case BTLessStrategyNumber:
1156 
1157  /*
1158  * Find first item >= scankey, then back up one to arrive at last
1159  * item < scankey. (Note: this positioning strategy is only used
1160  * for a backward scan, so that is always the correct starting
1161  * position.)
1162  */
1163  nextkey = false;
1164  goback = true;
1165  break;
1166 
1167  case BTLessEqualStrategyNumber:
1168 
1169  /*
1170  * Find first item > scankey, then back up one to arrive at last
1171  * item <= scankey. (Note: this positioning strategy is only used
1172  * for a backward scan, so that is always the correct starting
1173  * position.)
1174  */
1175  nextkey = true;
1176  goback = true;
1177  break;
1178 
1179  case BTEqualStrategyNumber:
1180 
1181  /*
1182  * If a backward scan was specified, need to start with last equal
1183  * item not first one.
1184  */
1185  if (ScanDirectionIsBackward(dir))
1186  {
1187  /*
1188  * This is the same as the <= strategy. We will check at the
1189  * end whether the found item is actually =.
1190  */
1191  nextkey = true;
1192  goback = true;
1193  }
1194  else
1195  {
1196  /*
1197  * This is the same as the >= strategy. We will check at the
1198  * end whether the found item is actually =.
1199  */
1200  nextkey = false;
1201  goback = false;
1202  }
1203  break;
1204 
1205  case BTGreaterEqualStrategyNumber:
1206 
1207  /*
1208  * Find first item >= scankey. (This is only used for forward
1209  * scans.)
1210  */
1211  nextkey = false;
1212  goback = false;
1213  break;
1214 
1215  case BTGreaterStrategyNumber:
1216 
1217  /*
1218  * Find first item > scankey. (This is only used for forward
1219  * scans.)
1220  */
1221  nextkey = true;
1222  goback = false;
1223  break;
1224 
1225  default:
1226  /* can't get here, but keep compiler quiet */
1227  elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1228  return false;
1229  }
1230 
1231  /* Initialize remaining insertion scan key fields */
1232  inskey.heapkeyspace = _bt_heapkeyspace(rel);
1233  inskey.anynullkeys = false; /* unused */
1234  inskey.nextkey = nextkey;
1235  inskey.pivotsearch = false;
1236  inskey.scantid = NULL;
1237  inskey.keysz = keysCount;
1238 
1239  /*
1240  * Use the manufactured insertion scan key to descend the tree and
1241  * position ourselves on the target leaf page.
1242  */
1243  stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);
1244 
1245  /* don't need to keep the stack around... */
1246  _bt_freestack(stack);
1247 
1248  if (!BufferIsValid(buf))
1249  {
1250  /*
1251  * We only get here if the index is completely empty. Lock relation
1252  * because nothing finer to lock exists.
1253  */
1254  PredicateLockRelation(rel, scan->xs_snapshot);
1255 
1256  /*
1257  * mark parallel scan as done, so that all the workers can finish
1258  * their scan
1259  */
1260  _bt_parallel_done(scan);
1261  BTScanPosInvalidate(so->currPos);
1262 
1263  return false;
1264  }
1265  else
1266  PredicateLockPage(rel, BufferGetBlockNumber(buf),
1267  scan->xs_snapshot);
1268 
1269  _bt_initialize_more_data(so, dir);
1270 
1271  /* position to the precise item on the page */
1272  offnum = _bt_binsrch(rel, &inskey, buf);
1273 
1274  /*
1275  * If nextkey = false, we are positioned at the first item >= scan key, or
1276  * possibly at the end of a page on which all the existing items are less
1277  * than the scan key and we know that everything on later pages is greater
1278  * than or equal to scan key.
1279  *
1280  * If nextkey = true, we are positioned at the first item > scan key, or
1281  * possibly at the end of a page on which all the existing items are less
1282  * than or equal to the scan key and we know that everything on later
1283  * pages is greater than scan key.
1284  *
1285  * The actually desired starting point is either this item or the prior
1286  * one, or in the end-of-page case it's the first item on the next page or
1287  * the last item on this page. Adjust the starting offset if needed. (If
1288  * this results in an offset before the first item or after the last one,
1289  * _bt_readpage will report no items found, and then we'll step to the
1290  * next page as needed.)
1291  */
1292  if (goback)
1293  offnum = OffsetNumberPrev(offnum);
1294 
1295  /* remember which buffer we have pinned, if any */
1296  Assert(!BTScanPosIsValid(so->currPos));
1297  so->currPos.buf = buf;
1298 
1299  /*
1300  * Now load data from the first page of the scan.
1301  */
1302  if (!_bt_readpage(scan, dir, offnum))
1303  {
1304  /*
1305  * There's no actually-matching data on this page. Try to advance to
1306  * the next page. Return false if there's no matching data at all.
1307  */
1308  LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1309  if (!_bt_steppage(scan, dir))
1310  return false;
1311  }
1312  else
1313  {
1314  /* Drop the lock, and maybe the pin, on the current page */
1315  _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1316  }
1317 
1318 readcomplete:
1319  /* OK, itemIndex says what to return */
1320  currItem = &so->currPos.items[so->currPos.itemIndex];
1321  scan->xs_heaptid = currItem->heapTid;
1322  if (scan->xs_want_itup)
1323  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1324 
1325  return true;
1326 }
#define InvalidStrategy
Definition: stratnum.h:24
Oid sk_subtype
Definition: skey.h:69
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:86
Definition: fmgr.h:56
#define SK_ROW_MEMBER
Definition: skey.h:118
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:163
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2526
#define BTORDER_PROC
Definition: nbtree.h:392
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:794
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
IndexTuple xs_itup
Definition: relscan.h:127
BTScanPosItem items[MaxIndexTuplesPerPage]
Definition: nbtree.h:581
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:722
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:1910
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:57
int itemIndex
Definition: nbtree.h:579
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2503
char * currTuples
Definition: nbtree.h:653
regproc RegProcedure
Definition: c.h:505
#define P_NONE
Definition: nbtree.h:181
struct cursor * cur
Definition: ecpg.c:28
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:603
struct SnapshotData * xs_snapshot
Definition: relscan.h:104
uint32 BlockNumber
Definition: block.h:31
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:339
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
Relation indexRelation
Definition: relscan.h:103
uint16 OffsetNumber
Definition: off.h:24
#define SK_ROW_END
Definition: skey.h:119
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
#define BT_READ
Definition: nbtree.h:402
#define ERROR
Definition: elog.h:43
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:670
StrategyNumber sk_strategy
Definition: skey.h:68
ItemPointerData xs_heaptid
Definition: relscan.h:132
static char * buf
Definition: pg_test_fsync.c:68
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
ScanKeyData * ScanKey
Definition: skey.h:75
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2143
#define RegProcedureIsValid(p)
Definition: c.h:640
IndexTupleData * IndexTuple
Definition: itup.h:53
#define RelationGetRelationName(relation)
Definition: rel.h:450
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:1373
#define SK_SEARCHNOTNULL
Definition: skey.h:122
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
Definition: nbtree.c:641
Oid * rd_opfamily
Definition: rel.h:158
#define SK_ISNULL
Definition: skey.h:115
#define SK_ROW_HEADER
Definition: skey.h:117
void _bt_preprocess_keys(IndexScanDesc scan)
Definition: nbtutils.c:744
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:681
static void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
Definition: nbtsearch.c:2233
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:151
uintptr_t Datum
Definition: postgres.h:367
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3590
bool _bt_heapkeyspace(Relation rel)
Definition: nbtpage.c:636
#define InvalidOid
Definition: postgres_ext.h:36
int sk_flags
Definition: skey.h:66
#define Assert(condition)
Definition: c.h:732
#define SK_BT_DESC
Definition: nbtree.h:680
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:744
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:609
#define INDEX_MAX_KEYS
#define InvalidBlockNumber
Definition: block.h:33
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
int numberOfKeys
Definition: nbtree.h:632
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
#define DatumGetPointer(X)
Definition: postgres.h:549
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2613
BTScanPosData currPos
Definition: nbtree.h:666
ScanKey keyData
Definition: nbtree.h:633
Oid sk_collation
Definition: skey.h:70
#define elog(elevel,...)
Definition: elog.h:226
int i
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1625
Buffer buf
Definition: nbtree.h:549
Oid * rd_opcintype
Definition: rel.h:159
static void static void status(const char *fmt,...) pg_attribute_printf(1
Definition: pg_regress.c:227
#define BTLessStrategyNumber
Definition: stratnum.h:29
int Buffer
Definition: buf.h:23
Datum sk_argument
Definition: skey.h:72
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Definition: nbtsearch.c:1398
int16 AttrNumber
Definition: attnum.h:21
BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:92
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
AttrNumber sk_attno
Definition: skey.h:67

◆ _bt_freestack()

void _bt_freestack ( BTStack  stack)

Definition at line 163 of file nbtutils.c.

References BTStackData::bts_parent, and pfree().

Referenced by _bt_doinsert(), _bt_first(), and bt_rootdescend().

164 {
165  BTStack ostack;
166 
167  while (stack != NULL)
168  {
169  ostack = stack;
170  stack = stack->bts_parent;
171  pfree(ostack);
172  }
173 }
void pfree(void *pointer)
Definition: mcxt.c:1031
struct BTStackData * bts_parent
Definition: nbtree.h:419

◆ _bt_get_endpoint()

Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost,
Snapshot  snapshot 
)

Definition at line 2059 of file nbtsearch.c.

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_next, BTreeInnerTupleGetDownLink, buf, BufferGetPage, BufferIsValid, elog, ereport, errcode(), errmsg_internal(), ERROR, InvalidBuffer, BTPageOpaqueData::level, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, RelationGetRelationName, and TestForOldSnapshot().

Referenced by _bt_endpoint(), and _bt_insert_parent().

2061 {
2062  Buffer buf;
2063  Page page;
2064  BTPageOpaque opaque;
2065  OffsetNumber offnum;
2066  BlockNumber blkno;
2067  IndexTuple itup;
2068 
2069  /*
2070  * If we are looking for a leaf page, okay to descend from fast root;
2071  * otherwise better descend from true root. (There is no point in being
2072  * smarter about intermediate levels.)
2073  */
2074  if (level == 0)
2075  buf = _bt_getroot(rel, BT_READ);
2076  else
2077  buf = _bt_gettrueroot(rel);
2078 
2079  if (!BufferIsValid(buf))
2080  return InvalidBuffer;
2081 
2082  page = BufferGetPage(buf);
2083  TestForOldSnapshot(snapshot, rel, page);
2084  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2085 
2086  for (;;)
2087  {
2088  /*
2089  * If we landed on a deleted page, step right to find a live page
2090  * (there must be one). Also, if we want the rightmost page, step
2091  * right if needed to get to it (this could happen if the page split
2092  * since we obtained a pointer to it).
2093  */
2094  while (P_IGNORE(opaque) ||
2095  (rightmost && !P_RIGHTMOST(opaque)))
2096  {
2097  blkno = opaque->btpo_next;
2098  if (blkno == P_NONE)
2099  elog(ERROR, "fell off the end of index \"%s\"",
2100  RelationGetRelationName(rel));
2101  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2102  page = BufferGetPage(buf);
2103  TestForOldSnapshot(snapshot, rel, page);
2104  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2105  }
2106 
2107  /* Done? */
2108  if (opaque->btpo.level == level)
2109  break;
2110  if (opaque->btpo.level < level)
2111  ereport(ERROR,
2112  (errcode(ERRCODE_INDEX_CORRUPTED),
2113  errmsg_internal("btree level %u not found in index \"%s\"",
2114  level, RelationGetRelationName(rel))));
2115 
2116  /* Descend to leftmost or rightmost child page */
2117  if (rightmost)
2118  offnum = PageGetMaxOffsetNumber(page);
2119  else
2120  offnum = P_FIRSTDATAKEY(opaque);
2121 
2122  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2123  blkno = BTreeInnerTupleGetDownLink(itup);
2124 
2125  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2126  page = BufferGetPage(buf);
2127  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2128  }
2129 
2130  return buf;
2131 }
BlockNumber btpo_next
Definition: nbtree.h:58
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:264
#define P_IGNORE(opaque)
Definition: nbtree.h:194
#define BTreeInnerTupleGetDownLink(itup)
Definition: nbtree.h:301
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:893
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
union BTPageOpaqueData::@46 btpo
#define P_NONE
Definition: nbtree.h:181
#define InvalidBuffer
Definition: buf.h:25
int errcode(int sqlerrcode)
Definition: elog.c:570
uint32 BlockNumber
Definition: block.h:31
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
uint16 OffsetNumber
Definition: off.h:24
#define BT_READ
Definition: nbtree.h:402
#define ERROR
Definition: elog.h:43
static char * buf
Definition: pg_test_fsync.c:68
IndexTupleData * IndexTuple
Definition: itup.h:53
#define RelationGetRelationName(relation)
Definition: rel.h:450
Buffer _bt_getroot(Relation rel, int access)
Definition: nbtpage.c:255
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define ereport(elevel, rest)
Definition: elog.h:141
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
uint32 level
Definition: nbtree.h:61
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:488
int errmsg_internal(const char *fmt,...)
Definition: elog.c:814
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
#define elog(elevel,...)
Definition: elog.h:226
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78

◆ _bt_getbuf()

/*
 *	_bt_getbuf() -- Get a buffer by block number for read or write.
 *
 *		blkno == P_NEW means to get an unallocated index page.  The page
 *		will be initialized before returning it.
 *
 *		When this routine returns, the appropriate lock is set on the
 *		requested buffer and its reference count has been incremented
 *		(ie, the buffer is "locked and pinned").  Also, we apply
 *		_bt_checkpage to sanity-check the page (except in P_NEW case).
 *
 *		NOTE(review): lines lost in extraction (the opaque declaration and
 *		the extension-lock calls) were reconstructed from the References
 *		list; confirm against nbtpage.c.
 */
Buffer
_bt_getbuf(Relation rel, BlockNumber blkno, int access)
{
	Buffer		buf;

	if (blkno != P_NEW)
	{
		/* Read an existing block of the relation */
		buf = ReadBuffer(rel, blkno);
		LockBuffer(buf, access);
		_bt_checkpage(rel, buf);
	}
	else
	{
		bool		needLock;
		Page		page;

		Assert(access == BT_WRITE);

		/*
		 * First see if the FSM knows of any free pages.
		 *
		 * We can't trust the FSM's report unreservedly; we have to check that
		 * the page is still free.  (For example, an already-free page could
		 * have been re-used between the time the last VACUUM scanned it and
		 * the time the VACUUM made its FSM updates.)
		 *
		 * In fact, it's worse than that: we can't even assume that it's safe
		 * to take a lock on the reported page.  If somebody else has a lock
		 * on it, or even worse our own caller does, we could deadlock.  (The
		 * own-caller scenario is actually not improbable. Consider an index
		 * on a serial or timestamp column.  Nearly all splits will be at the
		 * rightmost page, so it's entirely likely that _bt_split will call us
		 * while holding a lock on the page most recently acquired from FSM. A
		 * VACUUM running concurrently with the previous split could well have
		 * placed that page back in FSM.)
		 *
		 * To get around that, we ask for only a conditional lock on the
		 * reported page.  If we fail, then someone else is using the page,
		 * and we may reasonably assume it's not free.  (If we happen to be
		 * wrong, the worst consequence is the page will be lost to use till
		 * the next VACUUM, which is no big problem.)
		 */
		for (;;)
		{
			blkno = GetFreeIndexPage(rel);
			if (blkno == InvalidBlockNumber)
				break;
			buf = ReadBuffer(rel, blkno);
			if (ConditionalLockBuffer(buf))
			{
				page = BufferGetPage(buf);
				if (_bt_page_recyclable(page))
				{
					/*
					 * If we are generating WAL for Hot Standby then create a
					 * WAL record that will allow us to conflict with queries
					 * running on standby, in case they have snapshots older
					 * than btpo.xact.  This can only apply if the page does
					 * have a valid btpo.xact value, ie not if it's new.  (We
					 * must check that because an all-zero page has no special
					 * space.)
					 */
					if (XLogStandbyInfoActive() && RelationNeedsWAL(rel) &&
						!PageIsNew(page))
					{
						BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

						_bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
					}

					/* Okay to use page.  Re-initialize and return it */
					_bt_pageinit(page, BufferGetPageSize(buf));
					return buf;
				}
				elog(DEBUG2, "FSM returned nonrecyclable page");
				_bt_relbuf(rel, buf);
			}
			else
			{
				elog(DEBUG2, "FSM returned nonlockable page");
				/* couldn't get lock, so just drop pin */
				ReleaseBuffer(buf);
			}
		}

		/*
		 * Extend the relation by one page.
		 *
		 * We have to use a lock to ensure no one else is extending the rel at
		 * the same time, else we will both try to initialize the same new
		 * page.  We can skip locking for new or temp relations, however,
		 * since no one else could be accessing them.
		 */
		needLock = !RELATION_IS_LOCAL(rel);

		if (needLock)
			LockRelationForExtension(rel, ExclusiveLock);

		buf = ReadBuffer(rel, P_NEW);

		/* Acquire buffer lock on new page */
		LockBuffer(buf, BT_WRITE);

		/*
		 * Release the file-extension lock; it's now OK for someone else to
		 * extend the relation some more.  Note that we cannot release this
		 * lock before we have buffer lock on the new page, or we risk a race
		 * condition against btvacuumscan --- see comments therein.
		 */
		if (needLock)
			UnlockRelationForExtension(rel, ExclusiveLock);

		/* Initialize the new page before returning it */
		page = BufferGetPage(buf);
		Assert(PageIsNew(page));
		_bt_pageinit(page, BufferGetPageSize(buf));
	}

	/* ref count and lock type are correct */
	return buf;
}
#define ExclusiveLock
Definition: lockdefs.h:44
#define RELATION_IS_LOCAL(relation)
Definition: rel.h:536
union BTPageOpaqueData::@46 btpo
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3353
#define P_NEW
Definition: bufmgr.h:81
bool _bt_page_recyclable(Page page)
Definition: nbtpage.c:939
TransactionId xact
Definition: nbtree.h:62
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:690
#define DEBUG2
Definition: elog.h:24
static char * buf
Definition: pg_test_fsync.c:68
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:3616
static void _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
Definition: nbtpage.c:724
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:402
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:452
#define BufferGetPageSize(buffer)
Definition: bufmgr.h:146
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3590
#define XLogStandbyInfoActive()
Definition: xlog.h:195
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:912
BlockNumber GetFreeIndexPage(Relation rel)
Definition: indexfsm.c:38
#define Assert(condition)
Definition: c.h:732
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:596
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define InvalidBlockNumber
Definition: block.h:33
#define RelationNeedsWAL(relation)
Definition: rel.h:518
#define PageIsNew(page)
Definition: bufpage.h:229
#define elog(elevel,...)
Definition: elog.h:226
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:924
#define BT_WRITE
Definition: nbtree.h:403
int Buffer
Definition: buf.h:23
Pointer Page
Definition: bufpage.h:78

◆ _bt_getroot()

/*
 *	_bt_getroot() -- Get the root page of the btree.
 *
 *		Since the root page can move around the btree file, we have to read
 *		its location from the metadata page, and then read the root page
 *		itself.  If no root page exists yet, we have to create one.
 *
 *		The access type parameter (BT_READ or BT_WRITE) controls whether a
 *		new root page will be created or not.  If access = BT_READ and no
 *		root page exists, we just return InvalidBuffer.  For BT_WRITE, we
 *		try to create the root page if it doesn't exist.  NOTE that the
 *		returned root page will have only a read lock set on it even if
 *		access = BT_WRITE!
 *
 *		The returned page is not necessarily the true root --- it could be
 *		a "fast root" (a page that is alone in its level due to deletions).
 *		We do, however, guarantee to return a live (not deleted or half-dead)
 *		page.
 *
 *		On successful return, the root page is pinned and read-locked.
 *		The metadata page is not locked or pinned on exit.
 *
 *		NOTE(review): several lines lost in extraction (critical-section
 *		calls, XLogRegisterBuffer calls, metadata field assignments) were
 *		reconstructed from the References list; confirm against nbtpage.c.
 */
Buffer
_bt_getroot(Relation rel, int access)
{
	Buffer		metabuf;
	Buffer		rootbuf;
	Page		rootpage;
	BTPageOpaque rootopaque;
	BlockNumber rootblkno;
	uint32		rootlevel;
	BTMetaPageData *metad;

	/*
	 * Try to use previously-cached metapage data to find the root.  This
	 * normally saves one buffer access per index search, which is a very
	 * helpful savings in bufmgr traffic and hence contention.
	 */
	if (rel->rd_amcache != NULL)
	{
		metad = (BTMetaPageData *) rel->rd_amcache;
		/* We shouldn't have cached it if any of these fail */
		Assert(metad->btm_magic == BTREE_MAGIC);
		Assert(metad->btm_version >= BTREE_MIN_VERSION);
		Assert(metad->btm_version <= BTREE_VERSION);
		Assert(metad->btm_root != P_NONE);

		rootblkno = metad->btm_fastroot;
		Assert(rootblkno != P_NONE);
		rootlevel = metad->btm_fastlevel;

		rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
		rootpage = BufferGetPage(rootbuf);
		rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

		/*
		 * Since the cache might be stale, we check the page more carefully
		 * here than normal.  We *must* check that it's not deleted. If it's
		 * not alone on its level, then we reject too --- this may be overly
		 * paranoid but better safe than sorry.  Note we don't check P_ISROOT,
		 * because that's not set in a "fast root".
		 */
		if (!P_IGNORE(rootopaque) &&
			rootopaque->btpo.level == rootlevel &&
			P_LEFTMOST(rootopaque) &&
			P_RIGHTMOST(rootopaque))
		{
			/* OK, accept cached page as the root */
			return rootbuf;
		}
		_bt_relbuf(rel, rootbuf);
		/* Cache is stale, throw it away */
		if (rel->rd_amcache)
			pfree(rel->rd_amcache);
		rel->rd_amcache = NULL;
	}

	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
	metad = _bt_getmeta(rel, metabuf);

	/* if no root page initialized yet, do it */
	if (metad->btm_root == P_NONE)
	{
		Page		metapg;

		/* If access = BT_READ, caller doesn't want us to create root yet */
		if (access == BT_READ)
		{
			_bt_relbuf(rel, metabuf);
			return InvalidBuffer;
		}

		/* trade in our read lock for a write lock */
		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
		LockBuffer(metabuf, BT_WRITE);

		/*
		 * Race condition:	if someone else initialized the metadata between
		 * the time we released the read lock and acquired the write lock, we
		 * must avoid doing it again.
		 */
		if (metad->btm_root != P_NONE)
		{
			/*
			 * Metadata initialized by someone else.  In order to guarantee no
			 * deadlocks, we have to release the metadata page and start all
			 * over again.  (Is that really true? But it's hardly worth trying
			 * to optimize this case.)
			 */
			_bt_relbuf(rel, metabuf);
			return _bt_getroot(rel, access);
		}

		/*
		 * Get, initialize, write, and leave a lock of the appropriate type on
		 * the new root page.  Since this is the first page in the tree, it's
		 * a leaf as well as the root.
		 */
		rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
		rootblkno = BufferGetBlockNumber(rootbuf);
		rootpage = BufferGetPage(rootbuf);
		rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
		rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
		rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
		rootopaque->btpo.level = 0;
		rootopaque->btpo_cycleid = 0;
		/* Get raw page pointer for metapage */
		metapg = BufferGetPage(metabuf);

		/* NO ELOG(ERROR) till meta is updated */
		START_CRIT_SECTION();

		/* upgrade metapage if needed */
		if (metad->btm_version < BTREE_NOVAC_VERSION)
			_bt_upgrademetapage(metapg);

		metad->btm_root = rootblkno;
		metad->btm_level = 0;
		metad->btm_fastroot = rootblkno;
		metad->btm_fastlevel = 0;
		metad->btm_oldest_btpo_xact = InvalidTransactionId;
		metad->btm_last_cleanup_num_heap_tuples = -1.0;

		MarkBufferDirty(rootbuf);
		MarkBufferDirty(metabuf);

		/* XLOG stuff */
		if (RelationNeedsWAL(rel))
		{
			xl_btree_newroot xlrec;
			XLogRecPtr	recptr;
			xl_btree_metadata md;

			XLogBeginInsert();
			XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
			XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

			Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
			md.version = metad->btm_version;
			md.root = rootblkno;
			md.level = 0;
			md.fastroot = rootblkno;
			md.fastlevel = 0;
			md.oldest_btpo_xact = InvalidTransactionId;
			md.last_cleanup_num_heap_tuples = -1.0;

			XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));

			xlrec.rootblk = rootblkno;
			xlrec.level = 0;

			XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);

			recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);

			PageSetLSN(rootpage, recptr);
			PageSetLSN(metapg, recptr);
		}

		END_CRIT_SECTION();

		/*
		 * swap root write lock for read lock.  There is no danger of anyone
		 * else accessing the new root page while it's unlocked, since no one
		 * else knows where it is yet.
		 */
		LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
		LockBuffer(rootbuf, BT_READ);

		/* okay, metadata is correct, release lock on it without caching */
		_bt_relbuf(rel, metabuf);
	}
	else
	{
		rootblkno = metad->btm_fastroot;
		Assert(rootblkno != P_NONE);
		rootlevel = metad->btm_fastlevel;

		/*
		 * Cache the metapage data for next time
		 */
		rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
											 sizeof(BTMetaPageData));
		memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));

		/*
		 * We are done with the metapage; arrange to release it via first
		 * _bt_relandgetbuf call
		 */
		rootbuf = metabuf;

		for (;;)
		{
			rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
			rootpage = BufferGetPage(rootbuf);
			rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

			if (!P_IGNORE(rootopaque))
				break;

			/* it's dead, Jim.  step right one page */
			if (P_RIGHTMOST(rootopaque))
				elog(ERROR, "no live root page found in index \"%s\"",
					 RelationGetRelationName(rel));
			rootblkno = rootopaque->btpo_next;
		}

		/* Note: can't check btpo.level on deleted pages */
		if (rootopaque->btpo.level != rootlevel)
			elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
				 rootblkno, RelationGetRelationName(rel),
				 rootopaque->btpo.level, rootlevel);
	}

	/*
	 * By here, we have a pin and read lock on the root page, and no lock set
	 * on the metadata page.  Return the root page's buffer.
	 */
	return rootbuf;
}
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:361
BlockNumber rootblk
Definition: nbtxlog.h:246
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:86
#define BTP_ROOT
Definition: nbtree.h:72
BlockNumber btpo_next
Definition: nbtree.h:58
static BTMetaPageData * _bt_getmeta(Relation rel, Buffer metabuf)
Definition: nbtpage.c:120
#define P_IGNORE(opaque)
Definition: nbtree.h:194
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:88
uint32 btm_version
Definition: nbtree.h:100
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:893
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:757
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1458
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:213
#define BTREE_VERSION
Definition: nbtree.h:133
uint32 btm_magic
Definition: nbtree.h:99
#define BTP_LEAF
Definition: nbtree.h:71
union BTPageOpaqueData::@46 btpo
#define END_CRIT_SECTION()
Definition: miscadmin.h:134
BlockNumber root
Definition: nbtxlog.h:50
#define P_NONE
Definition: nbtree.h:181
#define InvalidBuffer
Definition: buf.h:25
#define REGBUF_WILL_INIT
Definition: xloginsert.h:33
#define START_CRIT_SECTION()
Definition: miscadmin.h:132
uint32 level
Definition: nbtxlog.h:247
uint32 BlockNumber
Definition: block.h:31
#define P_NEW
Definition: bufmgr.h:81
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
#define BT_READ
Definition: nbtree.h:402
BlockNumber btm_fastroot
Definition: nbtree.h:103
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:35
void pfree(void *pointer)
Definition: mcxt.c:1031
#define BTREE_MAGIC
Definition: nbtree.h:132
#define ERROR
Definition: elog.h:43
float8 last_cleanup_num_heap_tuples
Definition: nbtxlog.h:55
BTCycleId btpo_cycleid
Definition: nbtree.h:65
TransactionId oldest_btpo_xact
Definition: nbtxlog.h:54
BlockNumber btpo_prev
Definition: nbtree.h:57
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define InvalidTransactionId
Definition: transam.h:31
#define RelationGetRelationName(relation)
Definition: rel.h:450
#define P_LEFTMOST(opaque)
Definition: nbtree.h:187
unsigned int uint32
Definition: c.h:358
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:135
Buffer _bt_getroot(Relation rel, int access)
Definition: nbtpage.c:255
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define BTREE_METAPAGE
Definition: nbtree.h:131
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:250
uint32 version
Definition: nbtxlog.h:49
uint32 btm_fastlevel
Definition: nbtree.h:104
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:323
uint32 level
Definition: nbtree.h:61
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:415
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3590
BlockNumber btm_root
Definition: nbtree.h:101
#define BTREE_MIN_VERSION
Definition: nbtree.h:134
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:912
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:732
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:108
#define RelationNeedsWAL(relation)
Definition: rel.h:518
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2613
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:771
uint32 fastlevel
Definition: nbtxlog.h:53
uint32 btm_level
Definition: nbtree.h:102
#define elog(elevel,...)
Definition: elog.h:226
MemoryContext rd_indexcxt
Definition: rel.h:155
uint32 level
Definition: nbtxlog.h:51
BlockNumber fastroot
Definition: nbtxlog.h:52
TransactionId btm_oldest_btpo_xact
Definition: nbtree.h:106
void * rd_amcache
Definition: rel.h:179
#define BT_WRITE
Definition: nbtree.h:403
void XLogBeginInsert(void)
Definition: xloginsert.c:120
uint16 btpo_flags
Definition: nbtree.h:64
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
Pointer Page
Definition: bufpage.h:78

◆ _bt_getrootheight()

/*
 *	_bt_getrootheight() -- Get the height of the btree search tree.
 *
 *		We return the level (counting from zero) of the current fast root.
 *		This represents the number of tree levels we'd have to descend
 *		through to start any btree index scan.
 *
 *		This is used by the planner for cost-estimation purposes.  Since it's
 *		only an estimate, slightly-stale data is fine, hence we don't worry
 *		about updating previously cached data.
 *
 *		NOTE(review): the MemoryContextAlloc call line was lost in extraction
 *		and has been reconstructed from the References list; confirm against
 *		nbtpage.c.
 */
int
_bt_getrootheight(Relation rel)
{
	BTMetaPageData *metad;

	if (rel->rd_amcache == NULL)
	{
		Buffer		metabuf;

		metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
		metad = _bt_getmeta(rel, metabuf);

		/*
		 * If there's no root page yet, _bt_getroot() doesn't expect a cache
		 * to be made, so just stop here and report the index height is zero.
		 * (XXX perhaps _bt_getroot() should be changed to allow this case.)
		 */
		if (metad->btm_root == P_NONE)
		{
			_bt_relbuf(rel, metabuf);
			return 0;
		}

		/*
		 * Cache the metapage data for next time
		 */
		rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
											 sizeof(BTMetaPageData));
		memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
		_bt_relbuf(rel, metabuf);
	}

	/* Get cached page */
	metad = (BTMetaPageData *) rel->rd_amcache;
	/* We shouldn't have cached it if any of these fail */
	Assert(metad->btm_magic == BTREE_MAGIC);
	Assert(metad->btm_version >= BTREE_MIN_VERSION);
	Assert(metad->btm_version <= BTREE_VERSION);
	Assert(metad->btm_fastroot != P_NONE);

	return metad->btm_fastlevel;
}
static BTMetaPageData * _bt_getmeta(Relation rel, Buffer metabuf)
Definition: nbtpage.c:120
uint32 btm_version
Definition: nbtree.h:100
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:757
#define BTREE_VERSION
Definition: nbtree.h:133
uint32 btm_magic
Definition: nbtree.h:99
#define P_NONE
Definition: nbtree.h:181
#define BT_READ
Definition: nbtree.h:402
BlockNumber btm_fastroot
Definition: nbtree.h:103
#define BTREE_MAGIC
Definition: nbtree.h:132
#define BTREE_METAPAGE
Definition: nbtree.h:131
uint32 btm_fastlevel
Definition: nbtree.h:104
BlockNumber btm_root
Definition: nbtree.h:101
#define BTREE_MIN_VERSION
Definition: nbtree.h:134
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:912
#define Assert(condition)
Definition: c.h:732
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:771
MemoryContext rd_indexcxt
Definition: rel.h:155
void * rd_amcache
Definition: rel.h:179
int Buffer
Definition: buf.h:23

◆ _bt_getstackbuf()

/*
 *	_bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
 *						 tuple whose downlink points to child page.
 *
 *		Caller passes child's block number, which is used to identify the
 *		associated pivot tuple in the parent page using a linear search that
 *		matches on the pivot's downlink/block number.  The expected location
 *		of the pivot tuple is taken from the stack one level above the child
 *		page, and used as a starting point for the search; the scan goes
 *		right first, then left, and follows right-links if the parent has
 *		split since the stack was built (see nbtree/README on why lock
 *		coupling is unnecessary here).
 *
 *		Returns write-locked buffer containing the matching pivot, with
 *		stack->bts_blkno / bts_offset updated to its current location, or
 *		InvalidBuffer if the downlink cannot be found (rightmost reached).
 */
Buffer
_bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child)
{
	BlockNumber blkno;
	OffsetNumber start;

	blkno = stack->bts_blkno;
	start = stack->bts_offset;

	for (;;)
	{
		Buffer		buf;
		Page		page;
		BTPageOpaque opaque;

		buf = _bt_getbuf(rel, blkno, BT_WRITE);
		page = BufferGetPage(buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		/* Finish an incomplete split before searching this page */
		if (P_INCOMPLETE_SPLIT(opaque))
		{
			_bt_finish_split(rel, buf, stack->bts_parent);
			continue;
		}

		if (!P_IGNORE(opaque))
		{
			OffsetNumber offnum,
						minoff,
						maxoff;
			ItemId		itemid;
			IndexTuple	item;

			minoff = P_FIRSTDATAKEY(opaque);
			maxoff = PageGetMaxOffsetNumber(page);

			/*
			 * start = InvalidOffsetNumber means "search the whole page".  We
			 * need this test anyway due to possibility that page has a high
			 * key now when it didn't before.
			 */
			if (start < minoff)
				start = minoff;

			/*
			 * Need this check too, to guard against possibility that page
			 * split since we visited it originally.
			 */
			if (start > maxoff)
				start = OffsetNumberNext(maxoff);

			/*
			 * These loops will check every item on the page --- but in an
			 * order that's attuned to the probability of where it actually
			 * is.  Scan to the right first, then to the left.
			 */
			for (offnum = start;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				itemid = PageGetItemId(page, offnum);
				item = (IndexTuple) PageGetItem(page, itemid);

				if (BTreeInnerTupleGetDownLink(item) == child)
				{
					/* Return accurate pointer to where link is now */
					stack->bts_blkno = blkno;
					stack->bts_offset = offnum;
					return buf;
				}
			}

			for (offnum = OffsetNumberPrev(start);
				 offnum >= minoff;
				 offnum = OffsetNumberPrev(offnum))
			{
				itemid = PageGetItemId(page, offnum);
				item = (IndexTuple) PageGetItem(page, itemid);

				if (BTreeInnerTupleGetDownLink(item) == child)
				{
					/* Return accurate pointer to where link is now */
					stack->bts_blkno = blkno;
					stack->bts_offset = offnum;
					return buf;
				}
			}
		}

		/*
		 * The item we're looking for moved right at least one page.
		 *
		 * Lehman and Yao couple/chain locks when moving right here, which we
		 * can avoid.  See nbtree/README.
		 */
		if (P_RIGHTMOST(opaque))
		{
			_bt_relbuf(rel, buf);
			return InvalidBuffer;
		}
		blkno = opaque->btpo_next;
		start = InvalidOffsetNumber;
		_bt_relbuf(rel, buf);
	}
}
BlockNumber btpo_next
Definition: nbtree.h:58
#define P_IGNORE(opaque)
Definition: nbtree.h:194
#define BTreeInnerTupleGetDownLink(itup)
Definition: nbtree.h:301
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:757
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:219
#define InvalidBuffer
Definition: buf.h:25
uint32 BlockNumber
Definition: block.h:31
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:196
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
uint16 OffsetNumber
Definition: off.h:24
void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:1856
OffsetNumber bts_offset
Definition: nbtree.h:418
static char * buf
Definition: pg_test_fsync.c:68
IndexTupleData * IndexTuple
Definition: itup.h:53
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
#define InvalidOffsetNumber
Definition: off.h:26
BlockNumber bts_blkno
Definition: nbtree.h:417
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:912
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
struct BTStackData * bts_parent
Definition: nbtree.h:419
#define BT_WRITE
Definition: nbtree.h:403
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78

◆ _bt_gettrueroot()

/*
 *	_bt_gettrueroot() -- Get the true root page of the btree.
 *
 *		This is the same as the BT_READ case of _bt_getroot(), except
 *		we follow the true-root link not the fast-root link.  Returns
 *		a pinned, read-locked buffer for the true root, or InvalidBuffer
 *		if no root page has been created yet.
 *
 *		NOTE(review): the tail arguments of the version-mismatch ereport
 *		and the elog continuation line were lost in extraction and have
 *		been reconstructed; confirm against nbtpage.c.
 */
Buffer
_bt_gettrueroot(Relation rel)
{
	Buffer		metabuf;
	Page		metapg;
	BTPageOpaque metaopaque;
	Buffer		rootbuf;
	Page		rootpage;
	BTPageOpaque rootopaque;
	BlockNumber rootblkno;
	uint32		rootlevel;
	BTMetaPageData *metad;

	/*
	 * We don't try to use cached metapage data here, since (a) this path is
	 * not performance-critical, and (b) if we are here it suggests our cache
	 * is out-of-date anyway.  In light of point (b), it's probably safest to
	 * actively flush any cached metapage info.
	 */
	if (rel->rd_amcache)
		pfree(rel->rd_amcache);
	rel->rd_amcache = NULL;

	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
	metapg = BufferGetPage(metabuf);
	metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
	metad = BTPageGetMeta(metapg);

	if (!P_ISMETA(metaopaque) ||
		metad->btm_magic != BTREE_MAGIC)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" is not a btree",
						RelationGetRelationName(rel))));

	if (metad->btm_version < BTREE_MIN_VERSION ||
		metad->btm_version > BTREE_VERSION)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("version mismatch in index \"%s\": file version %d, "
						"current version %d, minimal supported version %d",
						RelationGetRelationName(rel),
						metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));

	/* if no root page initialized yet, fail */
	if (metad->btm_root == P_NONE)
	{
		_bt_relbuf(rel, metabuf);
		return InvalidBuffer;
	}

	rootblkno = metad->btm_root;
	rootlevel = metad->btm_level;

	/*
	 * We are done with the metapage; arrange to release it via first
	 * _bt_relandgetbuf call
	 */
	rootbuf = metabuf;

	for (;;)
	{
		rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
		rootpage = BufferGetPage(rootbuf);
		rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

		if (!P_IGNORE(rootopaque))
			break;

		/* it's dead, Jim.  step right one page */
		if (P_RIGHTMOST(rootopaque))
			elog(ERROR, "no live root page found in index \"%s\"",
				 RelationGetRelationName(rel));
		rootblkno = rootopaque->btpo_next;
	}

	/* Note: can't check btpo.level on deleted pages */
	if (rootopaque->btpo.level != rootlevel)
		elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
			 rootblkno, RelationGetRelationName(rel),
			 rootopaque->btpo.level, rootlevel);

	return rootbuf;
}
BlockNumber btpo_next
Definition: nbtree.h:58
#define P_IGNORE(opaque)
Definition: nbtree.h:194
uint32 btm_version
Definition: nbtree.h:100
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:893
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:757
#define BTREE_VERSION
Definition: nbtree.h:133
uint32 btm_magic
Definition: nbtree.h:99
union BTPageOpaqueData::@46 btpo
#define P_NONE
Definition: nbtree.h:181
#define InvalidBuffer
Definition: buf.h:25
int errcode(int sqlerrcode)
Definition: elog.c:570
uint32 BlockNumber
Definition: block.h:31
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
#define P_ISMETA(opaque)
Definition: nbtree.h:192
#define BT_READ
Definition: nbtree.h:402
void pfree(void *pointer)
Definition: mcxt.c:1031
#define BTREE_MAGIC
Definition: nbtree.h:132
#define ERROR
Definition: elog.h:43
#define BTPageGetMeta(p)
Definition: nbtree.h:112
#define RelationGetRelationName(relation)
Definition: rel.h:450
unsigned int uint32
Definition: c.h:358
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define ereport(elevel, rest)
Definition: elog.h:141
#define BTREE_METAPAGE
Definition: nbtree.h:131
uint32 level
Definition: nbtree.h:61
BlockNumber btm_root
Definition: nbtree.h:101
#define BTREE_MIN_VERSION
Definition: nbtree.h:134
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:912
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
int errmsg(const char *fmt,...)
Definition: elog.c:784
uint32 btm_level
Definition: nbtree.h:102
#define elog(elevel,...)
Definition: elog.h:226
void * rd_amcache
Definition: rel.h:179
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:188
Pointer Page
Definition: bufpage.h:78

◆ _bt_heapkeyspace()

bool _bt_heapkeyspace ( Relation  rel)

Definition at line 636 of file nbtpage.c.

References _bt_getbuf(), _bt_getmeta(), _bt_relbuf(), Assert, BT_READ, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, MemoryContextAlloc(), P_NONE, RelationData::rd_amcache, and RelationData::rd_indexcxt.

Referenced by _bt_first(), _bt_mkscankey(), and bt_index_check_internal().

637 {
638  BTMetaPageData *metad;
639 
640  if (rel->rd_amcache == NULL)
641  {
642  Buffer metabuf;
643 
644  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
645  metad = _bt_getmeta(rel, metabuf);
646 
647  /*
648  * If there's no root page yet, _bt_getroot() doesn't expect a cache
649  * to be made, so just stop here. (XXX perhaps _bt_getroot() should
650  * be changed to allow this case.)
651  */
652  if (metad->btm_root == P_NONE)
653  {
654  uint32 btm_version = metad->btm_version;
655 
656  _bt_relbuf(rel, metabuf);
657  return btm_version > BTREE_NOVAC_VERSION;
658  }
659 
660  /*
661  * Cache the metapage data for next time
662  *
663  * An on-the-fly version upgrade performed by _bt_upgrademetapage()
664  * can change the nbtree version for an index without invalidating any
665  * local cache. This is okay because it can only happen when moving
666  * from version 2 to version 3, both of which are !heapkeyspace
667  * versions.
668  */
669  rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
670  sizeof(BTMetaPageData));
671  memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
672  _bt_relbuf(rel, metabuf);
673  }
674 
675  /* Get cached page */
676  metad = (BTMetaPageData *) rel->rd_amcache;
677  /* We shouldn't have cached it if any of these fail */
678  Assert(metad->btm_magic == BTREE_MAGIC);
679  Assert(metad->btm_version >= BTREE_MIN_VERSION);
680  Assert(metad->btm_version <= BTREE_VERSION);
681  Assert(metad->btm_fastroot != P_NONE);
682 
683  return metad->btm_version > BTREE_NOVAC_VERSION;
684 }
static BTMetaPageData * _bt_getmeta(Relation rel, Buffer metabuf)
Definition: nbtpage.c:120
uint32 btm_version
Definition: nbtree.h:100
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:757
#define BTREE_VERSION
Definition: nbtree.h:133
uint32 btm_magic
Definition: nbtree.h:99
#define P_NONE
Definition: nbtree.h:181
#define BT_READ
Definition: nbtree.h:402
BlockNumber btm_fastroot
Definition: nbtree.h:103
#define BTREE_MAGIC
Definition: nbtree.h:132
unsigned int uint32
Definition: c.h:358
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:135
#define BTREE_METAPAGE
Definition: nbtree.h:131
BlockNumber btm_root
Definition: nbtree.h:101
#define BTREE_MIN_VERSION
Definition: nbtree.h:134
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:912
#define Assert(condition)
Definition: c.h:732
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:771
MemoryContext rd_indexcxt
Definition: rel.h:155
void * rd_amcache
Definition: rel.h:179
int Buffer
Definition: buf.h:23

◆ _bt_initmetapage()

void _bt_initmetapage ( Page  page,
BlockNumber  rootbknum,
uint32  level 
)

Definition at line 50 of file nbtpage.c.

References _bt_pageinit(), BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_oldest_btpo_xact, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_META, BTPageGetMeta, BTPageOpaqueData::btpo_flags, BTREE_MAGIC, BTREE_VERSION, InvalidTransactionId, and PageGetSpecialPointer.

Referenced by _bt_uppershutdown(), and btbuildempty().

51 {
52  BTMetaPageData *metad;
53  BTPageOpaque metaopaque;
54 
55  _bt_pageinit(page, BLCKSZ);
56 
57  metad = BTPageGetMeta(page);
58  metad->btm_magic = BTREE_MAGIC;
59  metad->btm_version = BTREE_VERSION;
60  metad->btm_root = rootbknum;
61  metad->btm_level = level;
62  metad->btm_fastroot = rootbknum;
63  metad->btm_fastlevel = level;
64  metad->btm_oldest_btpo_xact = InvalidTransactionId;
65  metad->btm_last_cleanup_num_heap_tuples = -1.0;
66 
67  metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
68  metaopaque->btpo_flags = BTP_META;
69 
70  /*
71  * Set pd_lower just past the end of the metadata. This is essential,
72  * because without doing so, metadata will be lost if xlog.c compresses
73  * the page.
74  */
75  ((PageHeader) page)->pd_lower =
76  ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
77 }
uint32 btm_version
Definition: nbtree.h:100
#define BTREE_VERSION
Definition: nbtree.h:133
uint32 btm_magic
Definition: nbtree.h:99
#define BTP_META
Definition: nbtree.h:74
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:68
BlockNumber btm_fastroot
Definition: nbtree.h:103
#define BTREE_MAGIC
Definition: nbtree.h:132
#define BTPageGetMeta(p)
Definition: nbtree.h:112
#define InvalidTransactionId
Definition: transam.h:31
uint32 btm_fastlevel
Definition: nbtree.h:104
BlockNumber btm_root
Definition: nbtree.h:101
PageHeaderData * PageHeader
Definition: bufpage.h:166
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:108
uint32 btm_level
Definition: nbtree.h:102
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:924
TransactionId btm_oldest_btpo_xact
Definition: nbtree.h:106
uint16 btpo_flags
Definition: nbtree.h:64

◆ _bt_keep_natts_fast()

int _bt_keep_natts_fast ( Relation  rel,
IndexTuple  lastleft,
IndexTuple  firstright 
)

Definition at line 2330 of file nbtutils.c.

References attnum, datumIsEqual(), index_getattr, IndexRelationGetNumberOfKeyAttributes, RelationGetDescr, and TupleDescAttr.

Referenced by _bt_afternewitemoff(), _bt_split_penalty(), and _bt_strategy().

/*
 * NOTE(review): counts leading key attributes on which lastleft and
 * firstright are bitwise equal per datumIsEqual(), returning that count
 * plus one.  Uses raw datum comparison rather than opclass comparators —
 * presumably the "fast" variant of a comparator-based _bt_keep_natts;
 * confirm against nbtutils.c.
 */
2331 {
2332  TupleDesc itupdesc = RelationGetDescr(rel);
2333  int keysz = IndexRelationGetNumberOfKeyAttributes(rel);
2334  int keepnatts;
2335 
2336  keepnatts = 1;
2337  for (int attnum = 1; attnum <= keysz; attnum++)
2338  {
2339  Datum datum1,
2340  datum2;
2341  bool isNull1,
2342  isNull2;
2343  Form_pg_attribute att;
2344 
2345  datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
2346  datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
2347  att = TupleDescAttr(itupdesc, attnum - 1);
2348 
 /* a NULL can never match a non-NULL; attributes differ here */
2349  if (isNull1 != isNull2)
2350  break;
2351 
 /* both non-NULL: stop at the first bitwise-unequal attribute */
2352  if (!isNull1 &&
2353  !datumIsEqual(datum1, datum2, att->attbyval, att->attlen))
2354  break;
2355 
2356  keepnatts++;
2357  }
2358 
2359  return keepnatts;
2360 }
#define RelationGetDescr(relation)
Definition: rel.h:442
#define TupleDescAttr(tupdesc, i)
Definition: tupdesc.h:92