PostgreSQL Source Code  git master
nbtree.h File Reference
#include "access/amapi.h"
#include "access/itup.h"
#include "access/sdir.h"
#include "access/tableam.h"
#include "access/xlogreader.h"
#include "catalog/pg_am_d.h"
#include "catalog/pg_index.h"
#include "lib/stringinfo.h"
#include "storage/bufmgr.h"
#include "storage/shm_toc.h"

Go to the source code of this file.

Data Structures

struct  BTPageOpaqueData
 
struct  BTMetaPageData
 
struct  BTDeletedPageData
 
struct  BTPendingFSM
 
struct  BTVacState
 
struct  BTStackData
 
struct  BTScanInsertData
 
struct  BTInsertStateData
 
struct  BTDedupInterval
 
struct  BTDedupStateData
 
struct  BTVacuumPostingData
 
struct  BTScanPosItem
 
struct  BTScanPosData
 
struct  BTArrayKeyInfo
 
struct  BTScanOpaqueData
 
struct  BTOptions
 

Macros

#define BTPageGetOpaque(page)   ((BTPageOpaque) PageGetSpecialPointer(page))
 
#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */
 
#define BTP_ROOT   (1 << 1) /* root page (has no parent) */
 
#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */
 
#define BTP_META   (1 << 3) /* meta-page */
 
#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */
 
#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */
 
#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples (deprecated) */
 
#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */
 
#define BTP_HAS_FULLXID   (1 << 8) /* contains BTDeletedPageData */
 
#define MAX_BT_CYCLE_ID   0xFF7F
 
#define BTPageGetMeta(p)    ((BTMetaPageData *) PageGetContents(p))
 
#define BTREE_METAPAGE   0 /* first page is meta */
 
#define BTREE_MAGIC   0x053162 /* magic number in metapage */
 
#define BTREE_VERSION   4 /* current version number */
 
#define BTREE_MIN_VERSION   2 /* minimum supported version */
 
#define BTREE_NOVAC_VERSION   3 /* version with all meta fields set */
 
#define BTMaxItemSize(page)
 
#define BTMaxItemSizeNoHeapTid(page)
 
#define MaxTIDsPerBTreePage
 
#define BTREE_MIN_FILLFACTOR   10
 
#define BTREE_DEFAULT_FILLFACTOR   90
 
#define BTREE_NONLEAF_FILLFACTOR   70
 
#define BTREE_SINGLEVAL_FILLFACTOR   96
 
#define P_NONE   0
 
#define P_LEFTMOST(opaque)   ((opaque)->btpo_prev == P_NONE)
 
#define P_RIGHTMOST(opaque)   ((opaque)->btpo_next == P_NONE)
 
#define P_ISLEAF(opaque)   (((opaque)->btpo_flags & BTP_LEAF) != 0)
 
#define P_ISROOT(opaque)   (((opaque)->btpo_flags & BTP_ROOT) != 0)
 
#define P_ISDELETED(opaque)   (((opaque)->btpo_flags & BTP_DELETED) != 0)
 
#define P_ISMETA(opaque)   (((opaque)->btpo_flags & BTP_META) != 0)
 
#define P_ISHALFDEAD(opaque)   (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)
 
#define P_IGNORE(opaque)   (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
 
#define P_HAS_GARBAGE(opaque)   (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
 
#define P_INCOMPLETE_SPLIT(opaque)   (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
 
#define P_HAS_FULLXID(opaque)   (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
 
#define P_HIKEY   ((OffsetNumber) 1)
 
#define P_FIRSTKEY   ((OffsetNumber) 2)
 
#define P_FIRSTDATAKEY(opaque)   (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
 
#define INDEX_ALT_TID_MASK   INDEX_AM_RESERVED_BIT
 
#define BT_OFFSET_MASK   0x0FFF
 
#define BT_STATUS_OFFSET_MASK   0xF000
 
#define BT_PIVOT_HEAP_TID_ATTR   0x1000
 
#define BT_IS_POSTING   0x2000
 
#define BTreeTupleGetNAtts(itup, rel)
 
#define BTCommuteStrategyNumber(strat)   (BTMaxStrategyNumber + 1 - (strat))
 
#define BTORDER_PROC   1
 
#define BTSORTSUPPORT_PROC   2
 
#define BTINRANGE_PROC   3
 
#define BTEQUALIMAGE_PROC   4
 
#define BTOPTIONS_PROC   5
 
#define BTNProcs   5
 
#define BT_READ   BUFFER_LOCK_SHARE
 
#define BT_WRITE   BUFFER_LOCK_EXCLUSIVE
 
#define BTScanPosIsPinned(scanpos)
 
#define BTScanPosUnpin(scanpos)
 
#define BTScanPosUnpinIfPinned(scanpos)
 
#define BTScanPosIsValid(scanpos)
 
#define BTScanPosInvalidate(scanpos)
 
#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */
 
#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */
 
#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */
 
#define SK_BT_DESC   (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
 
#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
 
#define BTGetFillFactor(relation)
 
#define BTGetTargetPageFreeSpace(relation)    (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)
 
#define BTGetDeduplicateItems(relation)
 
#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4
 
#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5
 

Typedefs

typedef uint16 BTCycleId
 
typedef struct BTPageOpaqueData BTPageOpaqueData
 
typedef BTPageOpaqueData * BTPageOpaque
 
typedef struct BTMetaPageData BTMetaPageData
 
typedef struct BTDeletedPageData BTDeletedPageData
 
typedef struct BTPendingFSM BTPendingFSM
 
typedef struct BTVacState BTVacState
 
typedef struct BTStackData BTStackData
 
typedef BTStackData * BTStack
 
typedef struct BTScanInsertData BTScanInsertData
 
typedef BTScanInsertData * BTScanInsert
 
typedef struct BTInsertStateData BTInsertStateData
 
typedef BTInsertStateData * BTInsertState
 
typedef struct BTDedupInterval BTDedupInterval
 
typedef struct BTDedupStateData BTDedupStateData
 
typedef BTDedupStateData * BTDedupState
 
typedef struct BTVacuumPostingData BTVacuumPostingData
 
typedef BTVacuumPostingData * BTVacuumPosting
 
typedef struct BTScanPosItem BTScanPosItem
 
typedef struct BTScanPosData BTScanPosData
 
typedef BTScanPosData * BTScanPos
 
typedef struct BTArrayKeyInfo BTArrayKeyInfo
 
typedef struct BTScanOpaqueData BTScanOpaqueData
 
typedef BTScanOpaqueData * BTScanOpaque
 
typedef struct BTOptions BTOptions
 

Functions

static void BTPageSetDeleted (Page page, FullTransactionId safexid)
 
static FullTransactionId BTPageGetDeleteXid (Page page)
 
static bool BTPageIsRecyclable (Page page)
 
static bool BTreeTupleIsPivot (IndexTuple itup)
 
static bool BTreeTupleIsPosting (IndexTuple itup)
 
static void BTreeTupleSetPosting (IndexTuple itup, uint16 nhtids, int postingoffset)
 
static uint16 BTreeTupleGetNPosting (IndexTuple posting)
 
static uint32 BTreeTupleGetPostingOffset (IndexTuple posting)
 
static ItemPointer BTreeTupleGetPosting (IndexTuple posting)
 
static ItemPointer BTreeTupleGetPostingN (IndexTuple posting, int n)
 
static BlockNumber BTreeTupleGetDownLink (IndexTuple pivot)
 
static void BTreeTupleSetDownLink (IndexTuple pivot, BlockNumber blkno)
 
static void BTreeTupleSetNAtts (IndexTuple itup, uint16 nkeyatts, bool heaptid)
 
static BlockNumber BTreeTupleGetTopParent (IndexTuple leafhikey)
 
static void BTreeTupleSetTopParent (IndexTuple leafhikey, BlockNumber blkno)
 
static ItemPointer BTreeTupleGetHeapTID (IndexTuple itup)
 
static ItemPointer BTreeTupleGetMaxHeapTID (IndexTuple itup)
 
void btbuildempty (Relation index)
 
bool btinsert (Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo)
 
IndexScanDesc btbeginscan (Relation rel, int nkeys, int norderbys)
 
Size btestimateparallelscan (void)
 
void btinitparallelscan (void *target)
 
bool btgettuple (IndexScanDesc scan, ScanDirection dir)
 
int64 btgetbitmap (IndexScanDesc scan, TIDBitmap *tbm)
 
void btrescan (IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
 
void btparallelrescan (IndexScanDesc scan)
 
void btendscan (IndexScanDesc scan)
 
void btmarkpos (IndexScanDesc scan)
 
void btrestrpos (IndexScanDesc scan)
 
IndexBulkDeleteResult * btbulkdelete (IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
 
IndexBulkDeleteResult * btvacuumcleanup (IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
bool btcanreturn (Relation index, int attno)
 
bool _bt_parallel_seize (IndexScanDesc scan, BlockNumber *pageno)
 
void _bt_parallel_release (IndexScanDesc scan, BlockNumber scan_page)
 
void _bt_parallel_done (IndexScanDesc scan)
 
void _bt_parallel_advance_array_keys (IndexScanDesc scan)
 
void _bt_dedup_pass (Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem, Size newitemsz, bool bottomupdedup)
 
bool _bt_bottomupdel_pass (Relation rel, Buffer buf, Relation heapRel, Size newitemsz)
 
void _bt_dedup_start_pending (BTDedupState state, IndexTuple base, OffsetNumber baseoff)
 
bool _bt_dedup_save_htid (BTDedupState state, IndexTuple itup)
 
Size _bt_dedup_finish_pending (Page newpage, BTDedupState state)
 
IndexTuple _bt_form_posting (IndexTuple base, ItemPointer htids, int nhtids)
 
void _bt_update_posting (BTVacuumPosting vacposting)
 
IndexTuple _bt_swap_posting (IndexTuple newitem, IndexTuple oposting, int postingoff)
 
bool _bt_doinsert (Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
 
void _bt_finish_split (Relation rel, Buffer lbuf, BTStack stack)
 
Buffer _bt_getstackbuf (Relation rel, BTStack stack, BlockNumber child)
 
OffsetNumber _bt_findsplitloc (Relation rel, Page origpage, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
 
void _bt_initmetapage (Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
 
bool _bt_vacuum_needs_cleanup (Relation rel)
 
void _bt_set_cleanup_info (Relation rel, BlockNumber num_delpages)
 
void _bt_upgrademetapage (Page page)
 
Buffer _bt_getroot (Relation rel, int access)
 
Buffer _bt_gettrueroot (Relation rel)
 
int _bt_getrootheight (Relation rel)
 
void _bt_metaversion (Relation rel, bool *heapkeyspace, bool *allequalimage)
 
void _bt_checkpage (Relation rel, Buffer buf)
 
Buffer _bt_getbuf (Relation rel, BlockNumber blkno, int access)
 
Buffer _bt_relandgetbuf (Relation rel, Buffer obuf, BlockNumber blkno, int access)
 
void _bt_relbuf (Relation rel, Buffer buf)
 
void _bt_lockbuf (Relation rel, Buffer buf, int access)
 
void _bt_unlockbuf (Relation rel, Buffer buf)
 
bool _bt_conditionallockbuf (Relation rel, Buffer buf)
 
void _bt_upgradelockbufcleanup (Relation rel, Buffer buf)
 
void _bt_pageinit (Page page, Size size)
 
void _bt_delitems_vacuum (Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
 
void _bt_delitems_delete_check (Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
 
void _bt_pagedel (Relation rel, Buffer leafbuf, BTVacState *vstate)
 
void _bt_pendingfsm_init (Relation rel, BTVacState *vstate, bool cleanuponly)
 
void _bt_pendingfsm_finalize (Relation rel, BTVacState *vstate)
 
BTStack _bt_search (Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
 
Buffer _bt_moveright (Relation rel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access, Snapshot snapshot)
 
OffsetNumber _bt_binsrch_insert (Relation rel, BTInsertState insertstate)
 
int32 _bt_compare (Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost, Snapshot snapshot)
 
BTScanInsert _bt_mkscankey (Relation rel, IndexTuple itup)
 
void _bt_freestack (BTStack stack)
 
void _bt_preprocess_array_keys (IndexScanDesc scan)
 
void _bt_start_array_keys (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_advance_array_keys (IndexScanDesc scan, ScanDirection dir)
 
void _bt_mark_array_keys (IndexScanDesc scan)
 
void _bt_restore_array_keys (IndexScanDesc scan)
 
void _bt_preprocess_keys (IndexScanDesc scan)
 
bool _bt_checkkeys (IndexScanDesc scan, IndexTuple tuple, int tupnatts, ScanDirection dir, bool *continuescan)
 
void _bt_killitems (IndexScanDesc scan)
 
BTCycleId _bt_vacuum_cycleid (Relation rel)
 
BTCycleId _bt_start_vacuum (Relation rel)
 
void _bt_end_vacuum (Relation rel)
 
void _bt_end_vacuum_callback (int code, Datum arg)
 
Size BTreeShmemSize (void)
 
void BTreeShmemInit (void)
 
bytea * btoptions (Datum reloptions, bool validate)
 
bool btproperty (Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull)
 
char * btbuildphasename (int64 phasenum)
 
IndexTuple _bt_truncate (Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
 
int _bt_keep_natts_fast (Relation rel, IndexTuple lastleft, IndexTuple firstright)
 
bool _bt_check_natts (Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
 
void _bt_check_third_page (Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
 
bool _bt_allequalimage (Relation rel, bool debugmessage)
 
bool btvalidate (Oid opclassoid)
 
void btadjustmembers (Oid opfamilyoid, Oid opclassoid, List *operators, List *functions)
 
IndexBuildResult * btbuild (Relation heap, Relation index, struct IndexInfo *indexInfo)
 
void _bt_parallel_build_main (dsm_segment *seg, shm_toc *toc)
 

Macro Definition Documentation

◆ BT_IS_POSTING

#define BT_IS_POSTING   0x2000

Definition at line 468 of file nbtree.h.

◆ BT_OFFSET_MASK

#define BT_OFFSET_MASK   0x0FFF

Definition at line 464 of file nbtree.h.

◆ BT_PIVOT_HEAP_TID_ATTR

#define BT_PIVOT_HEAP_TID_ATTR   0x1000

Definition at line 467 of file nbtree.h.

◆ BT_READ

#define BT_READ   BUFFER_LOCK_SHARE

Definition at line 714 of file nbtree.h.

◆ BT_STATUS_OFFSET_MASK

#define BT_STATUS_OFFSET_MASK   0xF000

Definition at line 465 of file nbtree.h.

◆ BT_WRITE

#define BT_WRITE   BUFFER_LOCK_EXCLUSIVE

Definition at line 715 of file nbtree.h.
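
As a usage illustration (a minimal sketch, not text from the header itself): these values are passed as the access argument of the nbtree lock wrappers declared below.

 _bt_lockbuf(rel, buf, BT_READ);   /* shared lock, enough to read the page */
 /* ... examine the page ... */
 _bt_unlockbuf(rel, buf);

Passing BT_WRITE instead takes the exclusive lock required before modifying the page.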

◆ BTCommuteStrategyNumber

#define BTCommuteStrategyNumber (   strat)    (BTMaxStrategyNumber + 1 - (strat))

Definition at line 680 of file nbtree.h.

◆ BTEQUALIMAGE_PROC

#define BTEQUALIMAGE_PROC   4

Definition at line 705 of file nbtree.h.

◆ BTGetDeduplicateItems

#define BTGetDeduplicateItems (   relation)
Value:
(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
relation->rd_rel->relam == BTREE_AM_OID), \
((relation)->rd_options ? \
((BTOptions *) (relation)->rd_options)->deduplicate_items : true))
#define AssertMacro(condition)
Definition: c.h:805

Definition at line 1103 of file nbtree.h.

◆ BTGetFillFactor

#define BTGetFillFactor (   relation)
Value:
(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
relation->rd_rel->relam == BTREE_AM_OID), \
(relation)->rd_options ? \
((BTOptions *) (relation)->rd_options)->fillfactor : \
BTREE_DEFAULT_FILLFACTOR)

Definition at line 1095 of file nbtree.h.

◆ BTGetTargetPageFreeSpace

#define BTGetTargetPageFreeSpace (   relation)     (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)

Definition at line 1101 of file nbtree.h.
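
As a worked example (assuming the default 8192-byte BLCKSZ): with the default fillfactor of 90, the target free space is 8192 * (100 - 90) / 100 = 819 bytes per leaf page.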

◆ BTINRANGE_PROC

#define BTINRANGE_PROC   3

Definition at line 704 of file nbtree.h.

◆ BTMaxItemSize

#define BTMaxItemSize (   page)
Value:
MAXALIGN_DOWN((PageGetPageSize(page) - \
MAXALIGN(SizeOfPageHeaderData + \
3*sizeof(ItemIdData) + \
3*sizeof(ItemPointerData)) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
#define SizeOfPageHeaderData
Definition: bufpage.h:215
#define PageGetPageSize(page)
Definition: bufpage.h:267
#define MAXALIGN_DOWN(LEN)
Definition: c.h:769
#define MAXALIGN(LEN)
Definition: c.h:757

Definition at line 164 of file nbtree.h.
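
As a worked example (assuming an 8192-byte BLCKSZ, a 24-byte page header, 4-byte ItemIdData, 6-byte ItemPointerData, 16-byte BTPageOpaqueData, and 8-byte MAXALIGN): (8192 - MAXALIGN(24 + 12 + 18) - 16) / 3 = (8192 - 56 - 16) / 3 = 2706, which MAXALIGN_DOWN rounds to 2704 bytes. This is the origin of the "1/3 of a buffer page" limit mentioned in the _bt_check_third_page() error hint below.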

◆ BTMaxItemSizeNoHeapTid

#define BTMaxItemSizeNoHeapTid (   page)
Value:
MAXALIGN_DOWN((PageGetPageSize(page) - \
MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3)

Definition at line 170 of file nbtree.h.

◆ BTNProcs

#define BTNProcs   5

Definition at line 707 of file nbtree.h.

◆ BTOPTIONS_PROC

#define BTOPTIONS_PROC   5

Definition at line 706 of file nbtree.h.

◆ BTORDER_PROC

#define BTORDER_PROC   1

Definition at line 702 of file nbtree.h.

◆ BTP_DELETED

#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */

Definition at line 78 of file nbtree.h.

◆ BTP_HALF_DEAD

#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */

Definition at line 80 of file nbtree.h.

◆ BTP_HAS_FULLXID

#define BTP_HAS_FULLXID   (1 << 8) /* contains BTDeletedPageData */

Definition at line 84 of file nbtree.h.

◆ BTP_HAS_GARBAGE

#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples (deprecated) */

Definition at line 82 of file nbtree.h.

◆ BTP_INCOMPLETE_SPLIT

#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */

Definition at line 83 of file nbtree.h.

◆ BTP_LEAF

#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */

Definition at line 76 of file nbtree.h.

◆ BTP_META

#define BTP_META   (1 << 3) /* meta-page */

Definition at line 79 of file nbtree.h.

◆ BTP_ROOT

#define BTP_ROOT   (1 << 1) /* root page (has no parent) */

Definition at line 77 of file nbtree.h.

◆ BTP_SPLIT_END

#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */

Definition at line 81 of file nbtree.h.

◆ BTPageGetMeta

#define BTPageGetMeta (   p)     ((BTMetaPageData *) PageGetContents(p))

Definition at line 121 of file nbtree.h.

◆ BTPageGetOpaque

#define BTPageGetOpaque (   page)    ((BTPageOpaque) PageGetSpecialPointer(page))

Definition at line 73 of file nbtree.h.
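
A minimal sketch of the usual access pattern (assuming buf is a buffer the caller has already pinned and locked; compare _bt_binsrch_insert() below):

 Page page = BufferGetPage(buf);
 BTPageOpaque opaque = BTPageGetOpaque(page);

 if (P_ISLEAF(opaque) && !P_IGNORE(opaque))
 {
 /* live leaf page: its items are safe to examine */
 }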

◆ BTREE_DEFAULT_FILLFACTOR

#define BTREE_DEFAULT_FILLFACTOR   90

Definition at line 201 of file nbtree.h.

◆ BTREE_MAGIC

#define BTREE_MAGIC   0x053162 /* magic number in metapage */

Definition at line 149 of file nbtree.h.

◆ BTREE_METAPAGE

#define BTREE_METAPAGE   0 /* first page is meta */

Definition at line 148 of file nbtree.h.

◆ BTREE_MIN_FILLFACTOR

#define BTREE_MIN_FILLFACTOR   10

Definition at line 200 of file nbtree.h.

◆ BTREE_MIN_VERSION

#define BTREE_MIN_VERSION   2 /* minimum supported version */

Definition at line 151 of file nbtree.h.

◆ BTREE_NONLEAF_FILLFACTOR

#define BTREE_NONLEAF_FILLFACTOR   70

Definition at line 202 of file nbtree.h.

◆ BTREE_NOVAC_VERSION

#define BTREE_NOVAC_VERSION   3 /* version with all meta fields set */

Definition at line 152 of file nbtree.h.

◆ BTREE_SINGLEVAL_FILLFACTOR

#define BTREE_SINGLEVAL_FILLFACTOR   96

Definition at line 203 of file nbtree.h.

◆ BTREE_VERSION

#define BTREE_VERSION   4 /* current version number */

Definition at line 150 of file nbtree.h.

◆ BTreeTupleGetNAtts

#define BTreeTupleGetNAtts (   itup,
  rel 
)
Value:
( \
(BTreeTupleIsPivot(itup)) ? \
( \
ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_OFFSET_MASK \
) \
: \
IndexRelationGetNumberOfAttributes(rel) \
)
#define ItemPointerGetOffsetNumberNoCheck(pointer)
Definition: itemptr.h:108
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:475
#define BT_OFFSET_MASK
Definition: nbtree.h:464
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:501

Definition at line 572 of file nbtree.h.
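
In other words, a pivot tuple stores its attribute count in the low 12 bits (BT_OFFSET_MASK) of its t_tid offset field, while a non-pivot tuple implicitly has all of the index's attributes. For example, a pivot tuple truncated to two key attributes reports 2 here, regardless of how many attributes the index actually has.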

◆ BTScanPosInvalidate

#define BTScanPosInvalidate (   scanpos)
Value:
do { \
(scanpos).currPage = InvalidBlockNumber; \
(scanpos).nextPage = InvalidBlockNumber; \
(scanpos).buf = InvalidBuffer; \
(scanpos).lsn = InvalidXLogRecPtr; \
(scanpos).nextTupleOffset = 0; \
} while (0)
#define InvalidBlockNumber
Definition: block.h:33
#define InvalidBuffer
Definition: buf.h:25
static char * buf
Definition: pg_test_fsync.c:67
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28

Definition at line 1013 of file nbtree.h.

◆ BTScanPosIsPinned

#define BTScanPosIsPinned (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BufferIsValid((scanpos).buf) \
)
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123

Definition at line 990 of file nbtree.h.

◆ BTScanPosIsValid

#define BTScanPosIsValid (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BlockNumberIsValid((scanpos).currPage) \
)

Definition at line 1007 of file nbtree.h.

◆ BTScanPosUnpin

#define BTScanPosUnpin (   scanpos)
Value:
do { \
ReleaseBuffer((scanpos).buf); \
(scanpos).buf = InvalidBuffer; \
} while (0)

Definition at line 996 of file nbtree.h.

◆ BTScanPosUnpinIfPinned

#define BTScanPosUnpinIfPinned (   scanpos)
Value:
do { \
if (BTScanPosIsPinned(scanpos)) \
BTScanPosUnpin(scanpos); \
} while (0)
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:990

Definition at line 1001 of file nbtree.h.
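
A minimal teardown sketch, roughly the pattern used when a scan position is abandoned, e.g. at end of scan (where so is the scan's BTScanOpaque):

 if (BTScanPosIsValid(so->currPos))
 {
 /* drop the leaf-page pin, if one is still held, then mark the
  * position unused */
 BTScanPosUnpinIfPinned(so->currPos);
 BTScanPosInvalidate(so->currPos);
 }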

◆ BTSORTSUPPORT_PROC

#define BTSORTSUPPORT_PROC   2

Definition at line 703 of file nbtree.h.

◆ INDEX_ALT_TID_MASK

#define INDEX_ALT_TID_MASK   INDEX_AM_RESERVED_BIT

Definition at line 461 of file nbtree.h.

◆ MAX_BT_CYCLE_ID

#define MAX_BT_CYCLE_ID   0xFF7F

Definition at line 93 of file nbtree.h.

◆ MaxTIDsPerBTreePage

#define MaxTIDsPerBTreePage
Value:
(int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \
sizeof(ItemPointerData))

Definition at line 186 of file nbtree.h.
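
As a worked example (assuming an 8192-byte BLCKSZ, a 24-byte page header, 16-byte BTPageOpaqueData, and 6-byte ItemPointerData): (8192 - 24 - 16) / 6 = 1358 TIDs per leaf page.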

◆ P_FIRSTDATAKEY

#define P_FIRSTDATAKEY (   opaque)    (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)

Definition at line 371 of file nbtree.h.
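
A minimal sketch of the standard page-scan loop built on this macro (compare the loop in _bt_bottomupdel_pass() below):

 OffsetNumber offnum;
 OffsetNumber minoff = P_FIRSTDATAKEY(opaque);
 OffsetNumber maxoff = PageGetMaxOffsetNumber(page);

 for (offnum = minoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
 {
 IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

 /* ... process itup ... */
 }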

◆ P_FIRSTKEY

#define P_FIRSTKEY   ((OffsetNumber) 2)

Definition at line 370 of file nbtree.h.

◆ P_HAS_FULLXID

#define P_HAS_FULLXID (   opaque)    (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)

Definition at line 229 of file nbtree.h.

◆ P_HAS_GARBAGE

#define P_HAS_GARBAGE (   opaque)    (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)

Definition at line 227 of file nbtree.h.

◆ P_HIKEY

#define P_HIKEY   ((OffsetNumber) 1)

Definition at line 369 of file nbtree.h.

◆ P_IGNORE

#define P_IGNORE (   opaque)    (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)

Definition at line 226 of file nbtree.h.

◆ P_INCOMPLETE_SPLIT

#define P_INCOMPLETE_SPLIT (   opaque)    (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)

Definition at line 228 of file nbtree.h.

◆ P_ISDELETED

#define P_ISDELETED (   opaque)    (((opaque)->btpo_flags & BTP_DELETED) != 0)

Definition at line 223 of file nbtree.h.

◆ P_ISHALFDEAD

#define P_ISHALFDEAD (   opaque)    (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)

Definition at line 225 of file nbtree.h.

◆ P_ISLEAF

#define P_ISLEAF (   opaque)    (((opaque)->btpo_flags & BTP_LEAF) != 0)

Definition at line 221 of file nbtree.h.

◆ P_ISMETA

#define P_ISMETA (   opaque)    (((opaque)->btpo_flags & BTP_META) != 0)

Definition at line 224 of file nbtree.h.

◆ P_ISROOT

#define P_ISROOT (   opaque)    (((opaque)->btpo_flags & BTP_ROOT) != 0)

Definition at line 222 of file nbtree.h.

◆ P_LEFTMOST

#define P_LEFTMOST (   opaque)    ((opaque)->btpo_prev == P_NONE)

Definition at line 219 of file nbtree.h.

◆ P_NONE

#define P_NONE   0

Definition at line 213 of file nbtree.h.

◆ P_RIGHTMOST

#define P_RIGHTMOST (   opaque)    ((opaque)->btpo_next == P_NONE)

Definition at line 220 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN

#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2

Definition at line 1114 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_LEAF_LOAD

#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5

Definition at line 1117 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_1

#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3

Definition at line 1115 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_2

#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4

Definition at line 1116 of file nbtree.h.

◆ SK_BT_DESC

#define SK_BT_DESC   (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)

Definition at line 1084 of file nbtree.h.

◆ SK_BT_INDOPTION_SHIFT

#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */

Definition at line 1083 of file nbtree.h.

◆ SK_BT_NULLS_FIRST

#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)

Definition at line 1085 of file nbtree.h.

◆ SK_BT_REQBKWD

#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */

Definition at line 1082 of file nbtree.h.

◆ SK_BT_REQFWD

#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */

Definition at line 1081 of file nbtree.h.

Typedef Documentation

◆ BTArrayKeyInfo

◆ BTCycleId

typedef uint16 BTCycleId

Definition at line 29 of file nbtree.h.

◆ BTDedupInterval

◆ BTDedupState

Definition at line 893 of file nbtree.h.

◆ BTDedupStateData

◆ BTDeletedPageData

◆ BTInsertState

Definition at line 835 of file nbtree.h.

◆ BTInsertStateData

◆ BTMetaPageData

◆ BTOptions

typedef struct BTOptions BTOptions

◆ BTPageOpaque

Definition at line 71 of file nbtree.h.

◆ BTPageOpaqueData

◆ BTPendingFSM

typedef struct BTPendingFSM BTPendingFSM

◆ BTScanInsert

Definition at line 796 of file nbtree.h.

◆ BTScanInsertData

◆ BTScanOpaque

Definition at line 1074 of file nbtree.h.

◆ BTScanOpaqueData

◆ BTScanPos

Definition at line 988 of file nbtree.h.

◆ BTScanPosData

typedef struct BTScanPosData BTScanPosData

◆ BTScanPosItem

typedef struct BTScanPosItem BTScanPosItem

◆ BTStack

typedef BTStackData* BTStack

Definition at line 734 of file nbtree.h.

◆ BTStackData

typedef struct BTStackData BTStackData

◆ BTVacState

typedef struct BTVacState BTVacState

◆ BTVacuumPosting

Definition at line 914 of file nbtree.h.

◆ BTVacuumPostingData

Function Documentation

◆ _bt_advance_array_keys()

bool _bt_advance_array_keys ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 551 of file nbtutils.c.

552 {
553  BTScanOpaque so = (BTScanOpaque) scan->opaque;
554  bool found = false;
555  int i;
556 
557  /*
558  * We must advance the last array key most quickly, since it will
559  * correspond to the lowest-order index column among the available
560  * qualifications. This is necessary to ensure correct ordering of output
561  * when there are multiple array keys.
562  */
563  for (i = so->numArrayKeys - 1; i >= 0; i--)
564  {
565  BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
566  ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key];
567  int cur_elem = curArrayKey->cur_elem;
568  int num_elems = curArrayKey->num_elems;
569 
570  if (ScanDirectionIsBackward(dir))
571  {
572  if (--cur_elem < 0)
573  {
574  cur_elem = num_elems - 1;
575  found = false; /* need to advance next array key */
576  }
577  else
578  found = true;
579  }
580  else
581  {
582  if (++cur_elem >= num_elems)
583  {
584  cur_elem = 0;
585  found = false; /* need to advance next array key */
586  }
587  else
588  found = true;
589  }
590 
591  curArrayKey->cur_elem = cur_elem;
592  skey->sk_argument = curArrayKey->elem_values[cur_elem];
593  if (found)
594  break;
595  }
596 
597  /* advance parallel scan */
598  if (scan->parallel_scan != NULL)
599  _bt_parallel_advance_array_keys(scan);
600 
601  return found;
602 }
int i
Definition: isn.c:73
void _bt_parallel_advance_array_keys(IndexScanDesc scan)
Definition: nbtree.c:758
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:1074
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
Datum * elem_values
Definition: nbtree.h:1029
BTArrayKeyInfo * arrayKeys
Definition: nbtree.h:1045
ScanKey arrayKeyData
Definition: nbtree.h:1040
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:166
Datum sk_argument
Definition: skey.h:72

References _bt_parallel_advance_array_keys(), BTScanOpaqueData::arrayKeyData, BTScanOpaqueData::arrayKeys, BTArrayKeyInfo::cur_elem, BTArrayKeyInfo::elem_values, i, BTArrayKeyInfo::num_elems, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, BTArrayKeyInfo::scan_key, ScanDirectionIsBackward, and ScanKeyData::sk_argument.

Referenced by btgetbitmap(), and btgettuple().
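
As a worked example: given WHERE a = ANY ('{1,2}') AND b = ANY ('{7,8}'), a forward scan steps through the array-key combinations (1,7), (1,8), (2,7), (2,8). The key on the lowest-order column advances fastest, and when it wraps around the loop above carries into the next array key, odometer-style; the function returns false once every combination has been consumed.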

◆ _bt_allequalimage()

bool _bt_allequalimage ( Relation  rel,
bool  debugmessage 
)

Definition at line 2696 of file nbtutils.c.

2697 {
2698  bool allequalimage = true;
2699 
2700  /* INCLUDE indexes can never support deduplication */
2701  if (IndexRelationGetNumberOfAttributes(rel) !=
2702  IndexRelationGetNumberOfKeyAttributes(rel))
2703  return false;
2704 
2705  for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++)
2706  {
2707  Oid opfamily = rel->rd_opfamily[i];
2708  Oid opcintype = rel->rd_opcintype[i];
2709  Oid collation = rel->rd_indcollation[i];
2710  Oid equalimageproc;
2711 
2712  equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype,
2713  BTEQUALIMAGE_PROC);
2714 
2715  /*
2716  * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to
2717  * be unsafe. Otherwise, actually call proc and see what it says.
2718  */
2719  if (!OidIsValid(equalimageproc) ||
2720  !DatumGetBool(OidFunctionCall1Coll(equalimageproc, collation,
2721  ObjectIdGetDatum(opcintype))))
2722  {
2723  allequalimage = false;
2724  break;
2725  }
2726  }
2727 
2728  if (debugmessage)
2729  {
2730  if (allequalimage)
2731  elog(DEBUG1, "index \"%s\" can safely use deduplication",
2732  RelationGetRelationName(rel));
2733  else
2734  elog(DEBUG1, "index \"%s\" cannot use deduplication",
2735  RelationGetRelationName(rel));
2736  }
2737 
2738  return allequalimage;
2739 }
#define OidIsValid(objectId)
Definition: c.h:710
#define DEBUG1
Definition: elog.h:24
#define elog(elevel,...)
Definition: elog.h:218
Datum OidFunctionCall1Coll(Oid functionId, Oid collation, Datum arg1)
Definition: fmgr.c:1396
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:794
#define BTEQUALIMAGE_PROC
Definition: nbtree.h:705
#define DatumGetBool(X)
Definition: postgres.h:437
#define ObjectIdGetDatum(X)
Definition: postgres.h:551
unsigned int Oid
Definition: postgres_ext.h:31
#define RelationGetRelationName(relation)
Definition: rel.h:523
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:508
Oid * rd_opcintype
Definition: rel.h:204
Oid * rd_opfamily
Definition: rel.h:203
Oid * rd_indcollation
Definition: rel.h:213

References BTEQUALIMAGE_PROC, DatumGetBool, DEBUG1, elog, get_opfamily_proc(), i, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, ObjectIdGetDatum, OidFunctionCall1Coll(), OidIsValid, RelationData::rd_indcollation, RelationData::rd_opcintype, RelationData::rd_opfamily, and RelationGetRelationName.

Referenced by _bt_leafbuild(), bt_index_check_internal(), and btbuildempty().

◆ _bt_binsrch_insert()

OffsetNumber _bt_binsrch_insert ( Relation  rel,
BTInsertState  insertstate 
)

Definition at line 442 of file nbtsearch.c.

443 {
444  BTScanInsert key = insertstate->itup_key;
445  Page page;
446  BTPageOpaque opaque;
447  OffsetNumber low,
448  high,
449  stricthigh;
450  int32 result,
451  cmpval;
452 
453  page = BufferGetPage(insertstate->buf);
454  opaque = BTPageGetOpaque(page);
455 
456  Assert(P_ISLEAF(opaque));
457  Assert(!key->nextkey);
458  Assert(insertstate->postingoff == 0);
459 
460  if (!insertstate->bounds_valid)
461  {
462  /* Start new binary search */
463  low = P_FIRSTDATAKEY(opaque);
464  high = PageGetMaxOffsetNumber(page);
465  }
466  else
467  {
468  /* Restore result of previous binary search against same page */
469  low = insertstate->low;
470  high = insertstate->stricthigh;
471  }
472 
473  /* If there are no keys on the page, return the first available slot */
474  if (unlikely(high < low))
475  {
476  /* Caller can't reuse bounds */
477  insertstate->low = InvalidOffsetNumber;
478  insertstate->stricthigh = InvalidOffsetNumber;
479  insertstate->bounds_valid = false;
480  return low;
481  }
482 
483  /*
484  * Binary search to find the first key on the page >= scan key. (nextkey
485  * is always false when inserting).
486  *
487  * The loop invariant is: all slots before 'low' are < scan key, all slots
488  * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
489  * maintained to save additional search effort for caller.
490  *
491  * We can fall out when high == low.
492  */
493  if (!insertstate->bounds_valid)
494  high++; /* establish the loop invariant for high */
495  stricthigh = high; /* high initially strictly higher */
496 
497  cmpval = 1; /* !nextkey comparison value */
498 
499  while (high > low)
500  {
501  OffsetNumber mid = low + ((high - low) / 2);
502 
503  /* We have low <= mid < high, so mid points at a real slot */
504 
505  result = _bt_compare(rel, key, page, mid);
506 
507  if (result >= cmpval)
508  low = mid + 1;
509  else
510  {
511  high = mid;
512  if (result != 0)
513  stricthigh = high;
514  }
515 
516  /*
517  * If tuple at offset located by binary search is a posting list whose
518  * TID range overlaps with caller's scantid, perform posting list
519  * binary search to set postingoff for caller. Caller must split the
520  * posting list when postingoff is set. This should happen
521  * infrequently.
522  */
523  if (unlikely(result == 0 && key->scantid != NULL))
524  {
525  /*
526  * postingoff should never be set more than once per leaf page
527  * binary search. That would mean that there are duplicate table
528  * TIDs in the index, which is never okay. Check for that here.
529  */
530  if (insertstate->postingoff != 0)
531  ereport(ERROR,
532  (errcode(ERRCODE_INDEX_CORRUPTED),
533  errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
534  ItemPointerGetBlockNumber(key->scantid),
535  ItemPointerGetOffsetNumber(key->scantid),
536  low, stricthigh,
537  BufferGetBlockNumber(insertstate->buf),
538  RelationGetRelationName(rel))));
539 
540  insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
541  }
542  }
543 
544  /*
545  * On a leaf page, a binary search always returns the first key >= scan
546  * key (at least in !nextkey case), which could be the last slot + 1. This
547  * is also the lower bound of cached search.
548  *
549  * stricthigh may also be the last slot + 1, which prevents caller from
550  * using bounds directly, but is still useful to us if we're called a
551  * second time with cached bounds (cached low will be < stricthigh when
552  * that happens).
553  */
554  insertstate->low = low;
555  insertstate->stricthigh = stricthigh;
556  insertstate->bounds_valid = true;
557 
558  return low;
559 }
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2755
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
Pointer Page
Definition: bufpage.h:78
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:356
signed int int32
Definition: c.h:429
#define unlikely(x)
Definition: c.h:273
int errmsg_internal(const char *fmt,...)
Definition: elog.c:991
int errcode(int sqlerrcode)
Definition: elog.c:693
#define ERROR
Definition: elog.h:33
#define ereport(elevel,...)
Definition: elog.h:143
#define ItemPointerGetBlockNumber(pointer)
Definition: itemptr.h:98
#define ItemPointerGetOffsetNumber(pointer)
Definition: itemptr.h:117
Assert(fmt[strlen(fmt) - 1] !='\n')
#define P_ISLEAF(opaque)
Definition: nbtree.h:221
#define BTPageGetOpaque(page)
Definition: nbtree.h:73
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:371
static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:570
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:656
#define InvalidOffsetNumber
Definition: off.h:26
uint16 OffsetNumber
Definition: off.h:24
OffsetNumber stricthigh
Definition: nbtree.h:825
bool bounds_valid
Definition: nbtree.h:823
OffsetNumber low
Definition: nbtree.h:824
BTScanInsert itup_key
Definition: nbtree.h:813

References _bt_binsrch_posting(), _bt_compare(), Assert(), BTInsertStateData::bounds_valid, BTPageGetOpaque, BTInsertStateData::buf, BufferGetBlockNumber(), BufferGetPage, ereport, errcode(), errmsg_internal(), ERROR, InvalidOffsetNumber, ItemPointerGetBlockNumber, ItemPointerGetOffsetNumber, BTInsertStateData::itup_key, sort-test::key, BTInsertStateData::low, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber, BTInsertStateData::postingoff, RelationGetRelationName, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_check_unique(), _bt_findinsertloc(), and bt_rootdescend().
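
Note the caching contract visible above: the first call against a leaf page runs a full binary search and saves low/stricthigh in the insertion state (bounds_valid), so a later call against the same page, as happens between _bt_check_unique() and _bt_findinsertloc(), can restart from the cached bounds instead of repeating the whole search.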

◆ _bt_bottomupdel_pass()

bool _bt_bottomupdel_pass ( Relation  rel,
Buffer  buf,
Relation  heapRel,
Size  newitemsz 
)

Definition at line 306 of file nbtdedup.c.

308 {
309  OffsetNumber offnum,
310  minoff,
311  maxoff;
312  Page page = BufferGetPage(buf);
313  BTPageOpaque opaque = BTPageGetOpaque(page);
314  BTDedupState state;
315  TM_IndexDeleteOp delstate;
316  bool neverdedup;
317  int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
318 
319  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
320  newitemsz += sizeof(ItemIdData);
321 
322  /* Initialize deduplication state */
323  state = (BTDedupState) palloc(sizeof(BTDedupStateData));
324  state->deduplicate = true;
325  state->nmaxitems = 0;
326  state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
327  state->base = NULL;
328  state->baseoff = InvalidOffsetNumber;
329  state->basetupsize = 0;
330  state->htids = palloc(state->maxpostingsize);
331  state->nhtids = 0;
332  state->nitems = 0;
333  state->phystupsize = 0;
334  state->nintervals = 0;
335 
336  /*
337  * Initialize tableam state that describes bottom-up index deletion
338  * operation.
339  *
340  * We'll go on to ask the tableam to search for TIDs whose index tuples we
341  * can safely delete. The tableam will search until our leaf page space
342  * target is satisfied, or until the cost of continuing with the tableam
343  * operation seems too high. It focuses its efforts on TIDs associated
344  * with duplicate index tuples that we mark "promising".
345  *
346  * This space target is a little arbitrary. The tableam must be able to
347  * keep the costs and benefits in balance. We provide the tableam with
348  * exhaustive information about what might work, without directly
349  * concerning ourselves with avoiding work during the tableam call. Our
350  * role in costing the bottom-up deletion process is strictly advisory.
351  */
352  delstate.irel = rel;
353  delstate.iblknum = BufferGetBlockNumber(buf);
354  delstate.bottomup = true;
355  delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
356  delstate.ndeltids = 0;
357  delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
358  delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
359 
360  minoff = P_FIRSTDATAKEY(opaque);
361  maxoff = PageGetMaxOffsetNumber(page);
362  for (offnum = minoff;
363  offnum <= maxoff;
364  offnum = OffsetNumberNext(offnum))
365  {
366  ItemId itemid = PageGetItemId(page, offnum);
367  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
368 
369  Assert(!ItemIdIsDead(itemid));
370 
371  if (offnum == minoff)
372  {
373  /* itup starts first pending interval */
374  _bt_dedup_start_pending(state, itup, offnum);
375  }
376  else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
377  _bt_dedup_save_htid(state, itup))
378  {
379  /* Tuple is equal; just added its TIDs to pending interval */
380  }
381  else
382  {
383  /* Finalize interval -- move its TIDs to delete state */
384  _bt_bottomupdel_finish_pending(page, state, &delstate);
385 
386  /* itup starts new pending interval */
387  _bt_dedup_start_pending(state, itup, offnum);
388  }
389  }
390  /* Finalize final interval -- move its TIDs to delete state */
391  _bt_bottomupdel_finish_pending(page, state, &delstate);
392 
393  /*
394  * We don't give up now in the event of having few (or even zero)
395  * promising tuples for the tableam because it's not up to us as the index
396  * AM to manage costs (note that the tableam might have heuristics of its
397  * own that work out what to do). We should at least avoid having our
398  * caller do a useless deduplication pass after we return in the event of
399  * zero promising tuples, though.
400  */
401  neverdedup = false;
402  if (state->nintervals == 0)
403  neverdedup = true;
404 
405  pfree(state->htids);
406  pfree(state);
407 
408  /* Ask tableam which TIDs are deletable, then physically delete them */
409  _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
410 
411  pfree(delstate.deltids);
412  pfree(delstate.status);
413 
414  /* Report "success" to caller unconditionally to avoid deduplication */
415  if (neverdedup)
416  return true;
417 
418  /* Don't dedup when we won't end up back here any time soon anyway */
419  return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
420 }
Size PageGetExactFreeSpace(Page page)
Definition: bufpage.c:958
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:234
#define PageGetItem(page, itemId)
Definition: bufpage.h:339
#define Max(x, y)
Definition: c.h:980
struct ItemIdData ItemIdData
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
IndexTupleData * IndexTuple
Definition: itup.h:53
void pfree(void *pointer)
Definition: mcxt.c:1175
void * palloc(Size size)
Definition: mcxt.c:1068
bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
Definition: nbtdedup.c:483
void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, OffsetNumber baseoff)
Definition: nbtdedup.c:432
static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state, TM_IndexDeleteOp *delstate)
Definition: nbtdedup.c:644
void _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
Definition: nbtpage.c:1528
#define MaxTIDsPerBTreePage
Definition: nbtree.h:186
BTDedupStateData * BTDedupState
Definition: nbtree.h:893
int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
Definition: nbtutils.c:2424
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
TM_IndexStatus * status
Definition: tableam.h:231
int bottomupfreespace
Definition: tableam.h:226
Relation irel
Definition: tableam.h:223
TM_IndexDelete * deltids
Definition: tableam.h:230
BlockNumber iblknum
Definition: tableam.h:224

References _bt_bottomupdel_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_delitems_delete_check(), _bt_keep_natts_fast(), Assert(), TM_IndexDeleteOp::bottomup, TM_IndexDeleteOp::bottomupfreespace, BTPageGetOpaque, buf, BufferGetBlockNumber(), BufferGetPage, TM_IndexDeleteOp::deltids, TM_IndexDeleteOp::iblknum, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, TM_IndexDeleteOp::irel, ItemIdIsDead, Max, MaxTIDsPerBTreePage, TM_IndexDeleteOp::ndeltids, OffsetNumberNext, P_FIRSTDATAKEY, PageGetExactFreeSpace(), PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, palloc(), pfree(), and TM_IndexDeleteOp::status.

Referenced by _bt_delete_or_dedup_one_page().

◆ _bt_check_natts()

bool _bt_check_natts ( Relation  rel,
bool  heapkeyspace,
Page  page,
OffsetNumber  offnum 
)

Definition at line 2471 of file nbtutils.c.

2472 {
2473  int16 natts = IndexRelationGetNumberOfAttributes(rel);
2474  int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
2475  BTPageOpaque opaque = BTPageGetOpaque(page);
2476  IndexTuple itup;
2477  int tupnatts;
2478 
2479  /*
2480  * We cannot reliably test a deleted or half-dead page, since they have
2481  * dummy high keys
2482  */
2483  if (P_IGNORE(opaque))
2484  return true;
2485 
2486  Assert(offnum >= FirstOffsetNumber &&
2487  offnum <= PageGetMaxOffsetNumber(page));
2488 
2489  /*
2490  * Mask allocated for number of keys in index tuple must be able to fit
2491  * maximum possible number of index attributes
2492  */
2493  StaticAssertStmt(BT_OFFSET_MASK >= INDEX_MAX_KEYS,
2494  "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS");
2495 
2496  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2497  tupnatts = BTreeTupleGetNAtts(itup, rel);
2498 
2499  /* !heapkeyspace indexes do not support deduplication */
2500  if (!heapkeyspace && BTreeTupleIsPosting(itup))
2501  return false;
2502 
2503  /* Posting list tuples should never have "pivot heap TID" bit set */
2504  if (BTreeTupleIsPosting(itup) &&
2505  (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
2506  BT_PIVOT_HEAP_TID_ATTR) != 0)
2507  return false;
2508 
2509  /* INCLUDE indexes do not support deduplication */
2510  if (natts != nkeyatts && BTreeTupleIsPosting(itup))
2511  return false;
2512 
2513  if (P_ISLEAF(opaque))
2514  {
2515  if (offnum >= P_FIRSTDATAKEY(opaque))
2516  {
2517  /*
2518  * Non-pivot tuple should never be explicitly marked as a pivot
2519  * tuple
2520  */
2521  if (BTreeTupleIsPivot(itup))
2522  return false;
2523 
2524  /*
2525  * Leaf tuples that are not the page high key (non-pivot tuples)
2526  * should never be truncated. (Note that tupnatts must have been
2527  * inferred, even with a posting list tuple, because only pivot
2528  * tuples store tupnatts directly.)
2529  */
2530  return tupnatts == natts;
2531  }
2532  else
2533  {
2534  /*
2535  * Rightmost page doesn't contain a page high key, so tuple was
2536  * checked above as ordinary leaf tuple
2537  */
2538  Assert(!P_RIGHTMOST(opaque));
2539 
2540  /*
2541  * !heapkeyspace high key tuple contains only key attributes. Note
2542  * that tupnatts will only have been explicitly represented in
2543  * !heapkeyspace indexes that happen to have non-key attributes.
2544  */
2545  if (!heapkeyspace)
2546  return tupnatts == nkeyatts;
2547 
2548  /* Use generic heapkeyspace pivot tuple handling */
2549  }
2550  }
2551  else /* !P_ISLEAF(opaque) */
2552  {
2553  if (offnum == P_FIRSTDATAKEY(opaque))
2554  {
2555  /*
2556  * The first tuple on any internal page (possibly the first after
2557  * its high key) is its negative infinity tuple. Negative
2558  * infinity tuples are always truncated to zero attributes. They
2559  * are a particular kind of pivot tuple.
2560  */
2561  if (heapkeyspace)
2562  return tupnatts == 0;
2563 
2564  /*
2565  * The number of attributes won't be explicitly represented if the
2566  * negative infinity tuple was generated during a page split that
2567  * occurred with a version of Postgres before v11. There must be
2568  * a problem when there is an explicit representation that is
2569  * non-zero, or when there is no explicit representation and the
2570  * tuple is evidently not a pre-pg_upgrade tuple.
2571  *
2572  * Prior to v11, downlinks always had P_HIKEY as their offset.
2573  * Accept that as an alternative indication of a valid
2574  * !heapkeyspace negative infinity tuple.
2575  */
2576  return tupnatts == 0 ||
2577  ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY;
2578  }
2579  else
2580  {
2581  /*
2582  * !heapkeyspace downlink tuple with separator key contains only
2583  * key attributes. Note that tupnatts will only have been
2584  * explicitly represented in !heapkeyspace indexes that happen to
2585  * have non-key attributes.
2586  */
2587  if (!heapkeyspace)
2588  return tupnatts == nkeyatts;
2589 
2590  /* Use generic heapkeyspace pivot tuple handling */
2591  }
2592  }
2593 
2594  /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
2595  Assert(heapkeyspace);
2596 
2597  /*
2598  * Explicit representation of the number of attributes is mandatory with
2599  * heapkeyspace index pivot tuples, regardless of whether or not there are
2600  * non-key attributes.
2601  */
2602  if (!BTreeTupleIsPivot(itup))
2603  return false;
2604 
2605  /* Pivot tuple should not use posting list representation (redundant) */
2606  if (BTreeTupleIsPosting(itup))
2607  return false;
2608 
2609  /*
2610  * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
2611  * when any other key attribute is truncated
2612  */
2613  if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
2614  return false;
2615 
2616  /*
2617  * Pivot tuple must have at least one untruncated key attribute (minus
2618  * infinity pivot tuples are the only exception). Pivot tuples can never
2619  * represent that there is a value present for a key attribute that
2620  * exceeds pg_index.indnkeyatts for the index.
2621  */
2622  return tupnatts > 0 && tupnatts <= nkeyatts;
2623 }
signed short int16
Definition: c.h:428
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:918
#define BT_PIVOT_HEAP_TID_ATTR
Definition: nbtree.h:467
#define P_HIKEY
Definition: nbtree.h:369
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:220
#define P_IGNORE(opaque)
Definition: nbtree.h:226
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:487
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:633
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:572
#define FirstOffsetNumber
Definition: off.h:27
#define INDEX_MAX_KEYS
ItemPointerData t_tid
Definition: itup.h:37

References Assert(), BT_OFFSET_MASK, BT_PIVOT_HEAP_TID_ATTR, BTPageGetOpaque, BTreeTupleGetHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPivot(), BTreeTupleIsPosting(), FirstOffsetNumber, INDEX_MAX_KEYS, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, ItemPointerGetOffsetNumber, ItemPointerGetOffsetNumberNoCheck, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_ISLEAF, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, StaticAssertStmt, and IndexTupleData::t_tid.

Referenced by _bt_compare(), and bt_target_page_check().

◆ _bt_check_third_page()

void _bt_check_third_page ( Relation  rel,
Relation  heap,
bool  needheaptidspace,
Page  page,
IndexTuple  newtup 
)

Definition at line 2638 of file nbtutils.c.

2640 {
2641  Size itemsz;
2642  BTPageOpaque opaque;
2643 
2644  itemsz = MAXALIGN(IndexTupleSize(newtup));
2645 
2646  /* Double check item size against limit */
2647  if (itemsz <= BTMaxItemSize(page))
2648  return;
2649 
2650  /*
2651  * Tuple is probably too large to fit on page, but it's possible that the
2652  * index uses version 2 or version 3, or that page is an internal page, in
2653  * which case a slightly higher limit applies.
2654  */
2655  if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page))
2656  return;
2657 
2658  /*
2659  * Internal page insertions cannot fail here, because that would mean that
2660  * an earlier leaf level insertion that should have failed didn't
2661  */
2662  opaque = BTPageGetOpaque(page);
2663  if (!P_ISLEAF(opaque))
2664  elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
2665  itemsz, RelationGetRelationName(rel));
2666 
2667  ereport(ERROR,
2668  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2669  errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
2670  itemsz,
2671  needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
2672  needheaptidspace ? BTMaxItemSize(page) :
2673  BTMaxItemSizeNoHeapTid(page),
2674  RelationGetRelationName(rel)),
2675  errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
2676  ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)),
2677  ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)),
2678  RelationGetRelationName(heap)),
2679  errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
2680  "Consider a function index of an MD5 hash of the value, "
2681  "or use full text indexing."),
2682  errtableconstraint(heap, RelationGetRelationName(rel))));
2683 }
size_t Size
Definition: c.h:540
int errdetail(const char *fmt,...)
Definition: elog.c:1037
int errhint(const char *fmt,...)
Definition: elog.c:1151
int errmsg(const char *fmt,...)
Definition: elog.c:904
#define IndexTupleSize(itup)
Definition: itup.h:70
#define BTMaxItemSizeNoHeapTid(page)
Definition: nbtree.h:170
#define BTREE_VERSION
Definition: nbtree.h:150
#define BTMaxItemSize(page)
Definition: nbtree.h:164
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:152
int errtableconstraint(Relation rel, const char *conname)
Definition: relcache.c:5873

References BTMaxItemSize, BTMaxItemSizeNoHeapTid, BTPageGetOpaque, BTREE_NOVAC_VERSION, BTREE_VERSION, BTreeTupleGetHeapTID(), elog, ereport, errcode(), errdetail(), errhint(), errmsg(), ERROR, errtableconstraint(), IndexTupleSize, ItemPointerGetBlockNumber, ItemPointerGetOffsetNumber, MAXALIGN, P_ISLEAF, and RelationGetRelationName.

Referenced by _bt_buildadd(), and _bt_findinsertloc().

◆ _bt_checkkeys()

bool _bt_checkkeys ( IndexScanDesc  scan,
IndexTuple  tuple,
int  tupnatts,
ScanDirection  dir,
bool continuescan 
)

Definition at line 1362 of file nbtutils.c.

1364 {
1365  TupleDesc tupdesc;
1366  BTScanOpaque so;
1367  int keysz;
1368  int ikey;
1369  ScanKey key;
1370 
1371  Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
1372 
1373  *continuescan = true; /* default assumption */
1374 
1375  tupdesc = RelationGetDescr(scan->indexRelation);
1376  so = (BTScanOpaque) scan->opaque;
1377  keysz = so->numberOfKeys;
1378 
1379  for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
1380  {
1381  Datum datum;
1382  bool isNull;
1383  Datum test;
1384 
1385  if (key->sk_attno > tupnatts)
1386  {
1387  /*
1388  * This attribute is truncated (must be high key). The value for
1389  * this attribute in the first non-pivot tuple on the page to the
1390  * right could be any possible value. Assume that truncated
1391  * attribute passes the qual.
1392  */
1394  Assert(BTreeTupleIsPivot(tuple));
1395  continue;
1396  }
1397 
1398  /* row-comparison keys need special processing */
1399  if (key->sk_flags & SK_ROW_HEADER)
1400  {
1401  if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir,
1402  continuescan))
1403  continue;
1404  return false;
1405  }
1406 
1407  datum = index_getattr(tuple,
1408  key->sk_attno,
1409  tupdesc,
1410  &isNull);
1411 
1412  if (key->sk_flags & SK_ISNULL)
1413  {
1414  /* Handle IS NULL/NOT NULL tests */
1415  if (key->sk_flags & SK_SEARCHNULL)
1416  {
1417  if (isNull)
1418  continue; /* tuple satisfies this qual */
1419  }
1420  else
1421  {
1422  Assert(key->sk_flags & SK_SEARCHNOTNULL);
1423  if (!isNull)
1424  continue; /* tuple satisfies this qual */
1425  }
1426 
1427  /*
1428  * Tuple fails this qual. If it's a required qual for the current
1429  * scan direction, then we can conclude no further tuples will
1430  * pass, either.
1431  */
1432  if ((key->sk_flags & SK_BT_REQFWD) &&
1433  ScanDirectionIsForward(dir))
1434  *continuescan = false;
1435  else if ((key->sk_flags & SK_BT_REQBKWD) &&
1436  ScanDirectionIsBackward(dir))
1437  *continuescan = false;
1438 
1439  /*
1440  * In any case, this indextuple doesn't match the qual.
1441  */
1442  return false;
1443  }
1444 
1445  if (isNull)
1446  {
1447  if (key->sk_flags & SK_BT_NULLS_FIRST)
1448  {
1449  /*
1450  * Since NULLs are sorted before non-NULLs, we know we have
1451  * reached the lower limit of the range of values for this
1452  * index attr. On a backward scan, we can stop if this qual
1453  * is one of the "must match" subset. We can stop regardless
1454  * of whether the qual is > or <, so long as it's required,
1455  * because it's not possible for any future tuples to pass. On
1456  * a forward scan, however, we must keep going, because we may
1457  * have initially positioned to the start of the index.
1458  */
1459  if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
1460  ScanDirectionIsBackward(dir))
1461  *continuescan = false;
1462  }
1463  else
1464  {
1465  /*
1466  * Since NULLs are sorted after non-NULLs, we know we have
1467  * reached the upper limit of the range of values for this
1468  * index attr. On a forward scan, we can stop if this qual is
1469  * one of the "must match" subset. We can stop regardless of
1470  * whether the qual is > or <, so long as it's required,
1471  * because it's not possible for any future tuples to pass. On
1472  * a backward scan, however, we must keep going, because we
1473  * may have initially positioned to the end of the index.
1474  */
1475  if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
1476  ScanDirectionIsForward(dir))
1477  *continuescan = false;
1478  }
1479 
1480  /*
1481  * In any case, this indextuple doesn't match the qual.
1482  */
1483  return false;
1484  }
1485 
1486  test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
1487  datum, key->sk_argument);
1488 
1489  if (!DatumGetBool(test))
1490  {
1491  /*
1492  * Tuple fails this qual. If it's a required qual for the current
1493  * scan direction, then we can conclude no further tuples will
1494  * pass, either.
1495  *
1496  * Note: because we stop the scan as soon as any required equality
1497  * qual fails, it is critical that equality quals be used for the
1498  * initial positioning in _bt_first() when they are available. See
1499  * comments in _bt_first().
1500  */
1501  if ((key->sk_flags & SK_BT_REQFWD) &&
1502  ScanDirectionIsForward(dir))
1503  *continuescan = false;
1504  else if ((key->sk_flags & SK_BT_REQBKWD) &&
1505  ScanDirectionIsBackward(dir))
1506  *continuescan = false;
1507 
1508  /*
1509  * In any case, this indextuple doesn't match the qual.
1510  */
1511  return false;
1512  }
1513  }
1514 
1515  /* If we get here, the tuple passes all index quals. */
1516  return true;
1517 }
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1134
#define index_getattr(tup, attnum, tupleDesc, isnull)
Definition: itup.h:99
#define SK_BT_REQBKWD
Definition: nbtree.h:1082
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:1085
#define SK_BT_REQFWD
Definition: nbtree.h:1081
static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, ScanDirection dir, bool *continuescan)
Definition: nbtutils.c:1529
uintptr_t Datum
Definition: postgres.h:411
static void test(void)
#define RelationGetDescr(relation)
Definition: rel.h:515
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
#define SK_ROW_HEADER
Definition: skey.h:117
#define SK_SEARCHNOTNULL
Definition: skey.h:122
#define SK_SEARCHNULL
Definition: skey.h:121
#define SK_ISNULL
Definition: skey.h:115
ScanKey keyData
Definition: nbtree.h:1037
Relation indexRelation
Definition: relscan.h:118

References _bt_check_rowcompare(), Assert(), BTreeTupleGetNAtts, BTreeTupleIsPivot(), DatumGetBool, FunctionCall2Coll(), index_getattr, IndexScanDescData::indexRelation, sort-test::key, BTScanOpaqueData::keyData, BTScanOpaqueData::numberOfKeys, IndexScanDescData::opaque, RelationGetDescr, ScanDirectionIsBackward, ScanDirectionIsForward, SK_BT_NULLS_FIRST, SK_BT_REQBKWD, SK_BT_REQFWD, SK_ISNULL, SK_ROW_HEADER, SK_SEARCHNOTNULL, SK_SEARCHNULL, and test().

Referenced by _bt_readpage().
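
The continuescan protocol is easiest to see from the caller's side. Below is a condensed reader loop in the spirit of _bt_readpage(); it is a sketch only, assuming scan, page, minoff, and maxoff have already been set up, and a forward scan direction:

    Relation    rel = scan->indexRelation;
    bool        continuescan = true;
    OffsetNumber offnum;

    for (offnum = minoff; offnum <= maxoff && continuescan;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      iid = PageGetItemId(page, offnum);
        IndexTuple  itup = (IndexTuple) PageGetItem(page, iid);

        if (_bt_checkkeys(scan, itup, IndexRelationGetNumberOfAttributes(rel),
                          ForwardScanDirection, &continuescan))
        {
            /* tuple satisfies every scan key -- save it */
        }
        /* continuescan was cleared if a required key can never match
         * again in this direction, which ends the loop early */
    }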

◆ _bt_checkpage()

void _bt_checkpage ( Relation  rel,
Buffer  buf 
)

Definition at line 794 of file nbtpage.c.

795 {
796  Page page = BufferGetPage(buf);
797 
798  /*
799  * ReadBuffer verifies that every newly-read page passes
800  * PageHeaderIsValid, which means it either contains a reasonably sane
801  * page header or is all-zero. We have to defend against the all-zero
802  * case, however.
803  */
804  if (PageIsNew(page))
805  ereport(ERROR,
806  (errcode(ERRCODE_INDEX_CORRUPTED),
807  errmsg("index \"%s\" contains unexpected zero page at block %u",
808  RelationGetRelationName(rel),
809  BufferGetBlockNumber(buf)),
810  errhint("Please REINDEX it.")));
811 
812  /*
813  * Additionally check that the special area looks sane.
814  */
815  if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
816  ereport(ERROR,
817  (errcode(ERRCODE_INDEX_CORRUPTED),
818  errmsg("index \"%s\" contains corrupted page at block %u",
819  RelationGetRelationName(rel),
820  BufferGetBlockNumber(buf)),
821  errhint("Please REINDEX it.")));
822 }
#define PageGetSpecialSize(page)
Definition: bufpage.h:299
#define PageIsNew(page)
Definition: bufpage.h:228

References buf, BufferGetBlockNumber(), BufferGetPage, ereport, errcode(), errhint(), errmsg(), ERROR, MAXALIGN, PageGetSpecialSize, PageIsNew, and RelationGetRelationName.

Referenced by _bt_getbuf(), _bt_relandgetbuf(), _bt_search_insert(), bt_recheck_sibling_links(), btvacuumpage(), and palloc_btree_page().
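
A minimal usage sketch (blkno is assumed valid; _bt_getbuf() performs essentially this sequence on behalf of its callers):

    Buffer      buf = ReadBuffer(rel, blkno);

    LockBuffer(buf, BT_READ);
    _bt_checkpage(rel, buf);    /* ereport(ERROR)s on zeroed or corrupt pages */
    /* ... BufferGetPage(buf) is safe to use from here on ... */
    UnlockReleaseBuffer(buf);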

◆ _bt_compare()

int32 _bt_compare ( Relation  rel,
BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)

Definition at line 656 of file nbtsearch.c.

660 {
661  TupleDesc itupdesc = RelationGetDescr(rel);
662  BTPageOpaque opaque = BTPageGetOpaque(page);
663  IndexTuple itup;
664  ItemPointer heapTid;
665  ScanKey scankey;
666  int ncmpkey;
667  int ntupatts;
668  int32 result;
669 
670  Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
671  Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
672  Assert(key->heapkeyspace || key->scantid == NULL);
673 
674  /*
675  * Force result ">" if target item is first data item on an internal page
676  * --- see NOTE above.
677  */
678  if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
679  return 1;
680 
681  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
682  ntupatts = BTreeTupleGetNAtts(itup, rel);
683 
684  /*
685  * The scan key is set up with the attribute number associated with each
686  * term in the key. It is important that, if the index is multi-key, the
687  * scan contain the first k key attributes, and that they be in order. If
688  * you think about how multi-key ordering works, you'll understand why
689  * this is.
690  *
691  * We don't test for violation of this condition here, however. The
692  * initial setup for the index scan had better have gotten it right (see
693  * _bt_first).
694  */
695 
696  ncmpkey = Min(ntupatts, key->keysz);
697  Assert(key->heapkeyspace || ncmpkey == key->keysz);
698  Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
699  scankey = key->scankeys;
700  for (int i = 1; i <= ncmpkey; i++)
701  {
702  Datum datum;
703  bool isNull;
704 
705  datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
706 
707  if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
708  {
709  if (isNull)
710  result = 0; /* NULL "=" NULL */
711  else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
712  result = -1; /* NULL "<" NOT_NULL */
713  else
714  result = 1; /* NULL ">" NOT_NULL */
715  }
716  else if (isNull) /* key is NOT_NULL and item is NULL */
717  {
718  if (scankey->sk_flags & SK_BT_NULLS_FIRST)
719  result = 1; /* NOT_NULL ">" NULL */
720  else
721  result = -1; /* NOT_NULL "<" NULL */
722  }
723  else
724  {
725  /*
726  * The sk_func needs to be passed the index value as left arg and
727  * the sk_argument as right arg (they might be of different
728  * types). Since it is convenient for callers to think of
729  * _bt_compare as comparing the scankey to the index item, we have
730  * to flip the sign of the comparison result. (Unless it's a DESC
731  * column, in which case we *don't* flip the sign.)
732  */
733  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
734  scankey->sk_collation,
735  datum,
736  scankey->sk_argument));
737 
738  if (!(scankey->sk_flags & SK_BT_DESC))
739  INVERT_COMPARE_RESULT(result);
740  }
741 
742  /* if the keys are unequal, return the difference */
743  if (result != 0)
744  return result;
745 
746  scankey++;
747  }
748 
749  /*
750  * All non-truncated attributes (other than heap TID) were found to be
751  * equal. Treat truncated attributes as minus infinity when scankey has a
752  * key attribute value that would otherwise be compared directly.
753  *
754  * Note: it doesn't matter if ntupatts includes non-key attributes;
755  * scankey won't, so explicitly excluding non-key attributes isn't
756  * necessary.
757  */
758  if (key->keysz > ntupatts)
759  return 1;
760 
761  /*
762  * Use the heap TID attribute and scantid to try to break the tie. The
763  * rules are the same as any other key attribute -- only the
764  * representation differs.
765  */
766  heapTid = BTreeTupleGetHeapTID(itup);
767  if (key->scantid == NULL)
768  {
769  /*
770  * Most searches have a scankey that is considered greater than a
771  * truncated pivot tuple if and when the scankey has equal values for
772  * attributes up to and including the least significant untruncated
773  * attribute in tuple.
774  *
775  * For example, if an index has the minimum two attributes (single
776  * user key attribute, plus heap TID attribute), and a page's high key
777  * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
778  * will not descend to the page to the left. The search will descend
779  * right instead. The truncated attribute in pivot tuple means that
780  * all non-pivot tuples on the page to the left are strictly < 'foo',
781  * so it isn't necessary to descend left. In other words, search
782  * doesn't have to descend left because it isn't interested in a match
783  * that has a heap TID value of -inf.
784  *
785  * However, some searches (pivotsearch searches) actually require that
786  * we descend left when this happens. -inf is treated as a possible
787  * match for omitted scankey attribute(s). This is needed by page
788  * deletion, which must re-find leaf pages that are targets for
789  * deletion using their high keys.
790  *
791  * Note: the heap TID part of the test ensures that scankey is being
792  * compared to a pivot tuple with one or more truncated key
793  * attributes.
794  *
795  * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
796  * left here, since they have no heap TID attribute (and cannot have
797  * any -inf key values in any case, since truncation can only remove
798  * non-key attributes). !heapkeyspace searches must always be
799  * prepared to deal with matches on both sides of the pivot once the
800  * leaf level is reached.
801  */
802  if (key->heapkeyspace && !key->pivotsearch &&
803  key->keysz == ntupatts && heapTid == NULL)
804  return 1;
805 
806  /* All provided scankey arguments found to be equal */
807  return 0;
808  }
809 
810  /*
811  * Treat truncated heap TID as minus infinity, since scankey has a key
812  * attribute value (scantid) that would otherwise be compared directly
813  */
814  Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
815  if (heapTid == NULL)
816  return 1;
817 
818  /*
819  * Scankey must be treated as equal to a posting list tuple if its scantid
820  * value falls within the range of the posting list. In all other cases
821  * there can only be a single heap TID value, which is compared directly
822  * with scantid.
823  */
824  Assert(!BTreeTupleIsPivot(itup));
825  result = ItemPointerCompare(key->scantid, heapTid);
826  if (result <= 0 || !BTreeTupleIsPosting(itup))
827  return result;
828  else
829  {
830  result = ItemPointerCompare(key->scantid,
831  BTreeTupleGetMaxHeapTID(itup));
832  if (result > 0)
833  return 1;
834  }
835 
836  return 0;
837 }
#define Min(x, y)
Definition: c.h:986
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1122
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:52
#define SK_BT_DESC
Definition: nbtree.h:1084
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:659
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:2471
#define DatumGetInt32(X)
Definition: postgres.h:516
int sk_flags
Definition: skey.h:66
FmgrInfo sk_func
Definition: skey.h:71
Oid sk_collation
Definition: skey.h:70
AttrNumber sk_attno
Definition: skey.h:67

References _bt_check_natts(), Assert(), BTPageGetOpaque, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPosting(), DatumGetInt32, FunctionCall2Coll(), i, index_getattr, IndexRelationGetNumberOfKeyAttributes, INVERT_COMPARE_RESULT, ItemPointerCompare(), sort-test::key, Min, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem, PageGetItemId, RelationGetDescr, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_binsrch_insert(), _bt_check_unique(), _bt_findinsertloc(), _bt_moveright(), _bt_search_insert(), bt_rootdescend(), invariant_g_offset(), invariant_l_nontarget_offset(), invariant_l_offset(), and invariant_leq_offset().
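
The result follows the usual strcmp-like convention: negative when the insertion scankey sorts before the tuple at offnum, zero when all compared attributes are equal, positive when it sorts after. A hedged sketch, assuming itup_key was built with _bt_mkscankey():

    int32       cmp = _bt_compare(rel, itup_key, page, offnum);

    if (cmp > 0)
    {
        /* scankey > tuple at offnum: descend or search to the right */
    }
    else if (cmp < 0)
    {
        /* scankey < tuple at offnum: stay to the left */
    }
    else
    {
        /* equal on every compared attribute (and scantid, when set) */
    }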

◆ _bt_conditionallockbuf()

bool _bt_conditionallockbuf ( Relation  rel,
Buffer  buf 
)

Definition at line 1105 of file nbtpage.c.

1106 {
1107  /* ConditionalLockBuffer() asserts that pin is held by this backend */
1108  if (!ConditionalLockBuffer(buf))
1109  return false;
1110 
1111  if (!RelationUsesLocalBuffers(rel))
1112  VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1113 
1114  return true;
1115 }
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:4182
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:622

References buf, BufferGetPage, ConditionalLockBuffer(), RelationUsesLocalBuffers, and VALGRIND_MAKE_MEM_DEFINED.

Referenced by _bt_getbuf(), and _bt_search_insert().
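
An opportunistic-locking sketch in the spirit of _bt_search_insert()'s rightmost-leaf fastpath; buf is assumed to be pinned but not yet locked:

    if (_bt_conditionallockbuf(rel, buf))
    {
        /* acquired the exclusive lock without blocking -- use the page */
    }
    else
    {
        /* lock is contended: drop the pin and fall back to an ordinary
         * descent from the root rather than waiting */
        ReleaseBuffer(buf);
    }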

◆ _bt_dedup_finish_pending()

Size _bt_dedup_finish_pending ( Page  newpage,
BTDedupState  state 
)

Definition at line 554 of file nbtdedup.c.

555 {
556  OffsetNumber tupoff;
557  Size tuplesz;
558  Size spacesaving;
559 
560  Assert(state->nitems > 0);
561  Assert(state->nitems <= state->nhtids);
562  Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
563 
564  tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
565  if (state->nitems == 1)
566  {
567  /* Use original, unchanged base tuple */
568  tuplesz = IndexTupleSize(state->base);
569  if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
570  false, false) == InvalidOffsetNumber)
571  elog(ERROR, "deduplication failed to add tuple to page");
572 
573  spacesaving = 0;
574  }
575  else
576  {
577  IndexTuple final;
578 
579  /* Form a tuple with a posting list */
580  final = _bt_form_posting(state->base, state->htids, state->nhtids);
581  tuplesz = IndexTupleSize(final);
582  Assert(tuplesz <= state->maxpostingsize);
583 
584  /* Save final number of items for posting list */
585  state->intervals[state->nintervals].nitems = state->nitems;
586 
587  Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
588  if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
589  false) == InvalidOffsetNumber)
590  elog(ERROR, "deduplication failed to add tuple to page");
591 
592  pfree(final);
593  spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
594  /* Increment nintervals, since we wrote a new posting list tuple */
595  state->nintervals++;
596  Assert(spacesaving > 0 && spacesaving < BLCKSZ);
597  }
598 
599  /* Reset state for next pending posting list */
600  state->nhtids = 0;
601  state->nitems = 0;
602  state->phystupsize = 0;
603 
604  return spacesaving;
605 }
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:415
Pointer Item
Definition: item.h:17
IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
Definition: nbtdedup.c:860

References _bt_form_posting(), Assert(), elog, ERROR, IndexTupleSize, InvalidOffsetNumber, MAXALIGN, OffsetNumberNext, PageAddItem, PageGetMaxOffsetNumber, and pfree().

Referenced by _bt_dedup_pass(), and btree_xlog_dedup().

◆ _bt_dedup_pass()

void _bt_dedup_pass ( Relation  rel,
Buffer  buf,
Relation  heapRel,
IndexTuple  newitem,
Size  newitemsz,
bool  bottomupdedup 
)

Definition at line 58 of file nbtdedup.c.

60 {
61  OffsetNumber offnum,
62  minoff,
63  maxoff;
64  Page page = BufferGetPage(buf);
65  BTPageOpaque opaque = BTPageGetOpaque(page);
66  Page newpage;
67  BTDedupState state;
68  Size pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
69  bool singlevalstrat = false;
70  int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
71 
72  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
73  newitemsz += sizeof(ItemIdData);
74 
75  /*
76  * Initialize deduplication state.
77  *
78  * It would be possible for maxpostingsize (limit on posting list tuple
79  * size) to be set to one third of the page. However, it seems like a
80  * good idea to limit the size of posting lists to one sixth of a page.
81  * That ought to leave us with a good split point when pages full of
82  * duplicates can be split several times.
83  */
84  state = (BTDedupState) palloc(sizeof(BTDedupStateData));
85  state->deduplicate = true;
86  state->nmaxitems = 0;
87  state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
88  /* Metadata about base tuple of current pending posting list */
89  state->base = NULL;
90  state->baseoff = InvalidOffsetNumber;
91  state->basetupsize = 0;
92  /* Metadata about current pending posting list TIDs */
93  state->htids = palloc(state->maxpostingsize);
94  state->nhtids = 0;
95  state->nitems = 0;
96  /* Size of all physical tuples to be replaced by pending posting list */
97  state->phystupsize = 0;
98  /* nintervals should be initialized to zero */
99  state->nintervals = 0;
100 
101  minoff = P_FIRSTDATAKEY(opaque);
102  maxoff = PageGetMaxOffsetNumber(page);
103 
104  /*
105  * Consider applying "single value" strategy, though only if the page
106  * seems likely to be split in the near future
107  */
108  if (!bottomupdedup)
109  singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
110 
111  /*
112  * Deduplicate items from page, and write them to newpage.
113  *
114  * Copy the original page's LSN into newpage copy. This will become the
115  * updated version of the page. We need this because XLogInsert will
116  * examine the LSN and possibly dump it in a page image.
117  */
118  newpage = PageGetTempPageCopySpecial(page);
119  PageSetLSN(newpage, PageGetLSN(page));
120 
121  /* Copy high key, if any */
122  if (!P_RIGHTMOST(opaque))
123  {
124  ItemId hitemid = PageGetItemId(page, P_HIKEY);
125  Size hitemsz = ItemIdGetLength(hitemid);
126  IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid);
127 
128  if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
129  false, false) == InvalidOffsetNumber)
130  elog(ERROR, "deduplication failed to add highkey");
131  }
132 
133  for (offnum = minoff;
134  offnum <= maxoff;
135  offnum = OffsetNumberNext(offnum))
136  {
137  ItemId itemid = PageGetItemId(page, offnum);
138  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
139 
140  Assert(!ItemIdIsDead(itemid));
141 
142  if (offnum == minoff)
143  {
144  /*
145  * No previous/base tuple for the data item -- use the data item
146  * as base tuple of pending posting list
147  */
148  _bt_dedup_start_pending(state, itup, offnum);
149  }
150  else if (state->deduplicate &&
151  _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
152  _bt_dedup_save_htid(state, itup))
153  {
154  /*
155  * Tuple is equal to base tuple of pending posting list. Heap
156  * TID(s) for itup have been saved in state.
157  */
158  }
159  else
160  {
161  /*
162  * Tuple is not equal to pending posting list tuple, or
163  * _bt_dedup_save_htid() opted to not merge current item into
164  * pending posting list for some other reason (e.g., adding more
165  * TIDs would have caused posting list to exceed current
166  * maxpostingsize).
167  *
168  * If state contains pending posting list with more than one item,
169  * form new posting tuple, and actually update the page. Else
170  * reset the state and move on without modifying the page.
171  */
172  pagesaving += _bt_dedup_finish_pending(newpage, state);
173 
174  if (singlevalstrat)
175  {
176  /*
177  * Single value strategy's extra steps.
178  *
179  * Lower maxpostingsize for sixth and final large posting list
180  * tuple at the point where 5 maxpostingsize-capped tuples
181  * have either been formed or observed.
182  *
183  * When a sixth maxpostingsize-capped item is formed/observed,
184  * stop merging together tuples altogether. The few tuples
185  * that remain at the end of the page won't be merged together
186  * at all (at least not until after a future page split takes
187  * place).
188  */
189  if (state->nmaxitems == 5)
190  _bt_singleval_fillfactor(page, state, newitemsz);
191  else if (state->nmaxitems == 6)
192  {
193  state->deduplicate = false;
194  singlevalstrat = false; /* won't be back here */
195  }
196  }
197 
198  /* itup starts new pending posting list */
199  _bt_dedup_start_pending(state, itup, offnum);
200  }
201  }
202 
203  /* Handle the last item */
204  pagesaving += _bt_dedup_finish_pending(newpage, state);
205 
206  /*
207  * If no items suitable for deduplication were found, newpage must be
208  * exactly the same as the original page, so just return from function.
209  *
210  * We could determine whether or not to proceed on the basis the space
211  * savings being sufficient to avoid an immediate page split instead. We
212  * don't do that because there is some small value in nbtsplitloc.c always
213  * operating against a page that is fully deduplicated (apart from
214  * newitem). Besides, most of the cost has already been paid.
215  */
216  if (state->nintervals == 0)
217  {
218  /* cannot leak memory here */
219  pfree(newpage);
220  pfree(state->htids);
221  pfree(state);
222  return;
223  }
224 
225  /*
226  * By here, it's clear that deduplication will definitely go ahead.
227  *
228  * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace
229  * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
230  * But keep things tidy.
231  */
232  if (P_HAS_GARBAGE(opaque))
233  {
234  BTPageOpaque nopaque = BTPageGetOpaque(newpage);
235 
236  nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
237  }
238 
238 
239  START_CRIT_SECTION();
240 
241  PageRestoreTempPage(newpage, page);
242  MarkBufferDirty(buf);
243 
244  /* XLOG stuff */
245  if (RelationNeedsWAL(rel))
246  {
247  XLogRecPtr recptr;
248  xl_btree_dedup xlrec_dedup;
249 
250  xlrec_dedup.nintervals = state->nintervals;
251 
252  XLogBeginInsert();
253  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
254  XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);
255 
256  /*
257  * The intervals array is not in the buffer, but pretend that it is.
258  * When XLogInsert stores the whole buffer, the array need not be
259  * stored too.
260  */
261  XLogRegisterBufData(0, (char *) state->intervals,
262  state->nintervals * sizeof(BTDedupInterval));
263 
264  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
265 
266  PageSetLSN(page, recptr);
267  }
268 
269  END_CRIT_SECTION();
270 
271  /* Local space accounting should agree with page accounting */
272  Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
273 
274  /* cannot leak memory here */
275  pfree(state->htids);
276  pfree(state);
277 }
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1573
void PageRestoreTempPage(Page tempPage, Page oldPage)
Definition: bufpage.c:424
Page PageGetTempPageCopySpecial(Page page)
Definition: bufpage.c:402
#define PageSetLSN(page, lsn)
Definition: bufpage.h:367
#define PageGetLSN(page)
Definition: bufpage.h:365
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:155
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
#define INDEX_SIZE_MASK
Definition: itup.h:65
#define START_CRIT_SECTION()
Definition: miscadmin.h:148
#define END_CRIT_SECTION()
Definition: miscadmin.h:150
static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state, OffsetNumber minoff, IndexTuple newitem)
Definition: nbtdedup.c:778
Size _bt_dedup_finish_pending(Page newpage, BTDedupState state)
Definition: nbtdedup.c:554
static void _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
Definition: nbtdedup.c:818
#define P_HAS_GARBAGE(opaque)
Definition: nbtree.h:227
#define BTP_HAS_GARBAGE
Definition: nbtree.h:82
#define XLOG_BTREE_DEDUP
Definition: nbtxlog.h:33
#define SizeOfBtreeDedup
Definition: nbtxlog.h:174
#define RelationNeedsWAL(relation)
Definition: rel.h:613
uint16 btpo_flags
Definition: nbtree.h:67
uint16 nintervals
Definition: nbtxlog.h:169
uint64 XLogRecPtr
Definition: xlogdefs.h:21
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:443
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:389
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:243
void XLogBeginInsert(void)
Definition: xloginsert.c:150
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:351
#define REGBUF_STANDARD
Definition: xloginsert.h:34

References _bt_dedup_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_do_singleval(), _bt_keep_natts_fast(), _bt_singleval_fillfactor(), Assert(), BTMaxItemSize, BTP_HAS_GARBAGE, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, buf, BufferGetPage, elog, END_CRIT_SECTION, ERROR, INDEX_SIZE_MASK, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, ItemIdGetLength, ItemIdIsDead, MarkBufferDirty(), Min, xl_btree_dedup::nintervals, OffsetNumberNext, P_FIRSTDATAKEY, P_HAS_GARBAGE, P_HIKEY, P_RIGHTMOST, PageAddItem, PageGetExactFreeSpace(), PageGetItem, PageGetItemId, PageGetLSN, PageGetMaxOffsetNumber, PageGetTempPageCopySpecial(), PageRestoreTempPage(), PageSetLSN, palloc(), pfree(), PG_USED_FOR_ASSERTS_ONLY, REGBUF_STANDARD, RelationNeedsWAL, SizeOfBtreeDedup, START_CRIT_SECTION, XLOG_BTREE_DEDUP, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_delete_or_dedup_one_page().
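
A hedged call sketch modeled on _bt_delete_or_dedup_one_page(): the caller already holds an exclusive lock on buf, and passes the would-be insert's MAXALIGN()'d size (line pointer overhead is added inside):

    /* try to make room for newitem by merging duplicates into posting lists */
    _bt_dedup_pass(rel, buf, heapRel, newitem,
                   MAXALIGN(IndexTupleSize(newitem)),
                   false);      /* not called for bottom-up deletion */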

◆ _bt_dedup_save_htid()

bool _bt_dedup_save_htid ( BTDedupState  state,
IndexTuple  itup 
)

Definition at line 483 of file nbtdedup.c.

484 {
485  int nhtids;
486  ItemPointer htids;
487  Size mergedtupsz;
488 
489  Assert(!BTreeTupleIsPivot(itup));
490 
491  if (!BTreeTupleIsPosting(itup))
492  {
493  nhtids = 1;
494  htids = &itup->t_tid;
495  }
496  else
497  {
498  nhtids = BTreeTupleGetNPosting(itup);
499  htids = BTreeTupleGetPosting(itup);
500  }
501 
502  /*
503  * Don't append (have caller finish pending posting list as-is) if
504  * appending heap TID(s) from itup would put us over maxpostingsize limit.
505  *
506  * This calculation needs to match the code used within _bt_form_posting()
507  * for new posting list tuples.
508  */
509  mergedtupsz = MAXALIGN(state->basetupsize +
510  (state->nhtids + nhtids) * sizeof(ItemPointerData));
511 
512  if (mergedtupsz > state->maxpostingsize)
513  {
514  /*
515  * Count this as an oversized item for single value strategy, though
516  * only when there are 50 TIDs in the final posting list tuple. This
517  * limit (which is fairly arbitrary) avoids confusion about how many
518  * 1/6 of a page tuples have been encountered/created by the current
519  * deduplication pass.
520  *
521  * Note: We deliberately don't consider which deduplication pass
522  * merged together tuples to create this item (could be a previous
523  * deduplication pass, or current pass). See _bt_do_singleval()
524  * comments.
525  */
526  if (state->nhtids > 50)
527  state->nmaxitems++;
528 
529  return false;
530  }
531 
532  /*
533  * Save heap TIDs to pending posting list tuple -- itup can be merged into
534  * pending posting list
535  */
536  state->nitems++;
537  memcpy(state->htids + state->nhtids, htids,
538  sizeof(ItemPointerData) * nhtids);
539  state->nhtids += nhtids;
540  state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
541 
542  return true;
543 }
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:513
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition: nbtree.h:532

References Assert(), BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), IndexTupleSize, MAXALIGN, and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_load(), and btree_xlog_dedup().

◆ _bt_dedup_start_pending()

void _bt_dedup_start_pending ( BTDedupState  state,
IndexTuple  base,
OffsetNumber  baseoff 
)

Definition at line 432 of file nbtdedup.c.

434 {
435  Assert(state->nhtids == 0);
436  Assert(state->nitems == 0);
437  Assert(!BTreeTupleIsPivot(base));
438 
439  /*
440  * Copy heap TID(s) from new base tuple for new candidate posting list
441  * into working state's array
442  */
443  if (!BTreeTupleIsPosting(base))
444  {
445  memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
446  state->nhtids = 1;
447  state->basetupsize = IndexTupleSize(base);
448  }
449  else
450  {
451  int nposting;
452 
453  nposting = BTreeTupleGetNPosting(base);
454  memcpy(state->htids, BTreeTupleGetPosting(base),
455  sizeof(ItemPointerData) * nposting);
456  state->nhtids = nposting;
457  /* basetupsize should not include existing posting list */
458  state->basetupsize = BTreeTupleGetPostingOffset(base);
459  }
460 
461  /*
462  * Save new base tuple itself -- it'll be needed if we actually create a
463  * new posting list from new pending posting list.
464  *
465  * Must maintain physical size of all existing tuples (including line
466  * pointer overhead) so that we can calculate space savings on page.
467  */
468  state->nitems = 1;
469  state->base = base;
470  state->baseoff = baseoff;
471  state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
472  /* Also save baseoff in pending state for interval */
473  state->intervals[state->nintervals].baseoff = state->baseoff;
474 }
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:524

References Assert(), BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), IndexTupleSize, MAXALIGN, and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_load(), and btree_xlog_dedup().
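
Taken together, _bt_dedup_start_pending(), _bt_dedup_save_htid(), and _bt_dedup_finish_pending() form a small state machine. A condensed sketch of the loop at the heart of _bt_dedup_pass(), assuming rel, page, newpage, nkeyatts, and an initialized BTDedupState "state" are in scope:

    OffsetNumber minoff = P_FIRSTDATAKEY(BTPageGetOpaque(page));
    OffsetNumber maxoff = PageGetMaxOffsetNumber(page);

    for (OffsetNumber off = minoff; off <= maxoff; off = OffsetNumberNext(off))
    {
        IndexTuple  itup = (IndexTuple) PageGetItem(page,
                                                    PageGetItemId(page, off));

        if (off == minoff)
            _bt_dedup_start_pending(state, itup, off);  /* first base tuple */
        else if (state->deduplicate &&
                 _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
                 _bt_dedup_save_htid(state, itup))
        {
            /* itup's TIDs were folded into the pending posting list */
        }
        else
        {
            /* flush the pending list to newpage, then start a new one */
            _bt_dedup_finish_pending(newpage, state);
            _bt_dedup_start_pending(state, itup, off);
        }
    }
    _bt_dedup_finish_pending(newpage, state);   /* flush the final list */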

◆ _bt_delitems_delete_check()

void _bt_delitems_delete_check ( Relation  rel,
Buffer  buf,
Relation  heapRel,
TM_IndexDeleteOp delstate 
)

Definition at line 1528 of file nbtpage.c.

1530 {
1531  Page page = BufferGetPage(buf);
1532  TransactionId latestRemovedXid;
1533  OffsetNumber postingidxoffnum = InvalidOffsetNumber;
1534  int ndeletable = 0,
1535  nupdatable = 0;
1536  OffsetNumber deletable[MaxIndexTuplesPerPage];
1537  BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1538 
1539  /* Use tableam interface to determine which tuples to delete first */
1540  latestRemovedXid = table_index_delete_tuples(heapRel, delstate);
1541 
1542  /* Should not WAL-log latestRemovedXid unless it's required */
1543  if (!XLogStandbyInfoActive() || !RelationNeedsWAL(rel))
1544  latestRemovedXid = InvalidTransactionId;
1545 
1546  /*
1547  * Construct a leaf-page-wise description of what _bt_delitems_delete()
1548  * needs to do to physically delete index tuples from the page.
1549  *
1550  * Must sort deltids array to restore leaf-page-wise order (original order
1551  * before call to tableam). This is the order that the loop expects.
1552  *
1553  * Note that deltids array might be a lot smaller now. It might even have
1554  * no entries at all (with bottom-up deletion caller), in which case there
1555  * is nothing left to do.
1556  */
1557  qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
1558  _bt_delitems_cmp);
1559  if (delstate->ndeltids == 0)
1560  {
1561  Assert(delstate->bottomup);
1562  return;
1563  }
1564 
1565  /* We definitely have to delete at least one index tuple (or one TID) */
1566  for (int i = 0; i < delstate->ndeltids; i++)
1567  {
1568  TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
1569  OffsetNumber idxoffnum = dstatus->idxoffnum;
1570  ItemId itemid = PageGetItemId(page, idxoffnum);
1571  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
1572  int nestedi,
1573  nitem;
1574  BTVacuumPosting vacposting;
1575 
1576  Assert(OffsetNumberIsValid(idxoffnum));
1577 
1578  if (idxoffnum == postingidxoffnum)
1579  {
1580  /*
1581  * This deltid entry is a TID from a posting list tuple that has
1582  * already been completely processed
1583  */
1584  Assert(BTreeTupleIsPosting(itup));
1585  Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
1586  &delstate->deltids[i].tid) < 0);
1587  Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),
1588  &delstate->deltids[i].tid) >= 0);
1589  continue;
1590  }
1591 
1592  if (!BTreeTupleIsPosting(itup))
1593  {
1594  /* Plain non-pivot tuple */
1595  Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
1596  if (dstatus->knowndeletable)
1597  deletable[ndeletable++] = idxoffnum;
1598  continue;
1599  }
1600 
1601  /*
1602  * itup is a posting list tuple whose lowest deltids entry (which may
1603  * or may not be for the first TID from itup) is considered here now.
1604  * We should process all of the deltids entries for the posting list
1605  * together now, though (not just the lowest). Remember to skip over
1606  * later itup-related entries during later iterations of outermost
1607  * loop.
1608  */
1609  postingidxoffnum = idxoffnum; /* Remember work in outermost loop */
1610  nestedi = i; /* Initialize for first itup deltids entry */
1611  vacposting = NULL; /* Describes final action for itup */
1612  nitem = BTreeTupleGetNPosting(itup);
1613  for (int p = 0; p < nitem; p++)
1614  {
1615  ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
1616  int ptidcmp = -1;
1617 
1618  /*
1619  * This nested loop reuses work across ptid TIDs taken from itup.
1620  * We take advantage of the fact that both itup's TIDs and deltids
1621  * entries (within a single itup/posting list grouping) must both
1622  * be in ascending TID order.
1623  */
1624  for (; nestedi < delstate->ndeltids; nestedi++)
1625  {
1626  TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
1627  TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
1628 
1629  /* Stop once we get past all itup related deltids entries */
1630  Assert(tdstatus->idxoffnum >= idxoffnum);
1631  if (tdstatus->idxoffnum != idxoffnum)
1632  break;
1633 
1634  /* Skip past non-deletable itup related entries up front */
1635  if (!tdstatus->knowndeletable)
1636  continue;
1637 
1638  /* Entry is first partial ptid match (or an exact match)? */
1639  ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
1640  if (ptidcmp >= 0)
1641  {
1642  /* Greater than or equal (partial or exact) match... */
1643  break;
1644  }
1645  }
1646 
1647  /* ...exact ptid match to a deletable deltids entry? */
1648  if (ptidcmp != 0)
1649  continue;
1650 
1651  /* Exact match for deletable deltids entry -- ptid gets deleted */
1652  if (vacposting == NULL)
1653  {
1654  vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1655  nitem * sizeof(uint16));
1656  vacposting->itup = itup;
1657  vacposting->updatedoffset = idxoffnum;
1658  vacposting->ndeletedtids = 0;
1659  }
1660  vacposting->deletetids[vacposting->ndeletedtids++] = p;
1661  }
1662 
1663  /* Final decision on itup, a posting list tuple */
1664 
1665  if (vacposting == NULL)
1666  {
1667  /* No TIDs to delete from itup -- do nothing */
1668  }
1669  else if (vacposting->ndeletedtids == nitem)
1670  {
1671  /* Straight delete of itup (to delete all TIDs) */
1672  deletable[ndeletable++] = idxoffnum;
1673  /* Turns out we won't need granular information */
1674  pfree(vacposting);
1675  }
1676  else
1677  {
1678  /* Delete some (but not all) TIDs from itup */
1679  Assert(vacposting->ndeletedtids > 0 &&
1680  vacposting->ndeletedtids < nitem);
1681  updatable[nupdatable++] = vacposting;
1682  }
1683  }
1684 
1685  /* Physically delete tuples (or TIDs) using deletable (or updatable) */
1686  _bt_delitems_delete(rel, buf, latestRemovedXid, deletable, ndeletable,
1687  updatable, nupdatable);
1688 
1689  /* be tidy */
1690  for (int i = 0; i < nupdatable; i++)
1691  pfree(updatable[i]);
1692 }
unsigned short uint16
Definition: c.h:440
#define offsetof(type, field)
Definition: c.h:727
uint32 TransactionId
Definition: c.h:587
bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2)
Definition: itemptr.c:29
#define MaxIndexTuplesPerPage
Definition: itup.h:144
static void _bt_delitems_delete(Relation rel, Buffer buf, TransactionId latestRemovedXid, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
Definition: nbtpage.c:1296
static int _bt_delitems_cmp(const void *a, const void *b)
Definition: nbtpage.c:1474
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:539
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
#define qsort(a, b, c, d)
Definition: port.h:495
uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.h:911
uint16 ndeletedtids
Definition: nbtree.h:910
IndexTuple itup
Definition: nbtree.h:906
OffsetNumber updatedoffset
Definition: nbtree.h:907
ItemPointerData tid
Definition: tableam.h:189
bool knowndeletable
Definition: tableam.h:196
OffsetNumber idxoffnum
Definition: tableam.h:195
static TransactionId table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
Definition: tableam.h:1327
#define InvalidTransactionId
Definition: transam.h:31
#define XLogStandbyInfoActive()
Definition: xlog.h:118

References _bt_delitems_cmp(), _bt_delitems_delete(), Assert(), TM_IndexDeleteOp::bottomup, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), buf, BufferGetPage, BTVacuumPostingData::deletetids, TM_IndexDeleteOp::deltids, i, TM_IndexDelete::id, TM_IndexStatus::idxoffnum, InvalidOffsetNumber, InvalidTransactionId, ItemPointerCompare(), ItemPointerEquals(), BTVacuumPostingData::itup, TM_IndexStatus::knowndeletable, MaxIndexTuplesPerPage, BTVacuumPostingData::ndeletedtids, TM_IndexDeleteOp::ndeltids, OffsetNumberIsValid, offsetof, PageGetItem, PageGetItemId, palloc(), pfree(), qsort, RelationNeedsWAL, TM_IndexDeleteOp::status, IndexTupleData::t_tid, table_index_delete_tuples(), TM_IndexDelete::tid, BTVacuumPostingData::updatedoffset, and XLogStandbyInfoActive.

Referenced by _bt_bottomupdel_pass(), and _bt_simpledel_pass().
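
A hedged sketch of how a caller assembles delstate, loosely following _bt_simpledel_pass(); the loop that fills deltids/status from the page's known-dead TIDs is elided:

    TM_IndexDeleteOp delstate;

    delstate.bottomup = false;
    delstate.bottomupfreespace = 0;
    delstate.ndeltids = 0;
    delstate.deltids = (TM_IndexDelete *)
        palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
    delstate.status = (TM_IndexStatus *)
        palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));

    /* ... append one deltids/status pair per known-dead TID on the page ... */

    _bt_delitems_delete_check(rel, buf, heapRel, &delstate);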

◆ _bt_delitems_vacuum()

void _bt_delitems_vacuum ( Relation  rel,
Buffer  buf,
OffsetNumber deletable,
int  ndeletable,
BTVacuumPosting updatable,
int  nupdatable 
)

Definition at line 1166 of file nbtpage.c.

1169 {
1170  Page page = BufferGetPage(buf);
1171  BTPageOpaque opaque;
1172  bool needswal = RelationNeedsWAL(rel);
1173  char *updatedbuf = NULL;
1174  Size updatedbuflen = 0;
1175  OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1176 
1177  /* Shouldn't be called unless there's something to do */
1178  Assert(ndeletable > 0 || nupdatable > 0);
1179 
1180  /* Generate new version of posting lists without deleted TIDs */
1181  if (nupdatable > 0)
1182  updatedbuf = _bt_delitems_update(updatable, nupdatable,
1183  updatedoffsets, &updatedbuflen,
1184  needswal);
1185 
1186  /* No ereport(ERROR) until changes are logged */
1187  START_CRIT_SECTION();
1188 
1189  /*
1190  * Handle posting tuple updates.
1191  *
1192  * Deliberately do this before handling simple deletes. If we did it the
1193  * other way around (i.e. WAL record order -- simple deletes before
1194  * updates) then we'd have to make compensating changes to the 'updatable'
1195  * array of offset numbers.
1196  *
1197  * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
1198  * happens to already be set. It's important that we not interfere with
1199  * any future simple index tuple deletion operations.
1200  */
1201  for (int i = 0; i < nupdatable; i++)
1202  {
1203  OffsetNumber updatedoffset = updatedoffsets[i];
1204  IndexTuple itup;
1205  Size itemsz;
1206 
1207  itup = updatable[i]->itup;
1208  itemsz = MAXALIGN(IndexTupleSize(itup));
1209  if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
1210  itemsz))
1211  elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1212  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1213  }
1214 
1215  /* Now handle simple deletes of entire tuples */
1216  if (ndeletable > 0)
1217  PageIndexMultiDelete(page, deletable, ndeletable);
1218 
1219  /*
1220  * We can clear the vacuum cycle ID since this page has certainly been
1221  * processed by the current vacuum scan.
1222  */
1223  opaque = BTPageGetOpaque(page);
1224  opaque->btpo_cycleid = 0;
1225 
1226  /*
1227  * Clear the BTP_HAS_GARBAGE page flag.
1228  *
1229  * This flag indicates the presence of LP_DEAD items on the page (though
1230  * not reliably). Note that we only rely on it with pg_upgrade'd
1231  * !heapkeyspace indexes. That's why clearing it here won't usually
1232  * interfere with simple index tuple deletion.
1233  */
1234  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1235 
1236  MarkBufferDirty(buf);
1237 
1238  /* XLOG stuff */
1239  if (needswal)
1240  {
1241  XLogRecPtr recptr;
1242  xl_btree_vacuum xlrec_vacuum;
1243 
1244  xlrec_vacuum.ndeleted = ndeletable;
1245  xlrec_vacuum.nupdated = nupdatable;
1246 
1247  XLogBeginInsert();
1248  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1249  XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
1250 
1251  if (ndeletable > 0)
1252  XLogRegisterBufData(0, (char *) deletable,
1253  ndeletable * sizeof(OffsetNumber));
1254 
1255  if (nupdatable > 0)
1256  {
1257  XLogRegisterBufData(0, (char *) updatedoffsets,
1258  nupdatable * sizeof(OffsetNumber));
1259  XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1260  }
1261 
1262  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
1263 
1264  PageSetLSN(page, recptr);
1265  }
1266 
1267  END_CRIT_SECTION();
1268 
1269  /* can't leak memory here */
1270  if (updatedbuf != NULL)
1271  pfree(updatedbuf);
1272  /* free tuples allocated within _bt_delitems_update() */
1273  for (int i = 0; i < nupdatable; i++)
1274  pfree(updatable[i]->itup);
1275 }
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:1161
bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, Item newtup, Size newsize)
Definition: bufpage.c:1405
#define PANIC
Definition: elog.h:36
static char * _bt_delitems_update(BTVacuumPosting *updatable, int nupdatable, OffsetNumber *updatedoffsets, Size *updatedbuflen, bool needswal)
Definition: nbtpage.c:1415
#define SizeOfBtreeVacuum
Definition: nbtxlog.h:228
#define XLOG_BTREE_VACUUM
Definition: nbtxlog.h:39
BTCycleId btpo_cycleid
Definition: nbtree.h:68
uint16 ndeleted
Definition: nbtxlog.h:220
uint16 nupdated
Definition: nbtxlog.h:221

References _bt_delitems_update(), Assert(), BTP_HAS_GARBAGE, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, buf, BufferGetBlockNumber(), BufferGetPage, elog, END_CRIT_SECTION, i, IndexTupleSize, BTVacuumPostingData::itup, MarkBufferDirty(), MAXALIGN, MaxIndexTuplesPerPage, xl_btree_vacuum::ndeleted, xl_btree_vacuum::nupdated, PageIndexMultiDelete(), PageIndexTupleOverwrite(), PageSetLSN, PANIC, pfree(), REGBUF_STANDARD, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeVacuum, START_CRIT_SECTION, XLOG_BTREE_VACUUM, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by btvacuumpage().
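
Call sketch in the style of btvacuumpage(), which accumulates deletable offsets and updatable posting-list descriptors for the whole page before making a single call (the caller holds a cleanup lock on buf):

    if (ndeletable > 0 || nupdatable > 0)
        _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
                            updatable, nupdatable);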

◆ _bt_doinsert()

bool _bt_doinsert ( Relation  rel,
IndexTuple  itup,
IndexUniqueCheck  checkUnique,
bool  indexUnchanged,
Relation  heapRel 
)

Definition at line 100 of file nbtinsert.c.

103 {
104  bool is_unique = false;
105  BTInsertStateData insertstate;
106  BTScanInsert itup_key;
107  BTStack stack;
108  bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
109 
110  /* we need an insertion scan key to do our search, so build one */
111  itup_key = _bt_mkscankey(rel, itup);
112 
113  if (checkingunique)
114  {
115  if (!itup_key->anynullkeys)
116  {
117  /* No (heapkeyspace) scantid until uniqueness established */
118  itup_key->scantid = NULL;
119  }
120  else
121  {
122  /*
123  * Scan key for new tuple contains NULL key values. Bypass
124  * checkingunique steps. They are unnecessary because core code
125  * considers NULL unequal to every value, including NULL.
126  *
127  * This optimization avoids O(N^2) behavior within the
128  * _bt_findinsertloc() heapkeyspace path when a unique index has a
129  * large number of "duplicates" with NULL key values.
130  */
131  checkingunique = false;
132  /* Tuple is unique in the sense that core code cares about */
133  Assert(checkUnique != UNIQUE_CHECK_EXISTING);
134  is_unique = true;
135  }
136  }
137 
138  /*
139  * Fill in the BTInsertState working area, to track the current page and
140  * position within the page to insert on.
141  *
142  * Note that itemsz is passed down to lower level code that deals with
143  * inserting the item. It must be MAXALIGN()'d. This ensures that space
144  * accounting code consistently considers the alignment overhead that we
145  * expect PageAddItem() will add later. (Actually, index_form_tuple() is
146  * already conservative about alignment, but we don't rely on that from
147  * this distance. Besides, preserving the "true" tuple size in index
148  * tuple headers for the benefit of nbtsplitloc.c might happen someday.
149  * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
150  */
151  insertstate.itup = itup;
152  insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
153  insertstate.itup_key = itup_key;
154  insertstate.bounds_valid = false;
155  insertstate.buf = InvalidBuffer;
156  insertstate.postingoff = 0;
157 
158 search:
159 
160  /*
161  * Find and lock the leaf page that the tuple should be added to by
162  * searching from the root page. insertstate.buf will hold a buffer that
163  * is locked in exclusive mode afterwards.
164  */
165  stack = _bt_search_insert(rel, &insertstate);
166 
167  /*
168  * checkingunique inserts are not allowed to go ahead when two tuples with
169  * equal key attribute values would be visible to new MVCC snapshots once
170  * the xact commits. Check for conflicts in the locked page/buffer (if
171  * needed) here.
172  *
173  * It might be necessary to check a page to the right in _bt_check_unique,
174  * though that should be very rare. In practice the first page the value
175  * could be on (with scantid omitted) is almost always also the only page
176  * that a matching tuple might be found on. This is due to the behavior
177  * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
178  * only be allowed to cross a page boundary when there is no candidate
179  * leaf page split point that avoids it. Also, _bt_check_unique can use
180  * the leaf page high key to determine that there will be no duplicates on
181  * the right sibling without actually visiting it (it uses the high key in
182  * cases where the new item happens to belong at the far right of the leaf
183  * page).
184  *
185  * NOTE: obviously, _bt_check_unique can only detect keys that are already
186  * in the index; so it cannot defend against concurrent insertions of the
187  * same key. We protect against that by means of holding a write lock on
188  * the first page the value could be on, with omitted/-inf value for the
189  * implicit heap TID tiebreaker attribute. Any other would-be inserter of
190  * the same key must acquire a write lock on the same page, so only one
191  * would-be inserter can be making the check at one time. Furthermore,
192  * once we are past the check we hold write locks continuously until we
193  * have performed our insertion, so no later inserter can fail to see our
194  * insertion. (This requires some care in _bt_findinsertloc.)
195  *
196  * If we must wait for another xact, we release the lock while waiting,
197  * and then must perform a new search.
198  *
199  * For a partial uniqueness check, we don't wait for the other xact. Just
200  * let the tuple in and return false for possibly non-unique, or true for
201  * definitely unique.
202  */
203  if (checkingunique)
204  {
205  TransactionId xwait;
206  uint32 speculativeToken;
207 
208  xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
209  &is_unique, &speculativeToken);
210 
211  if (unlikely(TransactionIdIsValid(xwait)))
212  {
213  /* Have to wait for the other guy ... */
214  _bt_relbuf(rel, insertstate.buf);
215  insertstate.buf = InvalidBuffer;
216 
217  /*
218  * If it's a speculative insertion, wait for it to finish (ie. to
219  * go ahead with the insertion, or kill the tuple). Otherwise
220  * wait for the transaction to finish as usual.
221  */
222  if (speculativeToken)
223  SpeculativeInsertionWait(xwait, speculativeToken);
224  else
225  XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
226 
227  /* start over... */
228  if (stack)
229  _bt_freestack(stack);
230  goto search;
231  }
232 
233  /* Uniqueness is established -- restore heap tid as scantid */
234  if (itup_key->heapkeyspace)
235  itup_key->scantid = &itup->t_tid;
236  }
237 
238  if (checkUnique != UNIQUE_CHECK_EXISTING)
239  {
240  OffsetNumber newitemoff;
241 
242  /*
243  * The only conflict predicate locking cares about for indexes is when
244  * an index tuple insert conflicts with an existing lock. We don't
245  * know the actual page we're going to insert on for sure just yet in
246  * checkingunique and !heapkeyspace cases, but it's okay to use the
247  * first page the value could be on (with scantid omitted) instead.
248  */
249  CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
250 
251  /*
252  * Do the insertion. Note that insertstate contains cached binary
253  * search bounds established within _bt_check_unique when insertion is
254  * checkingunique.
255  */
256  newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
257  indexUnchanged, stack, heapRel);
258  _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
259  itup, insertstate.itemsz, newitemoff,
260  insertstate.postingoff, false);
261  }
262  else
263  {
264  /* just release the buffer */
265  _bt_relbuf(rel, insertstate.buf);
266  }
267 
268  /* be tidy */
269  if (stack)
270  _bt_freestack(stack);
271  pfree(itup_key);
272 
273  return is_unique;
274 }
unsigned int uint32
Definition: c.h:441
@ UNIQUE_CHECK_NO
Definition: genam.h:116
@ UNIQUE_CHECK_EXISTING
Definition: genam.h:119
void XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, XLTW_Oper oper)
Definition: lmgr.c:668
void SpeculativeInsertionWait(TransactionId xid, uint32 token)
Definition: lmgr.c:825
@ XLTW_InsertIndex
Definition: lmgr.h:31
static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, bool indexUnchanged, BTStack stack, Relation heapRel)
Definition: nbtinsert.c:813
static BTStack _bt_search_insert(Relation rel, BTInsertState insertstate)
Definition: nbtinsert.c:315
static void _bt_insertonpg(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, Size itemsz, OffsetNumber newitemoff, int postingoff, bool split_only_page)
Definition: nbtinsert.c:1101
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition: nbtinsert.c:406
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1035
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:182
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:90
void CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno)
Definition: predicate.c:4448
IndexTuple itup
Definition: nbtree.h:811
ItemPointer scantid
Definition: nbtree.h:791
bool heapkeyspace
Definition: nbtree.h:786
bool anynullkeys
Definition: nbtree.h:788
#define TransactionIdIsValid(xid)
Definition: transam.h:41

References _bt_check_unique(), _bt_findinsertloc(), _bt_freestack(), _bt_insertonpg(), _bt_mkscankey(), _bt_relbuf(), _bt_search_insert(), BTScanInsertData::anynullkeys, Assert(), BTInsertStateData::bounds_valid, BTInsertStateData::buf, BufferGetBlockNumber(), CheckForSerializableConflictIn(), BTScanInsertData::heapkeyspace, IndexTupleSize, InvalidBuffer, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, MAXALIGN, pfree(), BTInsertStateData::postingoff, BTScanInsertData::scantid, SpeculativeInsertionWait(), IndexTupleData::t_tid, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_NO, unlikely, XactLockTableWait(), and XLTW_InsertIndex.

Referenced by btinsert().
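
btinsert() is essentially a thin wrapper around this function; a condensed sketch (values/isnull describe the new row's indexed columns, ht_ctid its heap TID):

    IndexTuple  itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
    bool        result;

    itup->t_tid = *ht_ctid;     /* point the index entry at the heap row */
    result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
    pfree(itup);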

◆ _bt_end_vacuum()

void _bt_end_vacuum ( Relation  rel)

Definition at line 2033 of file nbtutils.c.

2034 {
2035  int i;
2036 
2037  LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
2038 
2039  /* Find the array entry */
2040  for (i = 0; i < btvacinfo->num_vacuums; i++)
2041  {
2042  BTOneVacInfo *vac = &btvacinfo->vacuums[i];
2043 
2044  if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
2045  vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
2046  {
2047  /* Remove it by shifting down the last entry */
2048  *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
2049  btvacinfo->num_vacuums--;
2050  break;
2051  }
2052  }
2053 
2054  LWLockRelease(BtreeVacuumLock);
2055 }
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1196
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1800
@ LW_EXCLUSIVE
Definition: lwlock.h:104
static BTVacInfo * btvacinfo
Definition: nbtutils.c:1929
LockRelId relid
Definition: nbtutils.c:1917
int num_vacuums
Definition: nbtutils.c:1924
BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtutils.c:1926
LockRelId lockRelId
Definition: rel.h:45
Oid relId
Definition: rel.h:39
Oid dbId
Definition: rel.h:40
LockInfoData rd_lockInfo
Definition: rel.h:112

References btvacinfo, LockRelId::dbId, i, LockInfoData::lockRelId, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTVacInfo::num_vacuums, RelationData::rd_lockInfo, BTOneVacInfo::relid, LockRelId::relId, and BTVacInfo::vacuums.

Referenced by _bt_end_vacuum_callback(), and btbulkdelete().

◆ _bt_end_vacuum_callback()

void _bt_end_vacuum_callback ( int  code,
Datum  arg 
)

Definition at line 2061 of file nbtutils.c.

2062 {
2063  _bt_end_vacuum((Relation) DatumGetPointer(arg));
2064 }
void _bt_end_vacuum(Relation rel)
Definition: nbtutils.c:2033
void * arg
#define DatumGetPointer(X)
Definition: postgres.h:593

References _bt_end_vacuum(), arg, and DatumGetPointer.

Referenced by btbulkdelete().
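
btbulkdelete() pairs these with _bt_start_vacuum() so that the shared-memory entry is removed even if the scan errors out; condensed from that function:

    PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    {
        cycleid = _bt_start_vacuum(rel);

        btvacuumscan(info, stats, callback, callback_state, cycleid);
    }
    PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    _bt_end_vacuum(rel);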

◆ _bt_findsplitloc()

OffsetNumber _bt_findsplitloc ( Relation  rel,
Page  origpage,
OffsetNumber  newitemoff,
Size  newitemsz,
IndexTuple  newitem,
bool newitemonleft 
)

Definition at line 129 of file nbtsplitloc.c.

135 {
136  BTPageOpaque opaque;
137  int leftspace,
138  rightspace,
139  olddataitemstotal,
140  olddataitemstoleft,
141  perfectpenalty,
142  leaffillfactor;
143  FindSplitData state;
144  FindSplitStrat strategy;
145  ItemId itemid;
146  OffsetNumber offnum,
147  maxoff,
148  firstrightoff;
149  double fillfactormult;
150  bool usemult;
151  SplitPoint leftpage,
152  rightpage;
153 
154  opaque = BTPageGetOpaque(origpage);
155  maxoff = PageGetMaxOffsetNumber(origpage);
156 
157  /* Total free space available on a btree page, after fixed overhead */
158  leftspace = rightspace =
159  PageGetPageSize(origpage) - SizeOfPageHeaderData -
160  MAXALIGN(sizeof(BTPageOpaqueData));
161 
162  /* The right page will have the same high key as the old page */
163  if (!P_RIGHTMOST(opaque))
164  {
165  itemid = PageGetItemId(origpage, P_HIKEY);
166  rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
167  sizeof(ItemIdData));
168  }
169 
170  /* Count up total space in data items before actually scanning 'em */
171  olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(origpage);
172  leaffillfactor = BTGetFillFactor(rel);
173 
174  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
175  newitemsz += sizeof(ItemIdData);
176  state.rel = rel;
177  state.origpage = origpage;
178  state.newitem = newitem;
179  state.newitemsz = newitemsz;
180  state.is_leaf = P_ISLEAF(opaque);
181  state.is_rightmost = P_RIGHTMOST(opaque);
182  state.leftspace = leftspace;
183  state.rightspace = rightspace;
184  state.olddataitemstotal = olddataitemstotal;
185  state.minfirstrightsz = SIZE_MAX;
186  state.newitemoff = newitemoff;
187 
188  /* newitem cannot be a posting list item */
189  Assert(!BTreeTupleIsPosting(newitem));
190 
191  /*
192  * nsplits should never exceed maxoff because there will be at most as
193  * many candidate split points as there are points _between_ tuples, once
194  * you imagine that the new item is already on the original page (the
195  * final number of splits may be slightly lower because not all points
196  * between tuples will be legal).
197  */
198  state.maxsplits = maxoff;
199  state.splits = palloc(sizeof(SplitPoint) * state.maxsplits);
200  state.nsplits = 0;
201 
202  /*
203  * Scan through the data items and calculate space usage for a split at
204  * each possible position
205  */
206  olddataitemstoleft = 0;
207 
208  for (offnum = P_FIRSTDATAKEY(opaque);
209  offnum <= maxoff;
210  offnum = OffsetNumberNext(offnum))
211  {
212  Size itemsz;
213 
214  itemid = PageGetItemId(origpage, offnum);
215  itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
216 
217  /*
218  * When item offset number is not newitemoff, neither side of the
219  * split can be newitem. Record a split after the previous data item
220  * from original page, but before the current data item from original
221  * page. (_bt_recsplitloc() will reject the split when there are no
222  * previous items, which we rely on.)
223  */
224  if (offnum < newitemoff)
225  _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
226  else if (offnum > newitemoff)
227  _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
228  else
229  {
230  /*
231  * Record a split after all "offnum < newitemoff" original page
232  * data items, but before newitem
233  */
234  _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
235 
236  /*
237  * Record a split after newitem, but before data item from
238  * original page at offset newitemoff/current offset
239  */
240  _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
241  }
242 
243  olddataitemstoleft += itemsz;
244  }
245 
246  /*
247  * Record a split after all original page data items, but before newitem.
248  * (Though only when it's possible that newitem will end up alone on new
249  * right page.)
250  */
251  Assert(olddataitemstoleft == olddataitemstotal);
252  if (newitemoff > maxoff)
253  _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0);
254 
255  /*
256  * I believe it is not possible to fail to find a feasible split, but just
257  * in case ...
258  */
259  if (state.nsplits == 0)
260  elog(ERROR, "could not find a feasible split point for index \"%s\"",
261  RelationGetRelationName(rel));
262 
263  /*
264  * Start search for a split point among list of legal split points. Give
265  * primary consideration to equalizing available free space in each half
266  * of the split initially (start with default strategy), while applying
267  * rightmost and split-after-new-item optimizations where appropriate.
268  * Either of the two other fallback strategies may be required for cases
269  * with a large number of duplicates around the original/space-optimal
270  * split point.
271  *
272  * Default strategy gives some weight to suffix truncation in deciding a
273  * split point on leaf pages. It attempts to select a split point where a
274  * distinguishing attribute appears earlier in the new high key for the
275  * left side of the split, in order to maximize the number of trailing
276  * attributes that can be truncated away. Only candidate split points
277  * that imply an acceptable balance of free space on each side are
278  * considered. See _bt_defaultinterval().
279  */
280  if (!state.is_leaf)
281  {
282  /* fillfactormult only used on rightmost page */
283  usemult = state.is_rightmost;
284  fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0;
285  }
286  else if (state.is_rightmost)
287  {
288  /* Rightmost leaf page -- fillfactormult always used */
289  usemult = true;
290  fillfactormult = leaffillfactor / 100.0;
291  }
292  else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
293  {
294  /*
295  * New item inserted at rightmost point among a localized grouping on
296  * a leaf page -- apply "split after new item" optimization, either by
297  * applying leaf fillfactor multiplier, or by choosing the exact split
298  * point that leaves newitem as lastleft. (usemult is set for us.)
299  */
300  if (usemult)
301  {
302  /* fillfactormult should be set based on leaf fillfactor */
303  fillfactormult = leaffillfactor / 100.0;
304  }
305  else
306  {
307  /* find precise split point after newitemoff */
308  for (int i = 0; i < state.nsplits; i++)
309  {
310  SplitPoint *split = state.splits + i;
311 
312  if (split->newitemonleft &&
313  newitemoff == split->firstrightoff)
314  {
315  pfree(state.splits);
316  *newitemonleft = true;
317  return newitemoff;
318  }
319  }
320 
321  /*
322  * Cannot legally split after newitemoff; proceed with split
323  * without using fillfactor multiplier. This is defensive, and
324  * should never be needed in practice.
325  */
326  fillfactormult = 0.50;
327  }
328  }
329  else
330  {
331  /* Other leaf page. 50:50 page split. */
332  usemult = false;
333  /* fillfactormult not used, but be tidy */
334  fillfactormult = 0.50;
335  }
336 
337  /*
338  * Save leftmost and rightmost splits for page before original ordinal
339  * sort order is lost by delta/fillfactormult sort
340  */
341  leftpage = state.splits[0];
342  rightpage = state.splits[state.nsplits - 1];
343 
344  /* Give split points a fillfactormult-wise delta, and sort on deltas */
345  _bt_deltasortsplits(&state, fillfactormult, usemult);
346 
347  /* Determine split interval for default strategy */
348  state.interval = _bt_defaultinterval(&state);
349 
350  /*
351  * Determine if default strategy/split interval will produce a
352  * sufficiently distinguishing split, or if we should change strategies.
353  * Alternative strategies change the range of split points that are
354  * considered acceptable (split interval), and possibly change
355  * fillfactormult, in order to deal with pages with a large number of
356  * duplicates gracefully.
357  *
358  * Pass low and high splits for the entire page (actually, they're for an
359  * imaginary version of the page that includes newitem). These are used
360  * when the initial split interval encloses split points that are full of
361  * duplicates, and we need to consider if it's even possible to avoid
362  * appending a heap TID.
363  */
364  perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);
365 
366  if (strategy == SPLIT_DEFAULT)
367  {
368  /*
369  * Default strategy worked out (always works out with internal page).
370  * Original split interval still stands.
371  */
372  }
373 
374  /*
375  * Many duplicates strategy is used when a heap TID would otherwise be
376  * appended, but the page isn't completely full of logical duplicates.
377  *
378  * The split interval is widened to include all legal candidate split
379  * points. There might be as few as two distinct values in the whole-page
380  * split interval, though it's also possible that most of the values on
381  * the page are unique. The final split point will either be to the
382  * immediate left or to the immediate right of the group of duplicate
383  * tuples that enclose the first/delta-optimal split point (perfect
384  * penalty was set so that the lowest delta split point that avoids
385  * appending a heap TID will be chosen). Maximizing the number of
386  * attributes that can be truncated away is not a goal of the many
387  * duplicates strategy.
388  *
389  * Single value strategy is used when it is impossible to avoid appending
390  * a heap TID. It arranges to leave the left page very full. This
391  * maximizes space utilization in cases where tuples with the same
392  * attribute values span many pages. Newly inserted duplicates will tend
393  * to have higher heap TID values, so we'll end up splitting to the right
394  * consistently. (Single value strategy is harmless though not
395  * particularly useful with !heapkeyspace indexes.)
396  */
397  else if (strategy == SPLIT_MANY_DUPLICATES)
398  {
399  Assert(state.is_leaf);
400  /* Shouldn't try to truncate away extra user attributes */
401  Assert(perfectpenalty ==
402  IndexRelationGetNumberOfKeyAttributes(state.rel));
403  /* No need to resort splits -- no change in fillfactormult/deltas */
404  state.interval = state.nsplits;
405  }
406  else if (strategy == SPLIT_SINGLE_VALUE)
407  {
408  Assert(state.is_leaf);
409  /* Split near the end of the page */
410  usemult = true;
411  fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
412  /* Resort split points with new delta */
413  _bt_deltasortsplits(&state, fillfactormult, usemult);
414  /* Appending a heap TID is unavoidable, so interval of 1 is fine */
415  state.interval = 1;
416  }
417 
418  /*
419  * Search among acceptable split points (using final split interval) for
420  * the entry that has the lowest penalty, and is therefore expected to
421  * maximize fan-out. Sets *newitemonleft for us.
422  */
423  firstrightoff = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
424  strategy);
425  pfree(state.splits);
426 
427  return firstrightoff;
428 }
#define BTREE_SINGLEVAL_FILLFACTOR
Definition: nbtree.h:203
#define BTGetFillFactor(relation)
Definition: nbtree.h:1095
#define BTREE_NONLEAF_FILLFACTOR
Definition: nbtree.h:202
static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult, bool usemult)
Definition: nbtsplitloc.c:566
static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage, SplitPoint *rightpage, FindSplitStrat *strategy)
Definition: nbtsplitloc.c:939
static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, int leaffillfactor, bool *usemult)
Definition: nbtsplitloc.c:635
static void _bt_recsplitloc(FindSplitData *state, OffsetNumber firstrightoff, bool newitemonleft, int olddataitemstoleft, Size firstrightofforigpagetuplesz)
Definition: nbtsplitloc.c:449
FindSplitStrat
Definition: nbtsplitloc.c:21
@ SPLIT_DEFAULT
Definition: nbtsplitloc.c:23
@ SPLIT_MANY_DUPLICATES
Definition: nbtsplitloc.c:24
@ SPLIT_SINGLE_VALUE
Definition: nbtsplitloc.c:25
static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft, FindSplitStrat strategy)
Definition: nbtsplitloc.c:793
static int _bt_defaultinterval(FindSplitData *state)
Definition: nbtsplitloc.c:881
bool newitemonleft
Definition: nbtsplitloc.c:37
OffsetNumber firstrightoff
Definition: nbtsplitloc.c:36

References _bt_afternewitemoff(), _bt_bestsplitloc(), _bt_defaultinterval(), _bt_deltasortsplits(), _bt_recsplitloc(), _bt_strategy(), Assert(), BTGetFillFactor, BTPageGetOpaque, BTREE_NONLEAF_FILLFACTOR, BTREE_SINGLEVAL_FILLFACTOR, BTreeTupleIsPosting(), elog, ERROR, SplitPoint::firstrightoff, i, IndexRelationGetNumberOfKeyAttributes, ItemIdGetLength, MAXALIGN, SplitPoint::newitemonleft, OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_RIGHTMOST, PageGetExactFreeSpace(), PageGetItemId, PageGetMaxOffsetNumber, PageGetPageSize, palloc(), pfree(), RelationGetRelationName, SizeOfPageHeaderData, SPLIT_DEFAULT, SPLIT_MANY_DUPLICATES, and SPLIT_SINGLE_VALUE.

Referenced by _bt_split().
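
The strategy selection above mostly reduces to choosing a fillfactor multiplier. The following condensed sketch is not PostgreSQL source (it ignores the split-after-new-item path and the two fallback strategies), but it summarizes the mapping implemented by the code above:

static double
fillfactormult_sketch(bool is_leaf, bool is_rightmost, int leaffillfactor)
{
    if (!is_leaf)
        return BTREE_NONLEAF_FILLFACTOR / 100.0;    /* 0.70; only applied (usemult) on rightmost internal pages */
    if (is_rightmost)
        return leaffillfactor / 100.0;              /* 0.90 with the default leaf fillfactor */
    return 0.50;                                    /* plain 50:50 leaf page split */
}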

◆ _bt_finish_split()

void _bt_finish_split ( Relation  rel,
Buffer  lbuf,
BTStack  stack 
)

Definition at line 2230 of file nbtinsert.c.

2231 {
2232  Page lpage = BufferGetPage(lbuf);
2233  BTPageOpaque lpageop = BTPageGetOpaque(lpage);
2234  Buffer rbuf;
2235  Page rpage;
2236  BTPageOpaque rpageop;
2237  bool wasroot;
2238  bool wasonly;
2239 
2240  Assert(P_INCOMPLETE_SPLIT(lpageop));
2241 
2242  /* Lock right sibling, the one missing the downlink */
2243  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2244  rpage = BufferGetPage(rbuf);
2245  rpageop = BTPageGetOpaque(rpage);
2246 
2247  /* Could this be a root split? */
2248  if (!stack)
2249  {
2250  Buffer metabuf;
2251  Page metapg;
2252  BTMetaPageData *metad;
2253 
2254  /* acquire lock on the metapage */
2255  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2256  metapg = BufferGetPage(metabuf);
2257  metad = BTPageGetMeta(metapg);
2258 
2259  wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
2260 
2261  _bt_relbuf(rel, metabuf);
2262  }
2263  else
2264  wasroot = false;
2265 
2266  /* Was this the only page on the level before split? */
2267  wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2268 
2269  elog(DEBUG1, "finishing incomplete split of %u/%u",
2270  BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2271 
2272  _bt_insert_parent(rel, lbuf, rbuf, stack, wasroot, wasonly);
2273 }
int Buffer
Definition: buf.h:23
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool isroot, bool isonly)
Definition: nbtinsert.c:2094
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:871
#define BTPageGetMeta(p)
Definition: nbtree.h:121
#define P_LEFTMOST(opaque)
Definition: nbtree.h:219
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:228
#define BTREE_METAPAGE
Definition: nbtree.h:148
#define BT_WRITE
Definition: nbtree.h:715
BlockNumber btm_root
Definition: nbtree.h:107
BlockNumber btpo_next
Definition: nbtree.h:65

References _bt_getbuf(), _bt_insert_parent(), _bt_relbuf(), Assert(), BT_WRITE, BTMetaPageData::btm_root, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTREE_METAPAGE, BufferGetBlockNumber(), BufferGetPage, DEBUG1, elog, P_INCOMPLETE_SPLIT, P_LEFTMOST, and P_RIGHTMOST.

Referenced by _bt_getstackbuf(), _bt_moveright(), and _bt_stepright().
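
A minimal caller-side sketch: any writer that arrives at a page whose incomplete-split flag is set must finish the split before doing anything else. This mirrors the fragment visible in _bt_getstackbuf() below; buf and stack are assumed to be the caller's write-locked buffer and search stack.

BTPageOpaque opaque = BTPageGetOpaque(BufferGetPage(buf));

if (P_INCOMPLETE_SPLIT(opaque))
{
    /* buffers are released by the time this returns (see _bt_insert_parent()) */
    _bt_finish_split(rel, buf, stack->bts_parent);
    /* ... reacquire the page and retry ... */
}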

◆ _bt_first()

bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 860 of file nbtsearch.c.

861 {
862  Relation rel = scan->indexRelation;
863  BTScanOpaque so = (BTScanOpaque) scan->opaque;
864  Buffer buf;
865  BTStack stack;
866  OffsetNumber offnum;
867  StrategyNumber strat;
868  bool nextkey;
869  bool goback;
870  BTScanInsertData inskey;
871  ScanKey startKeys[INDEX_MAX_KEYS];
872  ScanKeyData notnullkeys[INDEX_MAX_KEYS];
873  int keysCount = 0;
874  int i;
875  bool status;
876  StrategyNumber strat_total;
877  BTScanPosItem *currItem;
878  BlockNumber blkno;
879 
880  Assert(!BTScanPosIsValid(so->currPos));
881 
882  pgstat_count_index_scan(rel);
883 
884  /*
885  * Examine the scan keys and eliminate any redundant keys; also mark the
886  * keys that must be matched to continue the scan.
887  */
888  _bt_preprocess_keys(scan);
889 
890  /*
891  * Quit now if _bt_preprocess_keys() discovered that the scan keys can
892  * never be satisfied (eg, x == 1 AND x > 2).
893  */
894  if (!so->qual_ok)
895  {
896  /* Notify any other workers that we're done with this scan key. */
897  _bt_parallel_done(scan);
898  return false;
899  }
900 
901  /*
902  * For parallel scans, get the starting page from shared state. If the
903  * scan has not started, proceed to find out first leaf page in the usual
904  * way while keeping other participating processes waiting. If the scan
905  * has already begun, use the page number from the shared structure.
906  */
907  if (scan->parallel_scan != NULL)
908  {
909  status = _bt_parallel_seize(scan, &blkno);
910  if (!status)
911  return false;
912  else if (blkno == P_NONE)
913  {
914  _bt_parallel_done(scan);
915  return false;
916  }
917  else if (blkno != InvalidBlockNumber)
918  {
919  if (!_bt_parallel_readpage(scan, blkno, dir))
920  return false;
921  goto readcomplete;
922  }
923  }
924 
925  /*----------
926  * Examine the scan keys to discover where we need to start the scan.
927  *
928  * We want to identify the keys that can be used as starting boundaries;
929  * these are =, >, or >= keys for a forward scan or =, <, <= keys for
930  * a backwards scan. We can use keys for multiple attributes so long as
931  * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
932  * a > or < boundary or find an attribute with no boundary (which can be
933  * thought of as the same as "> -infinity"), we can't use keys for any
934  * attributes to its right, because it would break our simplistic notion
935  * of what initial positioning strategy to use.
936  *
937  * When the scan keys include cross-type operators, _bt_preprocess_keys
938  * may not be able to eliminate redundant keys; in such cases we will
939  * arbitrarily pick a usable one for each attribute. This is correct
940  * but possibly not optimal behavior. (For example, with keys like
941  * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
942  * x=5 would be more efficient.) Since the situation only arises given
943  * a poorly-worded query plus an incomplete opfamily, live with it.
944  *
945  * When both equality and inequality keys appear for a single attribute
946  * (again, only possible when cross-type operators appear), we *must*
947  * select one of the equality keys for the starting point, because
948  * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
949  * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
950  * start at x=4, we will fail and stop before reaching x=10. If multiple
951  * equality quals survive preprocessing, however, it doesn't matter which
952  * one we use --- by definition, they are either redundant or
953  * contradictory.
954  *
955  * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
956  * If the index stores nulls at the end of the index we'll be starting
957  * from, and we have no boundary key for the column (which means the key
958  * we deduced NOT NULL from is an inequality key that constrains the other
959  * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
960  * use as a boundary key. If we didn't do this, we might find ourselves
961  * traversing a lot of null entries at the start of the scan.
962  *
963  * In this loop, row-comparison keys are treated the same as keys on their
964  * first (leftmost) columns. We'll add on lower-order columns of the row
965  * comparison below, if possible.
966  *
967  * The selected scan keys (at most one per index column) are remembered by
968  * storing their addresses into the local startKeys[] array.
969  *----------
970  */
971  strat_total = BTEqualStrategyNumber;
972  if (so->numberOfKeys > 0)
973  {
974  AttrNumber curattr;
975  ScanKey chosen;
976  ScanKey impliesNN;
977  ScanKey cur;
978 
979  /*
980  * chosen is the so-far-chosen key for the current attribute, if any.
981  * We don't cast the decision in stone until we reach keys for the
982  * next attribute.
983  */
984  curattr = 1;
985  chosen = NULL;
986  /* Also remember any scankey that implies a NOT NULL constraint */
987  impliesNN = NULL;
988 
989  /*
990  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
991  * pass to handle after-last-key processing. Actual exit from the
992  * loop is at one of the "break" statements below.
993  */
994  for (cur = so->keyData, i = 0;; cur++, i++)
995  {
996  if (i >= so->numberOfKeys || cur->sk_attno != curattr)
997  {
998  /*
999  * Done looking at keys for curattr. If we didn't find a
1000  * usable boundary key, see if we can deduce a NOT NULL key.
1001  */
1002  if (chosen == NULL && impliesNN != NULL &&
1003  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1004  ScanDirectionIsForward(dir) :
1005  ScanDirectionIsBackward(dir)))
1006  {
1007  /* Yes, so build the key in notnullkeys[keysCount] */
1008  chosen = &notnullkeys[keysCount];
1009  ScanKeyEntryInitialize(chosen,
1010  (SK_SEARCHNOTNULL | SK_ISNULL |
1011  (impliesNN->sk_flags &
1012  (SK_BT_DESC | SK_BT_NULLS_FIRST))),
1013  curattr,
1014  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1015  BTGreaterStrategyNumber :
1016  BTLessStrategyNumber),
1017  InvalidOid,
1018  InvalidOid,
1019  InvalidOid,
1020  (Datum) 0);
1021  }
1022 
1023  /*
1024  * If we still didn't find a usable boundary key, quit; else
1025  * save the boundary key pointer in startKeys.
1026  */
1027  if (chosen == NULL)
1028  break;
1029  startKeys[keysCount++] = chosen;
1030 
1031  /*
1032  * Adjust strat_total, and quit if we have stored a > or <
1033  * key.
1034  */
1035  strat = chosen->sk_strategy;
1036  if (strat != BTEqualStrategyNumber)
1037  {
1038  strat_total = strat;
1039  if (strat == BTGreaterStrategyNumber ||
1040  strat == BTLessStrategyNumber)
1041  break;
1042  }
1043 
1044  /*
1045  * Done if that was the last attribute, or if next key is not
1046  * in sequence (implying no boundary key is available for the
1047  * next attribute).
1048  */
1049  if (i >= so->numberOfKeys ||
1050  cur->sk_attno != curattr + 1)
1051  break;
1052 
1053  /*
1054  * Reset for next attr.
1055  */
1056  curattr = cur->sk_attno;
1057  chosen = NULL;
1058  impliesNN = NULL;
1059  }
1060 
1061  /*
1062  * Can we use this key as a starting boundary for this attr?
1063  *
1064  * If not, does it imply a NOT NULL constraint? (Because
1065  * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1066  * *any* inequality key works for that; we need not test.)
1067  */
1068  switch (cur->sk_strategy)
1069  {
1070  case BTLessStrategyNumber:
1071  case BTLessEqualStrategyNumber:
1072  if (chosen == NULL)
1073  {
1074  if (ScanDirectionIsBackward(dir))
1075  chosen = cur;
1076  else
1077  impliesNN = cur;
1078  }
1079  break;
1080  case BTEqualStrategyNumber:
1081  /* override any non-equality choice */
1082  chosen = cur;
1083  break;
1084  case BTGreaterEqualStrategyNumber:
1085  case BTGreaterStrategyNumber:
1086  if (chosen == NULL)
1087  {
1088  if (ScanDirectionIsForward(dir))
1089  chosen = cur;
1090  else
1091  impliesNN = cur;
1092  }
1093  break;
1094  }
1095  }
1096  }
1097 
1098  /*
1099  * If we found no usable boundary keys, we have to start from one end of
1100  * the tree. Walk down that edge to the first or last key, and scan from
1101  * there.
1102  */
1103  if (keysCount == 0)
1104  {
1105  bool match;
1106 
1107  match = _bt_endpoint(scan, dir);
1108 
1109  if (!match)
1110  {
1111  /* No match, so mark (parallel) scan finished */
1112  _bt_parallel_done(scan);
1113  }
1114 
1115  return match;
1116  }
1117 
1118  /*
1119  * We want to start the scan somewhere within the index. Set up an
1120  * insertion scankey we can use to search for the boundary point we
1121  * identified above. The insertion scankey is built using the keys
1122  * identified by startKeys[]. (Remaining insertion scankey fields are
1123  * initialized after initial-positioning strategy is finalized.)
1124  */
1125  Assert(keysCount <= INDEX_MAX_KEYS);
1126  for (i = 0; i < keysCount; i++)
1127  {
1128  ScanKey cur = startKeys[i];
1129 
1130  Assert(cur->sk_attno == i + 1);
1131 
1132  if (cur->sk_flags & SK_ROW_HEADER)
1133  {
1134  /*
1135  * Row comparison header: look to the first row member instead.
1136  *
1137  * The member scankeys are already in insertion format (ie, they
1138  * have sk_func = 3-way-comparison function), but we have to watch
1139  * out for nulls, which _bt_preprocess_keys didn't check. A null
1140  * in the first row member makes the condition unmatchable, just
1141  * like qual_ok = false.
1142  */
1143  ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1144 
1145  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1146  if (subkey->sk_flags & SK_ISNULL)
1147  {
1148  _bt_parallel_done(scan);
1149  return false;
1150  }
1151  memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1152 
1153  /*
1154  * If the row comparison is the last positioning key we accepted,
1155  * try to add additional keys from the lower-order row members.
1156  * (If we accepted independent conditions on additional index
1157  * columns, we use those instead --- doesn't seem worth trying to
1158  * determine which is more restrictive.) Note that this is OK
1159  * even if the row comparison is of ">" or "<" type, because the
1160  * condition applied to all but the last row member is effectively
1161  * ">=" or "<=", and so the extra keys don't break the positioning
1162  * scheme. But, by the same token, if we aren't able to use all
1163  * the row members, then the part of the row comparison that we
1164  * did use has to be treated as just a ">=" or "<=" condition, and
1165  * so we'd better adjust strat_total accordingly.
1166  */
1167  if (i == keysCount - 1)
1168  {
1169  bool used_all_subkeys = false;
1170 
1171  Assert(!(subkey->sk_flags & SK_ROW_END));
1172  for (;;)
1173  {
1174  subkey++;
1175  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1176  if (subkey->sk_attno != keysCount + 1)
1177  break; /* out-of-sequence, can't use it */
1178  if (subkey->sk_strategy != cur->sk_strategy)
1179  break; /* wrong direction, can't use it */
1180  if (subkey->sk_flags & SK_ISNULL)
1181  break; /* can't use null keys */
1182  Assert(keysCount < INDEX_MAX_KEYS);
1183  memcpy(inskey.scankeys + keysCount, subkey,
1184  sizeof(ScanKeyData));
1185  keysCount++;
1186  if (subkey->sk_flags & SK_ROW_END)
1187  {
1188  used_all_subkeys = true;
1189  break;
1190  }
1191  }
1192  if (!used_all_subkeys)
1193  {
1194  switch (strat_total)
1195  {
1196  case BTLessStrategyNumber:
1197  strat_total = BTLessEqualStrategyNumber;
1198  break;
1199  case BTGreaterStrategyNumber:
1200  strat_total = BTGreaterEqualStrategyNumber;
1201  break;
1202  }
1203  }
1204  break; /* done with outer loop */
1205  }
1206  }
1207  else
1208  {
1209  /*
1210  * Ordinary comparison key. Transform the search-style scan key
1211  * to an insertion scan key by replacing the sk_func with the
1212  * appropriate btree comparison function.
1213  *
1214  * If scankey operator is not a cross-type comparison, we can use
1215  * the cached comparison function; otherwise gotta look it up in
1216  * the catalogs. (That can't lead to infinite recursion, since no
1217  * indexscan initiated by syscache lookup will use cross-data-type
1218  * operators.)
1219  *
1220  * We support the convention that sk_subtype == InvalidOid means
1221  * the opclass input type; this is a hack to simplify life for
1222  * ScanKeyInit().
1223  */
1224  if (cur->sk_subtype == rel->rd_opcintype[i] ||
1225  cur->sk_subtype == InvalidOid)
1226  {
1227  FmgrInfo *procinfo;
1228 
1229  procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1230  ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1231  cur->sk_flags,
1232  cur->sk_attno,
1233  InvalidStrategy,
1234  cur->sk_subtype,
1235  cur->sk_collation,
1236  procinfo,
1237  cur->sk_argument);
1238  }
1239  else
1240  {
1241  RegProcedure cmp_proc;
1242 
1243  cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1244  rel->rd_opcintype[i],
1245  cur->sk_subtype,
1246  BTORDER_PROC);
1247  if (!RegProcedureIsValid(cmp_proc))
1248  elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1249  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1250  cur->sk_attno, RelationGetRelationName(rel));
1251  ScanKeyEntryInitialize(inskey.scankeys + i,
1252  cur->sk_flags,
1253  cur->sk_attno,
1254  InvalidStrategy,
1255  cur->sk_subtype,
1256  cur->sk_collation,
1257  cmp_proc,
1258  cur->sk_argument);
1259  }
1260  }
1261  }
1262 
1263  /*----------
1264  * Examine the selected initial-positioning strategy to determine exactly
1265  * where we need to start the scan, and set flag variables to control the
1266  * code below.
1267  *
1268  * If nextkey = false, _bt_search and _bt_binsrch will locate the first
1269  * item >= scan key. If nextkey = true, they will locate the first
1270  * item > scan key.
1271  *
1272  * If goback = true, we will then step back one item, while if
1273  * goback = false, we will start the scan on the located item.
1274  *----------
1275  */
1276  switch (strat_total)
1277  {
1278  case BTLessStrategyNumber:
1279 
1280  /*
1281  * Find first item >= scankey, then back up one to arrive at last
1282  * item < scankey. (Note: this positioning strategy is only used
1283  * for a backward scan, so that is always the correct starting
1284  * position.)
1285  */
1286  nextkey = false;
1287  goback = true;
1288  break;
1289 
1290  case BTLessEqualStrategyNumber:
1291 
1292  /*
1293  * Find first item > scankey, then back up one to arrive at last
1294  * item <= scankey. (Note: this positioning strategy is only used
1295  * for a backward scan, so that is always the correct starting
1296  * position.)
1297  */
1298  nextkey = true;
1299  goback = true;
1300  break;
1301 
1302  case BTEqualStrategyNumber:
1303 
1304  /*
1305  * If a backward scan was specified, need to start with last equal
1306  * item not first one.
1307  */
1308  if (ScanDirectionIsBackward(dir))
1309  {
1310  /*
1311  * This is the same as the <= strategy. We will check at the
1312  * end whether the found item is actually =.
1313  */
1314  nextkey = true;
1315  goback = true;
1316  }
1317  else
1318  {
1319  /*
1320  * This is the same as the >= strategy. We will check at the
1321  * end whether the found item is actually =.
1322  */
1323  nextkey = false;
1324  goback = false;
1325  }
1326  break;
1327 
1328  case BTGreaterEqualStrategyNumber:
1329 
1330  /*
1331  * Find first item >= scankey. (This is only used for forward
1332  * scans.)
1333  */
1334  nextkey = false;
1335  goback = false;
1336  break;
1337 
1338  case BTGreaterStrategyNumber:
1339 
1340  /*
1341  * Find first item > scankey. (This is only used for forward
1342  * scans.)
1343  */
1344  nextkey = true;
1345  goback = false;
1346  break;
1347 
1348  default:
1349  /* can't get here, but keep compiler quiet */
1350  elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1351  return false;
1352  }
1353 
1354  /* Initialize remaining insertion scan key fields */
1355  _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1356  inskey.anynullkeys = false; /* unused */
1357  inskey.nextkey = nextkey;
1358  inskey.pivotsearch = false;
1359  inskey.scantid = NULL;
1360  inskey.keysz = keysCount;
1361 
1362  /*
1363  * Use the manufactured insertion scan key to descend the tree and
1364  * position ourselves on the target leaf page.
1365  */
1366  stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);
1367 
1368  /* don't need to keep the stack around... */
1369  _bt_freestack(stack);
1370 
1371  if (!BufferIsValid(buf))
1372  {
1373  /*
1374  * We only get here if the index is completely empty. Lock relation
1375  * because nothing finer to lock exists.
1376  */
1377  PredicateLockRelation(rel, scan->xs_snapshot);
1378 
1379  /*
1380  * mark parallel scan as done, so that all the workers can finish
1381  * their scan
1382  */
1383  _bt_parallel_done(scan);
1384  BTScanPosInvalidate(so->currPos);
1385 
1386  return false;
1387  }
1388  else
1389  PredicateLockPage(rel, BufferGetBlockNumber(buf),
1390  scan->xs_snapshot);
1391 
1392  _bt_initialize_more_data(so, dir);
1393 
1394  /* position to the precise item on the page */
1395  offnum = _bt_binsrch(rel, &inskey, buf);
1396 
1397  /*
1398  * If nextkey = false, we are positioned at the first item >= scan key, or
1399  * possibly at the end of a page on which all the existing items are less
1400  * than the scan key and we know that everything on later pages is greater
1401  * than or equal to scan key.
1402  *
1403  * If nextkey = true, we are positioned at the first item > scan key, or
1404  * possibly at the end of a page on which all the existing items are less
1405  * than or equal to the scan key and we know that everything on later
1406  * pages is greater than scan key.
1407  *
1408  * The actually desired starting point is either this item or the prior
1409  * one, or in the end-of-page case it's the first item on the next page or
1410  * the last item on this page. Adjust the starting offset if needed. (If
1411  * this results in an offset before the first item or after the last one,
1412  * _bt_readpage will report no items found, and then we'll step to the
1413  * next page as needed.)
1414  */
1415  if (goback)
1416  offnum = OffsetNumberPrev(offnum);
1417 
1418  /* remember which buffer we have pinned, if any */
1419  Assert(!BTScanPosIsValid(so->currPos));
1420  so->currPos.buf = buf;
1421 
1422  /*
1423  * Now load data from the first page of the scan.
1424  */
1425  if (!_bt_readpage(scan, dir, offnum))
1426  {
1427  /*
1428  * There's no actually-matching data on this page. Try to advance to
1429  * the next page. Return false if there's no matching data at all.
1430  */
1431  _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
1432  if (!_bt_steppage(scan, dir))
1433  return false;
1434  }
1435  else
1436  {
1437  /* Drop the lock, and maybe the pin, on the current page */
1438  _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1439  }
1440 
1441 readcomplete:
1442  /* OK, itemIndex says what to return */
1443  currItem = &so->currPos.items[so->currPos.itemIndex];
1444  scan->xs_heaptid = currItem->heapTid;
1445  if (scan->xs_want_itup)
1446  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1447 
1448  return true;
1449 }
int16 AttrNumber
Definition: attnum.h:21
uint32 BlockNumber
Definition: block.h:31
#define RegProcedureIsValid(p)
Definition: c.h:712
regproc RegProcedure
Definition: c.h:585
struct cursor * cur
Definition: ecpg.c:28
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:803
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition: nbtpage.c:736
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1082
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:717
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
Definition: nbtree.c:636
#define BTORDER_PROC
Definition: nbtree.h:702
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:1007
#define P_NONE
Definition: nbtree.h:213
#define BT_READ
Definition: nbtree.h:714
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:1013
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Definition: nbtsearch.c:1521
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:2158
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:60
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:338
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2391
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1873
static void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
Definition: nbtsearch.c:2481
BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:96
void _bt_preprocess_keys(IndexScanDesc scan)
Definition: nbtutils.c:749
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
static void status(const char *fmt,...) pg_attribute_printf(1, 2)
Definition: pg_regress.c:229
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:540
#define InvalidOid
Definition: postgres_ext.h:36
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2594
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2571
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
#define SK_ROW_MEMBER
Definition: skey.h:118
#define SK_ROW_END
Definition: skey.h:119
ScanKeyData * ScanKey
Definition: skey.h:75
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
#define InvalidStrategy
Definition: stratnum.h:24
#define BTLessStrategyNumber
Definition: stratnum.h:29
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
BTScanPosData currPos
Definition: nbtree.h:1070
char * currTuples
Definition: nbtree.h:1057
Buffer buf
Definition: nbtree.h:953
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:985
int itemIndex
Definition: nbtree.h:983
Definition: fmgr.h:57
IndexTuple xs_itup
Definition: relscan.h:142
ItemPointerData xs_heaptid
Definition: relscan.h:147
struct SnapshotData * xs_snapshot
Definition: relscan.h:119
StrategyNumber sk_strategy
Definition: skey.h:68

References _bt_binsrch(), _bt_drop_lock_and_maybe_pin(), _bt_endpoint(), _bt_freestack(), _bt_initialize_more_data(), _bt_metaversion(), _bt_parallel_done(), _bt_parallel_readpage(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readpage(), _bt_search(), _bt_steppage(), _bt_unlockbuf(), Assert(), BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosInvalidate, BTScanPosIsValid, buf, BTScanPosData::buf, BufferGetBlockNumber(), BufferIsValid, cur, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, DatumGetPointer, elog, ERROR, get_opfamily_proc(), i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, InvalidBlockNumber, InvalidOid, InvalidStrategy, BTScanPosData::itemIndex, BTScanPosData::items, BTScanOpaqueData::keyData, BTScanOpaqueData::numberOfKeys, OffsetNumberPrev, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockPage(), PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, status(), IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().
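
As a recap of the initial-positioning switch above, the surviving strategy determines the nextkey/goback flag pair, and thereby where the scan physically starts. Drawn directly from the cases above:

/*
 * strat_total          scan dir   nextkey  goback  starts at
 * ------------------   --------   -------  ------  ------------------------
 * <   (Less)           backward   false    true    last item  <  scankey
 * <=  (LessEqual)      backward   true     true    last item  <= scankey
 * =   (Equal)          backward   true     true    last equal item
 * =   (Equal)          forward    false    false   first equal item
 * >=  (GreaterEqual)   forward    false    false   first item >= scankey
 * >   (Greater)        forward    true     false   first item >  scankey
 */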

◆ _bt_form_posting()

IndexTuple _bt_form_posting ( IndexTuple  base,
ItemPointer  htids,
int  nhtids 
)

Definition at line 860 of file nbtdedup.c.

861 {
862  uint32 keysize,
863  newsize;
864  IndexTuple itup;
865 
866  if (BTreeTupleIsPosting(base))
867  keysize = BTreeTupleGetPostingOffset(base);
868  else
869  keysize = IndexTupleSize(base);
870 
871  Assert(!BTreeTupleIsPivot(base));
872  Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
873  Assert(keysize == MAXALIGN(keysize));
874 
875  /* Determine final size of new tuple */
876  if (nhtids > 1)
877  newsize = MAXALIGN(keysize +
878  nhtids * sizeof(ItemPointerData));
879  else
880  newsize = keysize;
881 
882  Assert(newsize <= INDEX_SIZE_MASK);
883  Assert(newsize == MAXALIGN(newsize));
884 
885  /* Allocate memory using palloc0() (matches index_form_tuple()) */
886  itup = palloc0(newsize);
887  memcpy(itup, base, keysize);
888  itup->t_info &= ~INDEX_SIZE_MASK;
889  itup->t_info |= newsize;
890  if (nhtids > 1)
891  {
892  /* Form posting list tuple */
893  BTreeTupleSetPosting(itup, nhtids, keysize);
894  memcpy(BTreeTupleGetPosting(itup), htids,
895  sizeof(ItemPointerData) * nhtids);
896  Assert(_bt_posting_valid(itup));
897  }
898  else
899  {
900  /* Form standard non-pivot tuple */
901  itup->t_info &= ~INDEX_ALT_TID_MASK;
902  ItemPointerCopy(htids, &itup->t_tid);
903  Assert(ItemPointerIsValid(&itup->t_tid));
904  }
905 
906  return itup;
907 }
#define PG_UINT16_MAX
Definition: c.h:522
#define ItemPointerCopy(fromPointer, toPointer)
Definition: itemptr.h:161
#define ItemPointerIsValid(pointer)
Definition: itemptr.h:82
void * palloc0(Size size)
Definition: mcxt.c:1099
static void BTreeTupleSetPosting(IndexTuple itup, uint16 nhtids, int postingoffset)
Definition: nbtree.h:499
#define INDEX_ALT_TID_MASK
Definition: nbtree.h:461
unsigned short t_info
Definition: itup.h:49

References Assert(), BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTreeTupleSetPosting(), INDEX_ALT_TID_MASK, INDEX_SIZE_MASK, IndexTupleSize, ItemPointerCopy, ItemPointerIsValid, MAXALIGN, palloc0(), PG_UINT16_MAX, IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_dedup_finish_pending(), _bt_sort_dedup_finish_pending(), and bt_posting_plain_tuple().
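
A hedged usage sketch: given an existing non-pivot leaf tuple base and an array of heap TIDs in ascending order, a posting list tuple is formed as below. The TID values here are hypothetical, not taken from the sources above.

ItemPointerData htids[3];

/* hypothetical TIDs; _bt_posting_valid() expects ascending TID order */
ItemPointerSet(&htids[0], 10, 1);
ItemPointerSet(&htids[1], 10, 7);
ItemPointerSet(&htids[2], 11, 2);

IndexTuple posting = _bt_form_posting(base, htids, 3);
/* result is palloc0()'d; with nhtids == 1 a plain non-pivot tuple is built instead */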

◆ _bt_freestack()

void _bt_freestack ( BTStack  stack)

Definition at line 182 of file nbtutils.c.

183 {
184  BTStack ostack;
185 
186  while (stack != NULL)
187  {
188  ostack = stack;
189  stack = stack->bts_parent;
190  pfree(ostack);
191  }
192 }
struct BTStackData * bts_parent
Definition: nbtree.h:731

References BTStackData::bts_parent, and pfree().

Referenced by _bt_doinsert(), _bt_first(), and bt_rootdescend().

◆ _bt_get_endpoint()

Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost,
Snapshot  snapshot 
)

Definition at line 2307 of file nbtsearch.c.

2309 {
2310  Buffer buf;
2311  Page page;
2312  BTPageOpaque opaque;
2313  OffsetNumber offnum;
2314  BlockNumber blkno;
2315  IndexTuple itup;
2316 
2317  /*
2318  * If we are looking for a leaf page, okay to descend from fast root;
2319  * otherwise better descend from true root. (There is no point in being
2320  * smarter about intermediate levels.)
2321  */
2322  if (level == 0)
2323  buf = _bt_getroot(rel, BT_READ);
2324  else
2325  buf = _bt_gettrueroot(rel);
2326 
2327  if (!BufferIsValid(buf))
2328  return InvalidBuffer;
2329 
2330  page = BufferGetPage(buf);
2331  TestForOldSnapshot(snapshot, rel, page);
2332  opaque = BTPageGetOpaque(page);
2333 
2334  for (;;)
2335  {
2336  /*
2337  * If we landed on a deleted page, step right to find a live page
2338  * (there must be one). Also, if we want the rightmost page, step
2339  * right if needed to get to it (this could happen if the page split
2340  * since we obtained a pointer to it).
2341  */
2342  while (P_IGNORE(opaque) ||
2343  (rightmost && !P_RIGHTMOST(opaque)))
2344  {
2345  blkno = opaque->btpo_next;
2346  if (blkno == P_NONE)
2347  elog(ERROR, "fell off the end of index \"%s\"",
2348  RelationGetRelationName(rel));
2349  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2350  page = BufferGetPage(buf);
2351  TestForOldSnapshot(snapshot, rel, page);
2352  opaque = BTPageGetOpaque(page);
2353  }
2354 
2355  /* Done? */
2356  if (opaque->btpo_level == level)
2357  break;
2358  if (opaque->btpo_level < level)
2359  ereport(ERROR,
2360  (errcode(ERRCODE_INDEX_CORRUPTED),
2361  errmsg_internal("btree level %u not found in index \"%s\"",
2362  level, RelationGetRelationName(rel))));
2363 
2364  /* Descend to leftmost or rightmost child page */
2365  if (rightmost)
2366  offnum = PageGetMaxOffsetNumber(page);
2367  else
2368  offnum = P_FIRSTDATAKEY(opaque);
2369 
2370  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2371  blkno = BTreeTupleGetDownLink(itup);
2372 
2373  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2374  page = BufferGetPage(buf);
2375  opaque = BTPageGetOpaque(page);
2376  }
2377 
2378  return buf;
2379 }
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:282
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1015
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:577
Buffer _bt_getroot(Relation rel, int access)
Definition: nbtpage.c:343
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:551
uint32 btpo_level
Definition: nbtree.h:66

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), buf, BufferGetPage, BufferIsValid, elog, ereport, errcode(), errmsg_internal(), ERROR, InvalidBuffer, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, RelationGetRelationName, and TestForOldSnapshot().

Referenced by _bt_endpoint(), and _bt_insert_parent().
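
A minimal sketch of typical use (cf. _bt_endpoint()): level 0 with rightmost = false descends to the leftmost leaf page, and an invalid buffer means the index is completely empty.

buf = _bt_get_endpoint(rel, 0, false, scan->xs_snapshot);
if (!BufferIsValid(buf))
{
    /* index is completely empty */
    return false;
}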

◆ _bt_getbuf()

Buffer _bt_getbuf ( Relation  rel,
BlockNumber  blkno,
int  access 
)

Definition at line 871 of file nbtpage.c.

872 {
873  Buffer buf;
874 
875  if (blkno != P_NEW)
876  {
877  /* Read an existing block of the relation */
878  buf = ReadBuffer(rel, blkno);
879  _bt_lockbuf(rel, buf, access);
880  _bt_checkpage(rel, buf);
881  }
882  else
883  {
884  bool needLock;
885  Page page;
886 
887  Assert(access == BT_WRITE);
888 
889  /*
890  * First see if the FSM knows of any free pages.
891  *
892  * We can't trust the FSM's report unreservedly; we have to check that
893  * the page is still free. (For example, an already-free page could
894  * have been re-used between the time the last VACUUM scanned it and
895  * the time the VACUUM made its FSM updates.)
896  *
897  * In fact, it's worse than that: we can't even assume that it's safe
898  * to take a lock on the reported page. If somebody else has a lock
899  * on it, or even worse our own caller does, we could deadlock. (The
900  * own-caller scenario is actually not improbable. Consider an index
901  * on a serial or timestamp column. Nearly all splits will be at the
902  * rightmost page, so it's entirely likely that _bt_split will call us
903  * while holding a lock on the page most recently acquired from FSM. A
904  * VACUUM running concurrently with the previous split could well have
905  * placed that page back in FSM.)
906  *
907  * To get around that, we ask for only a conditional lock on the
908  * reported page. If we fail, then someone else is using the page,
909  * and we may reasonably assume it's not free. (If we happen to be
910  * wrong, the worst consequence is the page will be lost to use till
911  * the next VACUUM, which is no big problem.)
912  */
913  for (;;)
914  {
915  blkno = GetFreeIndexPage(rel);
916  if (blkno == InvalidBlockNumber)
917  break;
918  buf = ReadBuffer(rel, blkno);
919  if (_bt_conditionallockbuf(rel, buf))
920  {
921  page = BufferGetPage(buf);
922 
923  /*
924  * It's possible to find an all-zeroes page in an index. For
925  * example, a backend might successfully extend the relation
926  * one page and then crash before it is able to make a WAL
927  * entry for adding the page. If we find a zeroed page then
928  * reclaim it immediately.
929  */
930  if (PageIsNew(page))
931  {
932  /* Okay to use page. Initialize and return it. */
933  _bt_pageinit(page, BufferGetPageSize(buf));
934  return buf;
935  }
936 
937  if (BTPageIsRecyclable(page))
938  {
939  /*
940  * If we are generating WAL for Hot Standby then create a
941  * WAL record that will allow us to conflict with queries
942  * running on standby, in case they have snapshots older
943  * than safexid value
944  */
945  if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
946  _bt_log_reuse_page(rel, blkno,
947  BTPageGetDeleteXid(page));
948 
949  /* Okay to use page. Re-initialize and return it. */
950  _bt_pageinit(page, BufferGetPageSize(buf));
951  return buf;
952  }
953  elog(DEBUG2, "FSM returned nonrecyclable page");
954  _bt_relbuf(rel, buf);
955  }
956  else
957  {
958  elog(DEBUG2, "FSM returned nonlockable page");
959  /* couldn't get lock, so just drop pin */
960  ReleaseBuffer(buf);
961  }
962  }
963 
964  /*
965  * Extend the relation by one page.
966  *
967  * We have to use a lock to ensure no one else is extending the rel at
968  * the same time, else we will both try to initialize the same new
969  * page. We can skip locking for new or temp relations, however,
970  * since no one else could be accessing them.
971  */
972  needLock = !RELATION_IS_LOCAL(rel);
973 
974  if (needLock)
975  LockRelationForExtension(rel, ExclusiveLock);
976 
977  buf = ReadBuffer(rel, P_NEW);
978 
979  /* Acquire buffer lock on new page */
980  _bt_lockbuf(rel, buf, BT_WRITE);
981 
982  /*
983  * Release the file-extension lock; it's now OK for someone else to
984  * extend the relation some more. Note that we cannot release this
985  * lock before we have buffer lock on the new page, or we risk a race
986  * condition against btvacuumscan --- see comments therein.
987  */
988  if (needLock)
989  UnlockRelationForExtension(rel, ExclusiveLock);
990 
991  /* Initialize the new page before returning it */
992  page = BufferGetPage(buf);
993  Assert(PageIsNew(page));
994  _bt_pageinit(page, BufferGetPageSize(buf));
995  }
996 
997  /* ref count and lock type are correct */
998  return buf;
999 }
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3915
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:702
#define BufferGetPageSize(buffer)
Definition: bufmgr.h:156
#define P_NEW
Definition: bufmgr.h:91
#define DEBUG2
Definition: elog.h:23
BlockNumber GetFreeIndexPage(Relation rel)
Definition: indexfsm.c:38
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:431
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:481
#define ExclusiveLock
Definition: lockdefs.h:42
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:1141
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:794
bool _bt_conditionallockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1105
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1051
static void _bt_log_reuse_page(Relation rel, BlockNumber blkno, FullTransactionId safexid)
Definition: nbtpage.c:828
static FullTransactionId BTPageGetDeleteXid(Page page)
Definition: nbtree.h:261
static bool BTPageIsRecyclable(Page page)
Definition: nbtree.h:292
#define RELATION_IS_LOCAL(relation)
Definition: rel.h:633

References _bt_checkpage(), _bt_conditionallockbuf(), _bt_lockbuf(), _bt_log_reuse_page(), _bt_pageinit(), _bt_relbuf(), Assert(), BT_WRITE, BTPageGetDeleteXid(), BTPageIsRecyclable(), buf, BufferGetPage, BufferGetPageSize, DEBUG2, elog, ExclusiveLock, GetFreeIndexPage(), InvalidBlockNumber, LockRelationForExtension(), P_NEW, PageIsNew, ReadBuffer(), RELATION_IS_LOCAL, RelationNeedsWAL, ReleaseBuffer(), UnlockRelationForExtension(), and XLogStandbyInfoActive.

Referenced by _bt_finish_split(), _bt_getroot(), _bt_getrootheight(), _bt_getstackbuf(), _bt_gettrueroot(), _bt_insertonpg(), _bt_killitems(), _bt_leftsib_splitflag(), _bt_metaversion(), _bt_moveright(), _bt_newroot(), _bt_pagedel(), _bt_readnextpage(), _bt_rightsib_halfdeadflag(), _bt_set_cleanup_info(), _bt_split(), _bt_unlink_halfdead_page(), _bt_vacuum_needs_cleanup(), and _bt_walk_left().
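
The usual pairing, sketched: _bt_getbuf() returns a pinned and locked buffer, and _bt_relbuf() drops both. P_NEW requests (BT_WRITE only) go through the FSM-recycling logic shown above.

Buffer       buf = _bt_getbuf(rel, blkno, BT_READ);
Page         page = BufferGetPage(buf);
BTPageOpaque opaque = BTPageGetOpaque(page);

/* ... inspect the page while pin and lock are held ... */

_bt_relbuf(rel, buf);       /* releases lock and pin together */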

◆ _bt_getroot()

Buffer _bt_getroot ( Relation  rel,
int  access 
)

Definition at line 343 of file nbtpage.c.

344 {
345  Buffer metabuf;
346  Buffer rootbuf;
347  Page rootpage;
348  BTPageOpaque rootopaque;
349  BlockNumber rootblkno;
350  uint32 rootlevel;
351  BTMetaPageData *metad;
352 
353  /*
354  * Try to use previously-cached metapage data to find the root. This
355  * normally saves one buffer access per index search, which is a very
356  * helpful savings in bufmgr traffic and hence contention.
357  */
358  if (rel->rd_amcache != NULL)
359  {
360  metad = (BTMetaPageData *) rel->rd_amcache;
361  /* We shouldn't have cached it if any of these fail */
362  Assert(metad->btm_magic == BTREE_MAGIC);
363  Assert(metad->btm_version >= BTREE_MIN_VERSION);
364  Assert(metad->btm_version <= BTREE_VERSION);
365  Assert(!metad->btm_allequalimage ||
366  metad->btm_version > BTREE_NOVAC_VERSION);
367  Assert(metad->btm_root != P_NONE);
368 
369  rootblkno = metad->btm_fastroot;
370  Assert(rootblkno != P_NONE);
371  rootlevel = metad->btm_fastlevel;
372 
373  rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
374  rootpage = BufferGetPage(rootbuf);
375  rootopaque = BTPageGetOpaque(rootpage);
376 
377  /*
378  * Since the cache might be stale, we check the page more carefully
379  * here than normal. We *must* check that it's not deleted. If it's
380  * not alone on its level, then we reject too --- this may be overly
381  * paranoid but better safe than sorry. Note we don't check P_ISROOT,
382  * because that's not set in a "fast root".
383  */
384  if (!P_IGNORE(rootopaque) &&
385  rootopaque->btpo_level == rootlevel &&
386  P_LEFTMOST(rootopaque) &&
387  P_RIGHTMOST(rootopaque))
388  {
389  /* OK, accept cached page as the root */
390  return rootbuf;
391  }
392  _bt_relbuf(rel, rootbuf);
393  /* Cache is stale, throw it away */
394  if (rel->rd_amcache)
395  pfree(rel->rd_amcache);
396  rel->rd_amcache = NULL;
397  }
398 
399  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
400  metad = _bt_getmeta(rel, metabuf);
401 
402  /* if no root page initialized yet, do it */
403  if (metad->btm_root == P_NONE)
404  {
405  Page metapg;
406 
407  /* If access = BT_READ, caller doesn't want us to create root yet */
408  if (access == BT_READ)
409  {
410  _bt_relbuf(rel, metabuf);
411  return InvalidBuffer;
412  }
413 
414  /* trade in our read lock for a write lock */
415  _bt_unlockbuf(rel, metabuf);
416  _bt_lockbuf(rel, metabuf, BT_WRITE);
417 
418  /*
419  * Race condition: if someone else initialized the metadata between
420  * the time we released the read lock and acquired the write lock, we
421  * must avoid doing it again.
422  */
423  if (metad->btm_root != P_NONE)
424  {
425  /*
426  * Metadata initialized by someone else. In order to guarantee no
427  * deadlocks, we have to release the metadata page and start all
428  * over again. (Is that really true? But it's hardly worth trying
429  * to optimize this case.)
430  */
431  _bt_relbuf(rel, metabuf);
432  return _bt_getroot(rel, access);
433  }
434 
435  /*
436  * Get, initialize, write, and leave a lock of the appropriate type on
437  * the new root page. Since this is the first page in the tree, it's
438  * a leaf as well as the root.
439  */
440  rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
441  rootblkno = BufferGetBlockNumber(rootbuf);
442  rootpage = BufferGetPage(rootbuf);
443  rootopaque = BTPageGetOpaque(rootpage);
444  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
445  rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
446  rootopaque->btpo_level = 0;
447  rootopaque->btpo_cycleid = 0;
448  /* Get raw page pointer for metapage */
449  metapg = BufferGetPage(metabuf);
450 
451  /* NO ELOG(ERROR) till meta is updated */
452  START_CRIT_SECTION();
453 
454  /* upgrade metapage if needed */
455  if (metad->btm_version < BTREE_NOVAC_VERSION)
456  _bt_upgrademetapage(metapg);
457 
458  metad->btm_root = rootblkno;
459  metad->btm_level = 0;
460  metad->btm_fastroot = rootblkno;
461  metad->btm_fastlevel = 0;
462  metad->btm_last_cleanup_num_delpages = 0;
463  metad->btm_last_cleanup_num_heap_tuples = -1.0;
464 
465  MarkBufferDirty(rootbuf);
466  MarkBufferDirty(metabuf);
467 
468  /* XLOG stuff */
469  if (RelationNeedsWAL(rel))
470  {
471  xl_btree_newroot xlrec;
472  XLogRecPtr recptr;
473  xl_btree_metadata md;
474 
475  XLogBeginInsert();
476  XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
477  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
478 
480  md.version = metad->btm_version;
481  md.root = rootblkno;
482  md.level = 0;
483  md.fastroot = rootblkno;
484  md.fastlevel = 0;
485  md.last_cleanup_num_delpages = 0;
486  md.allequalimage = metad->btm_allequalimage;
487 
488  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
489 
490  xlrec.rootblk = rootblkno;
491  xlrec.level = 0;
492 
493  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
494 
495  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
496 
497  PageSetLSN(rootpage, recptr);
498  PageSetLSN(metapg, recptr);
499  }
500 
501  END_CRIT_SECTION();
502 
503  /*
504  * swap root write lock for read lock. There is no danger of anyone
505  * else accessing the new root page while it's unlocked, since no one
506  * else knows where it is yet.
507  */
508  _bt_unlockbuf(rel, rootbuf);
509  _bt_lockbuf(rel, rootbuf, BT_READ);
510 
511  /* okay, metadata is correct, release lock on it without caching */
512  _bt_relbuf(rel, metabuf);
513  }
514  else
515  {
516  rootblkno = metad->btm_fastroot;
517  Assert(rootblkno != P_NONE);
518  rootlevel = metad->btm_fastlevel;
519 
520  /*
521  * Cache the metapage data for next time
522  */
523  rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
524  sizeof(BTMetaPageData));
525  memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
526 
527  /*
528  * We are done with the metapage; arrange to release it via first
529  * _bt_relandgetbuf call
530  */
531  rootbuf = metabuf;
532 
533  for (;;)
534  {
535  rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
536  rootpage = BufferGetPage(rootbuf);
537  rootopaque = BTPageGetOpaque(rootpage);
538 
539  if (!P_IGNORE(rootopaque))
540  break;
541 
542  /* it's dead, Jim. step right one page */
543  if (P_RIGHTMOST(rootopaque))
544  elog(ERROR, "no live root page found in index \"%s\"",
545  RelationGetRelationName(rel));
546  rootblkno = rootopaque->btpo_next;
547  }
548 
549  if (rootopaque->btpo_level != rootlevel)
550  elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
551  rootblkno, RelationGetRelationName(rel),
552  rootopaque->btpo_level, rootlevel);
553  }
554 
555  /*
556  * By here, we have a pin and read lock on the root page, and no lock set
557  * on the metadata page. Return the root page's buffer.
558  */
559  return rootbuf;
560 }
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:863
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:109
static BTMetaPageData * _bt_getmeta(Relation rel, Buffer metabuf)
Definition: nbtpage.c:144
#define BTREE_MIN_VERSION
Definition: nbtree.h:151
#define BTP_LEAF
Definition: nbtree.h:76
#define BTREE_MAGIC
Definition: nbtree.h:149
#define BTP_ROOT
Definition: nbtree.h:77
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:335
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:37
uint32 btm_last_cleanup_num_delpages
Definition: nbtree.h:114
uint32 btm_level
Definition: nbtree.h:108
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:116
BlockNumber btm_fastroot
Definition: nbtree.h:109
uint32 btm_version
Definition: nbtree.h:106
uint32 btm_magic
Definition: nbtree.h:105
bool btm_allequalimage
Definition: nbtree.h:118
uint32 btm_fastlevel
Definition: nbtree.h:110
BlockNumber btpo_prev
Definition: nbtree.h:64
void * rd_amcache
Definition: rel.h:225
MemoryContext rd_indexcxt
Definition: rel.h:200
uint32 level
Definition: nbtxlog.h:50
uint32 version
Definition: nbtxlog.h:48
bool allequalimage
Definition: nbtxlog.h:54
BlockNumber fastroot
Definition: nbtxlog.h:51
uint32 fastlevel
Definition: nbtxlog.h:52
BlockNumber root
Definition: nbtxlog.h:49
uint32 last_cleanup_num_delpages
Definition: nbtxlog.h:53
uint32 level
Definition: nbtxlog.h:332
BlockNumber rootblk
Definition: nbtxlog.h:331
#define REGBUF_WILL_INIT
Definition: xloginsert.h:33

References _bt_getbuf(), _bt_getmeta(), _bt_lockbuf(), _bt_relandgetbuf(), _bt_relbuf(), _bt_unlockbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert(), BT_READ, BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_LEAF, BTP_ROOT, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, BufferGetBlockNumber(), BufferGetPage, elog, END_CRIT_SECTION, ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, InvalidBuffer, xl_btree_metadata::last_cleanup_num_delpages, xl_btree_metadata::level, xl_btree_newroot::level, MarkBufferDirty(), MemoryContextAlloc(), P_IGNORE, P_LEFTMOST, P_NEW, P_NONE, P_RIGHTMOST, PageSetLSN, pfree(), RelationData::rd_amcache, RelationData::rd_indexcxt, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, xl_btree_metadata::root, xl_btree_newroot::rootblk, SizeOfBtreeNewroot, START_CRIT_SECTION, xl_btree_metadata::version, XLOG_BTREE_NEWROOT, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_get_endpoint(), and _bt_search().
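
The BT_READ contract, sketched (this is essentially how _bt_search() begins): when no root page exists yet, a read caller gets InvalidBuffer back instead of having a root created on its behalf.

*bufP = _bt_getroot(rel, access);

/* If index is empty and access = BT_READ, no root page is created. */
if (!BufferIsValid(*bufP))
    return (BTStack) NULL;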

◆ _bt_getrootheight()

int _bt_getrootheight ( Relation  rel)

Definition at line 672 of file nbtpage.c.

673 {
674  BTMetaPageData *metad;
675 
676  if (rel->rd_amcache == NULL)
677  {
678  Buffer metabuf;
679 
680  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
681  metad = _bt_getmeta(rel, metabuf);
682 
683  /*
684  * If there's no root page yet, _bt_getroot() doesn't expect a cache
685  * to be made, so just stop here and report the index height is zero.
686  * (XXX perhaps _bt_getroot() should be changed to allow this case.)
687  */
688  if (metad->btm_root == P_NONE)
689  {
690  _bt_relbuf(rel, metabuf);
691  return 0;
692  }
693 
694  /*
695  * Cache the metapage data for next time
696  */
697  rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
698  sizeof(BTMetaPageData));
699  memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
700  _bt_relbuf(rel, metabuf);
701  }
702 
703  /* Get cached page */
704  metad = (BTMetaPageData *) rel->rd_amcache;
705  /* We shouldn't have cached it if any of these fail */
706  Assert(metad->btm_magic == BTREE_MAGIC);
707  Assert(metad->btm_version >= BTREE_MIN_VERSION);
708  Assert(metad->btm_version <= BTREE_VERSION);
709  Assert(!metad->btm_allequalimage ||
710  metad->btm_version > BTREE_NOVAC_VERSION);
711  Assert(metad->btm_fastroot != P_NONE);
712 
713  return metad->btm_fastlevel;
714 }

References _bt_getbuf(), _bt_getmeta(), _bt_relbuf(), Assert(), BT_READ, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, MemoryContextAlloc(), P_NONE, RelationData::rd_amcache, and RelationData::rd_indexcxt.

Referenced by _bt_insertonpg(), and get_relation_info().
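
A short usage note, sketched from the Referenced-by list: the planner consumes this value as the B-tree height for cost estimation (get_relation_info() in plancat.c); 0 means the fast root is a leaf page or the index is empty.

/* planner-side call site; "info" stands for the IndexOptInfo being filled in */
info->tree_height = _bt_getrootheight(indexRelation);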

◆ _bt_getstackbuf()

Buffer _bt_getstackbuf ( Relation  rel,
BTStack  stack,
BlockNumber  child 
)

Definition at line 2307 of file nbtinsert.c.

2308 {
2309  BlockNumber blkno;
2310  OffsetNumber start;
2311 
2312  blkno = stack->bts_blkno;
2313  start = stack->bts_offset;
2314 
2315  for (;;)
2316  {
2317  Buffer buf;
2318  Page page;
2319  BTPageOpaque opaque;
2320 
2321  buf = _bt_getbuf(rel, blkno, BT_WRITE);
2322  page = BufferGetPage(buf);
2323  opaque = BTPageGetOpaque(page);
2324 
2325  if (P_INCOMPLETE_SPLIT(opaque))
2326  {
2327  _bt_finish_split(rel, buf, stack->bts_parent);
2328  continue;
2329  }
2330 
2331  if (!P_IGNORE(opaque))
2332  {
2333  OffsetNumber offnum,
2334  minoff,
2335  maxoff;
2336  ItemId itemid;
2337  IndexTuple item;
2338 
2339  minoff = P_FIRSTDATAKEY(opaque);
2340  maxoff = PageGetMaxOffsetNumber(page);
2341 
2342  /*
2343  * start = InvalidOffsetNumber means "search the whole page". We
2344  * need this test anyway due to possibility that page has a high
2345  * key now when it didn't before.
2346  */
2347  if (start < minoff)
2348  start = minoff;
2349 
2350  /*
2351  * Need this check too, to guard against possibility that page
2352  * split since we visited it originally.
2353  */
2354  if (start > maxoff)
2355  start = OffsetNumberNext(maxoff);
2356 
2357  /*
2358  * These loops will check every item on the page --- but in an
2359  * order that's attuned to the probability of where it actually
2360  * is. Scan to the right first, then to the left.
2361  */
2362  for (offnum = start;
2363  offnum <= maxoff;
2364  offnum = OffsetNumberNext(offnum))
2365  {
2366  itemid = PageGetItemId(page, offnum);
2367  item = (IndexTuple) PageGetItem(page, itemid);
2368 
2369  if (BTreeTupleGetDownLink(item) == child)
2370  {
2371  /* Return accurate pointer to where link is now */
2372  stack->bts_blkno = blkno;
2373  stack->bts_offset = offnum;
2374  return buf;
2375  }
2376  }
2377 
2378  for (offnum = OffsetNumberPrev(start);
2379  offnum >= minoff;
2380  offnum = OffsetNumberPrev(offnum))
2381  {
2382  itemid = PageGetItemId(page, offnum);
2383  item = (IndexTuple) PageGetItem(page, itemid);
2384 
2385  if (BTreeTupleGetDownLink(item) == child)
2386  {
2387  /* Return accurate pointer to where link is now */
2388  stack->bts_blkno = blkno;
2389  stack->bts_offset = offnum;
2390  return buf;
2391  }
2392  }
2393  }
2394 
2395  /*
2396  * The item we're looking for moved right at least one page.
2397  *
2398  * Lehman and Yao couple/chain locks when moving right here, which we
2399  * can avoid. See nbtree/README.
2400  */
2401  if (P_RIGHTMOST(opaque))
2402  {
2403  _bt_relbuf(rel, buf);
2404  return InvalidBuffer;
2405  }
2406  blkno = opaque->btpo_next;
2407  start = InvalidOffsetNumber;
2408  _bt_relbuf(rel, buf);
2409  }
2410 }

References _bt_finish_split(), _bt_getbuf(), _bt_relbuf(), BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, buf, BufferGetPage, InvalidBuffer, InvalidOffsetNumber, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetItem, PageGetItemId, and PageGetMaxOffsetNumber.

Referenced by _bt_insert_parent(), and _bt_lock_subtree_parent().
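
The typical calling pattern, sketched loosely after _bt_insert_parent(): after a page split, the parent recorded in the search stack may itself have split or moved right, so the caller re-finds the downlink before updating it. This is a simplified sketch; childbuf and the surrounding split bookkeeping are assumed to exist in the caller, and the error handling is abbreviated:

/* Sketch following the shape of _bt_insert_parent().  "stack" is the
 * descent stack from the original search; "child" is the block number
 * of the just-split left page whose downlink must be re-found. */
BlockNumber child = BufferGetBlockNumber(childbuf);   /* assumed caller state */
Buffer      pbuf  = _bt_getstackbuf(rel, stack, child);

if (pbuf == InvalidBuffer)
    elog(ERROR, "failed to re-find parent key in index \"%s\"",
         RelationGetRelationName(rel));

/* On success, pbuf is write-locked and stack->bts_blkno/bts_offset now
 * identify the downlink's current location; the caller can insert the
 * new downlink for the right sibling, then _bt_relbuf(rel, pbuf). */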

◆ _bt_gettrueroot()

Buffer _bt_gettrueroot ( Relation  rel)

Definition at line 577 of file nbtpage.c.

578 {
579  Buffer metabuf;
580  Page metapg;
581  BTPageOpaque metaopaque;
582  Buffer rootbuf;
583  Page rootpage;
584  BTPageOpaque rootopaque;
585  BlockNumber rootblkno;
586  uint32 rootlevel;
587  BTMetaPageData *metad;
588 
589  /*
590  * We don't try to use cached metapage data here, since (a) this path is
591  * not performance-critical, and (b) if we are here it suggests our cache
592  * is out-of-date anyway. In light of point (b), it's probably safest to
593  * actively flush any cached metapage info.
594  */
595  if (rel->rd_amcache)
596  pfree(rel->rd_amcache);
597  rel->rd_amcache = NULL;
598 
599  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
600  metapg = BufferGetPage(metabuf);
601  metaopaque = BTPageGetOpaque(metapg);
602  metad = BTPageGetMeta(metapg);
603 
604  if (!P_ISMETA(metaopaque) ||
605  metad->btm_magic != BTREE_MAGIC)
606  ereport(ERROR,
607  (errcode(ERRCODE_INDEX_CORRUPTED),
608  errmsg("index \"%s\" is not a btree",
609  RelationGetRelationName(rel))));
610 
611  if (metad->btm_version < BTREE_MIN_VERSION ||
612  metad->btm_version > BTREE_VERSION)
613  ereport(ERROR,
614  (errcode(ERRCODE_INDEX_CORRUPTED),
615  errmsg("version mismatch in index \"%s\": file version %d, "
616  "current version %d, minimal supported version %d",
617  RelationGetRelationName(rel),
618  metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
619 
620  /* if no root page initialized yet, fail */
621  if (metad->btm_root == P_NONE)
622  {
623  _bt_relbuf(rel, metabuf);
624  return InvalidBuffer;
625  }
626 
627  rootblkno = metad->btm_root;
628  rootlevel = metad->btm_level;
629 
630  /*
631  * We are done with the metapage; arrange to release it via first
632  * _bt_relandgetbuf call
633  */
634  rootbuf = metabuf;
635 
636  for (;;)
637  {
638  rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
639  rootpage = BufferGetPage(rootbuf);
640  rootopaque = BTPageGetOpaque(rootpage);
641 
642  if (!P_IGNORE(rootopaque))
643  break;
644 
645  /* it's dead, Jim. step right one page */
646  if (P_RIGHTMOST(rootopaque))
647  elog(ERROR, "no live root page found in index \"%s\"",
648  RelationGetRelationName(rel));
649  rootblkno = rootopaque->btpo_next;
650  }
651 
652  if (rootopaque->btpo_level != rootlevel)
653  elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
654  rootblkno, RelationGetRelationName(rel),
655  rootopaque->btpo_level, rootlevel);
656 
657  return rootbuf;
658 }

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_VERSION, BufferGetPage, elog, ereport, errcode(), errmsg(), ERROR, InvalidBuffer, P_IGNORE, P_ISMETA, P_NONE, P_RIGHTMOST, pfree(), RelationData::rd_amcache, and RelationGetRelationName.

Referenced by _bt_get_endpoint().
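
Unlike _bt_getroot(), which normally returns the cached "fast" root, this function always locates the true root and deliberately flushes any cached metapage first. A minimal sketch of the calling convention, assuming rel is an already-opened nbtree index:

/* Sketch only.  The returned buffer is pinned and read-locked;
 * InvalidBuffer means the index has no root page yet (it is empty). */
Buffer  rootbuf = _bt_gettrueroot(rel);

if (!BufferIsValid(rootbuf))
    return;                     /* nothing to descend */

/* ... walk down from the true root ... */
_bt_relbuf(rel, rootbuf);       /* release lock and pin when finished */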

◆ _bt_initmetapage()

void _bt_initmetapage ( Page  page,
BlockNumber  rootbknum,
uint32  level,
bool  allequalimage 
)

Definition at line 69 of file nbtpage.c.

71 {
72  BTMetaPageData *metad;
73  BTPageOpaque metaopaque;
74 
75  _bt_pageinit(page, BLCKSZ);
76 
77  metad = BTPageGetMeta(page);
78  metad->btm_magic = BTREE_MAGIC;
79  metad->btm_version = BTREE_VERSION;
80  metad->btm_root = rootbknum;
81  metad->btm_level = level;
82  metad->btm_fastroot = rootbknum;