PostgreSQL Source Code  git master
nbtree.h File Reference
#include "access/amapi.h"
#include "access/itup.h"
#include "access/sdir.h"
#include "access/tableam.h"
#include "access/xlogreader.h"
#include "catalog/pg_am_d.h"
#include "catalog/pg_index.h"
#include "lib/stringinfo.h"
#include "storage/bufmgr.h"
#include "storage/shm_toc.h"
Include dependency graph for nbtree.h:
This graph shows which files directly or indirectly include this file:


Data Structures

struct  BTPageOpaqueData
 
struct  BTMetaPageData
 
struct  BTDeletedPageData
 
struct  BTPendingFSM
 
struct  BTVacState
 
struct  BTStackData
 
struct  BTScanInsertData
 
struct  BTInsertStateData
 
struct  BTDedupInterval
 
struct  BTDedupStateData
 
struct  BTVacuumPostingData
 
struct  BTScanPosItem
 
struct  BTScanPosData
 
struct  BTArrayKeyInfo
 
struct  BTScanOpaqueData
 
struct  BTReadPageState
 
struct  BTOptions
 

Macros

#define BTPageGetOpaque(page)   ((BTPageOpaque) PageGetSpecialPointer(page))
 
#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */
 
#define BTP_ROOT   (1 << 1) /* root page (has no parent) */
 
#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */
 
#define BTP_META   (1 << 3) /* meta-page */
 
#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */
 
#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */
 
#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples (deprecated) */
 
#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */
 
#define BTP_HAS_FULLXID   (1 << 8) /* contains BTDeletedPageData */
 
#define MAX_BT_CYCLE_ID   0xFF7F
 
#define BTPageGetMeta(p)    ((BTMetaPageData *) PageGetContents(p))
 
#define BTREE_METAPAGE   0 /* first page is meta */
 
#define BTREE_MAGIC   0x053162 /* magic number in metapage */
 
#define BTREE_VERSION   4 /* current version number */
 
#define BTREE_MIN_VERSION   2 /* minimum supported version */
 
#define BTREE_NOVAC_VERSION   3 /* version with all meta fields set */
 
#define BTMaxItemSize(page)
 
#define BTMaxItemSizeNoHeapTid(page)
 
#define MaxTIDsPerBTreePage
 
#define BTREE_MIN_FILLFACTOR   10
 
#define BTREE_DEFAULT_FILLFACTOR   90
 
#define BTREE_NONLEAF_FILLFACTOR   70
 
#define BTREE_SINGLEVAL_FILLFACTOR   96
 
#define P_NONE   0
 
#define P_LEFTMOST(opaque)   ((opaque)->btpo_prev == P_NONE)
 
#define P_RIGHTMOST(opaque)   ((opaque)->btpo_next == P_NONE)
 
#define P_ISLEAF(opaque)   (((opaque)->btpo_flags & BTP_LEAF) != 0)
 
#define P_ISROOT(opaque)   (((opaque)->btpo_flags & BTP_ROOT) != 0)
 
#define P_ISDELETED(opaque)   (((opaque)->btpo_flags & BTP_DELETED) != 0)
 
#define P_ISMETA(opaque)   (((opaque)->btpo_flags & BTP_META) != 0)
 
#define P_ISHALFDEAD(opaque)   (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)
 
#define P_IGNORE(opaque)   (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
 
#define P_HAS_GARBAGE(opaque)   (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
 
#define P_INCOMPLETE_SPLIT(opaque)   (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
 
#define P_HAS_FULLXID(opaque)   (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
 
#define P_HIKEY   ((OffsetNumber) 1)
 
#define P_FIRSTKEY   ((OffsetNumber) 2)
 
#define P_FIRSTDATAKEY(opaque)   (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
 
#define INDEX_ALT_TID_MASK   INDEX_AM_RESERVED_BIT
 
#define BT_OFFSET_MASK   0x0FFF
 
#define BT_STATUS_OFFSET_MASK   0xF000
 
#define BT_PIVOT_HEAP_TID_ATTR   0x1000
 
#define BT_IS_POSTING   0x2000
 
#define BTreeTupleGetNAtts(itup, rel)
 
#define BTCommuteStrategyNumber(strat)   (BTMaxStrategyNumber + 1 - (strat))
 
#define BTORDER_PROC   1
 
#define BTSORTSUPPORT_PROC   2
 
#define BTINRANGE_PROC   3
 
#define BTEQUALIMAGE_PROC   4
 
#define BTOPTIONS_PROC   5
 
#define BTNProcs   5
 
#define BT_READ   BUFFER_LOCK_SHARE
 
#define BT_WRITE   BUFFER_LOCK_EXCLUSIVE
 
#define BTScanPosIsPinned(scanpos)
 
#define BTScanPosUnpin(scanpos)
 
#define BTScanPosUnpinIfPinned(scanpos)
 
#define BTScanPosIsValid(scanpos)
 
#define BTScanPosInvalidate(scanpos)
 
#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */
 
#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */
 
#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */
 
#define SK_BT_DESC   (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
 
#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
 
#define BTGetFillFactor(relation)
 
#define BTGetTargetPageFreeSpace(relation)    (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)
 
#define BTGetDeduplicateItems(relation)
 
#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4
 
#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5
 

Typedefs

typedef uint16 BTCycleId
 
typedef struct BTPageOpaqueData BTPageOpaqueData
 
typedef BTPageOpaqueData* BTPageOpaque
 
typedef struct BTMetaPageData BTMetaPageData
 
typedef struct BTDeletedPageData BTDeletedPageData
 
typedef struct BTPendingFSM BTPendingFSM
 
typedef struct BTVacState BTVacState
 
typedef struct BTStackData BTStackData
 
typedef BTStackData* BTStack
 
typedef struct BTScanInsertData BTScanInsertData
 
typedef BTScanInsertData* BTScanInsert
 
typedef struct BTInsertStateData BTInsertStateData
 
typedef BTInsertStateData* BTInsertState
 
typedef struct BTDedupInterval BTDedupInterval
 
typedef struct BTDedupStateData BTDedupStateData
 
typedef BTDedupStateData* BTDedupState
 
typedef struct BTVacuumPostingData BTVacuumPostingData
 
typedef BTVacuumPostingData* BTVacuumPosting
 
typedef struct BTScanPosItem BTScanPosItem
 
typedef struct BTScanPosData BTScanPosData
 
typedef BTScanPosData* BTScanPos
 
typedef struct BTArrayKeyInfo BTArrayKeyInfo
 
typedef struct BTScanOpaqueData BTScanOpaqueData
 
typedef BTScanOpaqueData* BTScanOpaque
 
typedef struct BTReadPageState BTReadPageState
 
typedef struct BTOptions BTOptions
 

Functions

static void BTPageSetDeleted (Page page, FullTransactionId safexid)
 
static FullTransactionId BTPageGetDeleteXid (Page page)
 
static bool BTPageIsRecyclable (Page page, Relation heaprel)
 
 StaticAssertDecl (BT_OFFSET_MASK >= INDEX_MAX_KEYS, "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS")
 
static bool BTreeTupleIsPivot (IndexTuple itup)
 
static bool BTreeTupleIsPosting (IndexTuple itup)
 
static void BTreeTupleSetPosting (IndexTuple itup, uint16 nhtids, int postingoffset)
 
static uint16 BTreeTupleGetNPosting (IndexTuple posting)
 
static uint32 BTreeTupleGetPostingOffset (IndexTuple posting)
 
static ItemPointer BTreeTupleGetPosting (IndexTuple posting)
 
static ItemPointer BTreeTupleGetPostingN (IndexTuple posting, int n)
 
static BlockNumber BTreeTupleGetDownLink (IndexTuple pivot)
 
static void BTreeTupleSetDownLink (IndexTuple pivot, BlockNumber blkno)
 
static void BTreeTupleSetNAtts (IndexTuple itup, uint16 nkeyatts, bool heaptid)
 
static BlockNumber BTreeTupleGetTopParent (IndexTuple leafhikey)
 
static void BTreeTupleSetTopParent (IndexTuple leafhikey, BlockNumber blkno)
 
static ItemPointer BTreeTupleGetHeapTID (IndexTuple itup)
 
static ItemPointer BTreeTupleGetMaxHeapTID (IndexTuple itup)
 
void btbuildempty (Relation index)
 
bool btinsert (Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo)
 
IndexScanDesc btbeginscan (Relation rel, int nkeys, int norderbys)
 
Size btestimateparallelscan (int nkeys, int norderbys)
 
void btinitparallelscan (void *target)
 
bool btgettuple (IndexScanDesc scan, ScanDirection dir)
 
int64 btgetbitmap (IndexScanDesc scan, TIDBitmap *tbm)
 
void btrescan (IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
 
void btparallelrescan (IndexScanDesc scan)
 
void btendscan (IndexScanDesc scan)
 
void btmarkpos (IndexScanDesc scan)
 
void btrestrpos (IndexScanDesc scan)
 
IndexBulkDeleteResult * btbulkdelete (IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
 
IndexBulkDeleteResult * btvacuumcleanup (IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
bool btcanreturn (Relation index, int attno)
 
bool _bt_parallel_seize (IndexScanDesc scan, BlockNumber *pageno, bool first)
 
void _bt_parallel_release (IndexScanDesc scan, BlockNumber scan_page)
 
void _bt_parallel_done (IndexScanDesc scan)
 
void _bt_parallel_primscan_schedule (IndexScanDesc scan, BlockNumber prev_scan_page)
 
void _bt_dedup_pass (Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, bool bottomupdedup)
 
bool _bt_bottomupdel_pass (Relation rel, Buffer buf, Relation heapRel, Size newitemsz)
 
void _bt_dedup_start_pending (BTDedupState state, IndexTuple base, OffsetNumber baseoff)
 
bool _bt_dedup_save_htid (BTDedupState state, IndexTuple itup)
 
Size _bt_dedup_finish_pending (Page newpage, BTDedupState state)
 
IndexTuple _bt_form_posting (IndexTuple base, ItemPointer htids, int nhtids)
 
void _bt_update_posting (BTVacuumPosting vacposting)
 
IndexTuple _bt_swap_posting (IndexTuple newitem, IndexTuple oposting, int postingoff)
 
bool _bt_doinsert (Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
 
void _bt_finish_split (Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
 
Buffer _bt_getstackbuf (Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
 
OffsetNumber _bt_findsplitloc (Relation rel, Page origpage, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
 
void _bt_initmetapage (Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
 
bool _bt_vacuum_needs_cleanup (Relation rel)
 
void _bt_set_cleanup_info (Relation rel, BlockNumber num_delpages)
 
void _bt_upgrademetapage (Page page)
 
Buffer _bt_getroot (Relation rel, Relation heaprel, int access)
 
Buffer _bt_gettrueroot (Relation rel)
 
int _bt_getrootheight (Relation rel)
 
void _bt_metaversion (Relation rel, bool *heapkeyspace, bool *allequalimage)
 
void _bt_checkpage (Relation rel, Buffer buf)
 
Buffer _bt_getbuf (Relation rel, BlockNumber blkno, int access)
 
Buffer _bt_allocbuf (Relation rel, Relation heaprel)
 
Buffer _bt_relandgetbuf (Relation rel, Buffer obuf, BlockNumber blkno, int access)
 
void _bt_relbuf (Relation rel, Buffer buf)
 
void _bt_lockbuf (Relation rel, Buffer buf, int access)
 
void _bt_unlockbuf (Relation rel, Buffer buf)
 
bool _bt_conditionallockbuf (Relation rel, Buffer buf)
 
void _bt_upgradelockbufcleanup (Relation rel, Buffer buf)
 
void _bt_pageinit (Page page, Size size)
 
void _bt_delitems_vacuum (Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
 
void _bt_delitems_delete_check (Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
 
void _bt_pagedel (Relation rel, Buffer leafbuf, BTVacState *vstate)
 
void _bt_pendingfsm_init (Relation rel, BTVacState *vstate, bool cleanuponly)
 
void _bt_pendingfsm_finalize (Relation rel, BTVacState *vstate)
 
BTStack _bt_search (Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
 
OffsetNumber _bt_binsrch_insert (Relation rel, BTInsertState insertstate)
 
int32 _bt_compare (Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost)
 
BTScanInsert _bt_mkscankey (Relation rel, IndexTuple itup)
 
void _bt_freestack (BTStack stack)
 
bool _bt_start_prim_scan (IndexScanDesc scan, ScanDirection dir)
 
void _bt_start_array_keys (IndexScanDesc scan, ScanDirection dir)
 
void _bt_preprocess_keys (IndexScanDesc scan)
 
bool _bt_checkkeys (IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, IndexTuple tuple, int tupnatts)
 
void _bt_killitems (IndexScanDesc scan)
 
BTCycleId _bt_vacuum_cycleid (Relation rel)
 
BTCycleId _bt_start_vacuum (Relation rel)
 
void _bt_end_vacuum (Relation rel)
 
void _bt_end_vacuum_callback (int code, Datum arg)
 
Size BTreeShmemSize (void)
 
void BTreeShmemInit (void)
 
bytea * btoptions (Datum reloptions, bool validate)
 
bool btproperty (Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull)
 
char * btbuildphasename (int64 phasenum)
 
IndexTuple _bt_truncate (Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
 
int _bt_keep_natts_fast (Relation rel, IndexTuple lastleft, IndexTuple firstright)
 
bool _bt_check_natts (Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
 
void _bt_check_third_page (Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
 
bool _bt_allequalimage (Relation rel, bool debugmessage)
 
bool btvalidate (Oid opclassoid)
 
void btadjustmembers (Oid opfamilyoid, Oid opclassoid, List *operators, List *functions)
 
IndexBuildResult * btbuild (Relation heap, Relation index, struct IndexInfo *indexInfo)
 
void _bt_parallel_build_main (dsm_segment *seg, shm_toc *toc)
 

Macro Definition Documentation

◆ BT_IS_POSTING

#define BT_IS_POSTING   0x2000

Definition at line 466 of file nbtree.h.

◆ BT_OFFSET_MASK

#define BT_OFFSET_MASK   0x0FFF

Definition at line 462 of file nbtree.h.

◆ BT_PIVOT_HEAP_TID_ATTR

#define BT_PIVOT_HEAP_TID_ATTR   0x1000

Definition at line 465 of file nbtree.h.

◆ BT_READ

#define BT_READ   BUFFER_LOCK_SHARE

Definition at line 719 of file nbtree.h.

◆ BT_STATUS_OFFSET_MASK

#define BT_STATUS_OFFSET_MASK   0xF000

Definition at line 463 of file nbtree.h.

◆ BT_WRITE

#define BT_WRITE   BUFFER_LOCK_EXCLUSIVE

Definition at line 720 of file nbtree.h.

◆ BTCommuteStrategyNumber

#define BTCommuteStrategyNumber (   strat)    (BTMaxStrategyNumber + 1 - (strat))

Definition at line 685 of file nbtree.h.

◆ BTEQUALIMAGE_PROC

#define BTEQUALIMAGE_PROC   4

Definition at line 710 of file nbtree.h.

◆ BTGetDeduplicateItems

#define BTGetDeduplicateItems (   relation)
Value:
(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
relation->rd_rel->relam == BTREE_AM_OID), \
((relation)->rd_options ? \
((BTOptions *) (relation)->rd_options)->deduplicate_items : true))

Definition at line 1146 of file nbtree.h.

◆ BTGetFillFactor

#define BTGetFillFactor (   relation)
Value:
(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
relation->rd_rel->relam == BTREE_AM_OID), \
(relation)->rd_options ? \
((BTOptions *) (relation)->rd_options)->fillfactor : \
BTREE_DEFAULT_FILLFACTOR)

Definition at line 1138 of file nbtree.h.

◆ BTGetTargetPageFreeSpace

#define BTGetTargetPageFreeSpace (   relation)     (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)

Definition at line 1144 of file nbtree.h.
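
For example, with the standard 8192-byte BLCKSZ and the default leaf fillfactor of 90, the target free space works out to 8192 * (100 - 90) / 100 = 819 bytes left unused on each page during inserts and splits.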

◆ BTINRANGE_PROC

#define BTINRANGE_PROC   3

Definition at line 709 of file nbtree.h.

◆ BTMaxItemSize

#define BTMaxItemSize (   page)
Value:
(MAXALIGN_DOWN((PageGetPageSize(page) - \
MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3) - \
MAXALIGN(sizeof(ItemPointerData)))

Definition at line 164 of file nbtree.h.

◆ BTMaxItemSizeNoHeapTid

#define BTMaxItemSizeNoHeapTid (   page)
Value:
MAXALIGN_DOWN((PageGetPageSize(page) - \
MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
Definition at line 169 of file nbtree.h.

◆ BTNProcs

#define BTNProcs   5

Definition at line 712 of file nbtree.h.

◆ BTOPTIONS_PROC

#define BTOPTIONS_PROC   5

Definition at line 711 of file nbtree.h.

◆ BTORDER_PROC

#define BTORDER_PROC   1

Definition at line 707 of file nbtree.h.

◆ BTP_DELETED

#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */

Definition at line 78 of file nbtree.h.

◆ BTP_HALF_DEAD

#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */

Definition at line 80 of file nbtree.h.

◆ BTP_HAS_FULLXID

#define BTP_HAS_FULLXID   (1 << 8) /* contains BTDeletedPageData */

Definition at line 84 of file nbtree.h.

◆ BTP_HAS_GARBAGE

#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples (deprecated) */

Definition at line 82 of file nbtree.h.

◆ BTP_INCOMPLETE_SPLIT

#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */

Definition at line 83 of file nbtree.h.

◆ BTP_LEAF

#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */

Definition at line 76 of file nbtree.h.

◆ BTP_META

#define BTP_META   (1 << 3) /* meta-page */

Definition at line 79 of file nbtree.h.

◆ BTP_ROOT

#define BTP_ROOT   (1 << 1) /* root page (has no parent) */

Definition at line 77 of file nbtree.h.

◆ BTP_SPLIT_END

#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */

Definition at line 81 of file nbtree.h.

◆ BTPageGetMeta

#define BTPageGetMeta (   p)     ((BTMetaPageData *) PageGetContents(p))

Definition at line 121 of file nbtree.h.

◆ BTPageGetOpaque

#define BTPageGetOpaque (   page)    ((BTPageOpaque) PageGetSpecialPointer(page))

Definition at line 73 of file nbtree.h.
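
The following sketch (illustrative only, not part of nbtree.h; the function name report_btree_page is hypothetical) shows the usual pattern of combining BTPageGetOpaque with the P_* inspection macros documented below. It assumes the caller has already read and locked the buffer containing the page:

/* Illustrative: classify a locked B-tree page via its special area */
static void
report_btree_page(Page page)
{
    BTPageOpaque opaque = BTPageGetOpaque(page);

    if (P_ISMETA(opaque))
        elog(DEBUG2, "meta page");
    else if (P_IGNORE(opaque))
        elog(DEBUG2, "deleted or half-dead page; scans must skip it");
    else if (P_ISLEAF(opaque))
        elog(DEBUG2, "%s leaf page, first data item at offset %u",
             P_RIGHTMOST(opaque) ? "rightmost" : "non-rightmost",
             (unsigned) P_FIRSTDATAKEY(opaque));
    else
        elog(DEBUG2, "internal page");
}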

◆ BTREE_DEFAULT_FILLFACTOR

#define BTREE_DEFAULT_FILLFACTOR   90

Definition at line 200 of file nbtree.h.

◆ BTREE_MAGIC

#define BTREE_MAGIC   0x053162 /* magic number in metapage */

Definition at line 149 of file nbtree.h.

◆ BTREE_METAPAGE

#define BTREE_METAPAGE   0 /* first page is meta */

Definition at line 148 of file nbtree.h.

◆ BTREE_MIN_FILLFACTOR

#define BTREE_MIN_FILLFACTOR   10

Definition at line 199 of file nbtree.h.

◆ BTREE_MIN_VERSION

#define BTREE_MIN_VERSION   2 /* minimum supported version */

Definition at line 151 of file nbtree.h.

◆ BTREE_NONLEAF_FILLFACTOR

#define BTREE_NONLEAF_FILLFACTOR   70

Definition at line 201 of file nbtree.h.

◆ BTREE_NOVAC_VERSION

#define BTREE_NOVAC_VERSION   3 /* version with all meta fields set */

Definition at line 152 of file nbtree.h.

◆ BTREE_SINGLEVAL_FILLFACTOR

#define BTREE_SINGLEVAL_FILLFACTOR   96

Definition at line 202 of file nbtree.h.

◆ BTREE_VERSION

#define BTREE_VERSION   4 /* current version number */

Definition at line 150 of file nbtree.h.

◆ BTreeTupleGetNAtts

#define BTreeTupleGetNAtts (   itup,
  rel 
)
Value:
( \
(BTreeTupleIsPivot(itup)) ? \
( \
ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_OFFSET_MASK \
) \
: \
IndexRelationGetNumberOfAttributes(rel) \
)

Definition at line 577 of file nbtree.h.
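
Pivot tuples store their attribute count in the offset field of t_tid, masked by BT_OFFSET_MASK, while non-pivot tuples implicitly carry every index attribute. A function form of the same logic may make the encoding easier to follow (a sketch for illustration; btree_tuple_get_natts is a hypothetical name, and the macro itself is the real interface):

/* Illustrative: what BTreeTupleGetNAtts(itup, rel) expands to */
static int
btree_tuple_get_natts(IndexTuple itup, Relation rel)
{
    if (BTreeTupleIsPivot(itup))
        return ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_OFFSET_MASK;

    /* non-pivot tuples have all of the index's attributes */
    return IndexRelationGetNumberOfAttributes(rel);
}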

◆ BTScanPosInvalidate

#define BTScanPosInvalidate (   scanpos)
Value:
do { \
(scanpos).currPage = InvalidBlockNumber; \
(scanpos).nextPage = InvalidBlockNumber; \
(scanpos).buf = InvalidBuffer; \
(scanpos).lsn = InvalidXLogRecPtr; \
(scanpos).nextTupleOffset = 0; \
} while (0)

Definition at line 1022 of file nbtree.h.

◆ BTScanPosIsPinned

#define BTScanPosIsPinned (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BufferIsValid((scanpos).buf) \
)

Definition at line 999 of file nbtree.h.

◆ BTScanPosIsValid

#define BTScanPosIsValid (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BlockNumberIsValid((scanpos).currPage) \
)

Definition at line 1016 of file nbtree.h.

◆ BTScanPosUnpin

#define BTScanPosUnpin (   scanpos)
Value:
do { \
ReleaseBuffer((scanpos).buf); \
(scanpos).buf = InvalidBuffer; \
} while (0)

Definition at line 1005 of file nbtree.h.

◆ BTScanPosUnpinIfPinned

#define BTScanPosUnpinIfPinned (   scanpos)
Value:
do { \
if (BTScanPosIsPinned(scanpos)) \
BTScanPosUnpin(scanpos); \
} while (0)

Definition at line 1010 of file nbtree.h.
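
These scan-position macros are designed to be used together: a position starts out invalidated, may hold a buffer pin while its leaf page is current, and must drop that pin exactly once when the position is abandoned. A minimal lifecycle sketch (illustrative; the real callers are the scan routines in nbtree.c and nbtsearch.c):

BTScanPosData pos;

BTScanPosInvalidate(pos);        /* no current page yet */
Assert(!BTScanPosIsValid(pos));

/* ... a scan would now set pos.currPage and possibly pin pos.buf ... */

BTScanPosUnpinIfPinned(pos);     /* safe whether or not a pin is held */
BTScanPosInvalidate(pos);        /* back to the initial state */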

◆ BTSORTSUPPORT_PROC

#define BTSORTSUPPORT_PROC   2

Definition at line 708 of file nbtree.h.

◆ INDEX_ALT_TID_MASK

#define INDEX_ALT_TID_MASK   INDEX_AM_RESERVED_BIT

Definition at line 459 of file nbtree.h.

◆ MAX_BT_CYCLE_ID

#define MAX_BT_CYCLE_ID   0xFF7F

Definition at line 93 of file nbtree.h.

◆ MaxTIDsPerBTreePage

#define MaxTIDsPerBTreePage
Value:
(int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \
sizeof(ItemPointerData))

Definition at line 185 of file nbtree.h.
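
Assuming a typical build (8192-byte BLCKSZ, 24-byte page header, 16-byte BTPageOpaqueData, 6-byte ItemPointerData), this evaluates to (8192 - 24 - 16) / 6 = 1358 TIDs per leaf page.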

◆ P_FIRSTDATAKEY

#define P_FIRSTDATAKEY (   opaque)    (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)

Definition at line 369 of file nbtree.h.

◆ P_FIRSTKEY

#define P_FIRSTKEY   ((OffsetNumber) 2)

Definition at line 368 of file nbtree.h.

◆ P_HAS_FULLXID

#define P_HAS_FULLXID (   opaque)    (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)

Definition at line 228 of file nbtree.h.

◆ P_HAS_GARBAGE

#define P_HAS_GARBAGE (   opaque)    (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)

Definition at line 226 of file nbtree.h.

◆ P_HIKEY

#define P_HIKEY   ((OffsetNumber) 1)

Definition at line 367 of file nbtree.h.

◆ P_IGNORE

#define P_IGNORE (   opaque)    (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)

Definition at line 225 of file nbtree.h.

◆ P_INCOMPLETE_SPLIT

#define P_INCOMPLETE_SPLIT (   opaque)    (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)

Definition at line 227 of file nbtree.h.

◆ P_ISDELETED

#define P_ISDELETED (   opaque)    (((opaque)->btpo_flags & BTP_DELETED) != 0)

Definition at line 222 of file nbtree.h.

◆ P_ISHALFDEAD

#define P_ISHALFDEAD (   opaque)    (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)

Definition at line 224 of file nbtree.h.

◆ P_ISLEAF

#define P_ISLEAF (   opaque)    (((opaque)->btpo_flags & BTP_LEAF) != 0)

Definition at line 220 of file nbtree.h.

◆ P_ISMETA

#define P_ISMETA (   opaque)    (((opaque)->btpo_flags & BTP_META) != 0)

Definition at line 223 of file nbtree.h.

◆ P_ISROOT

#define P_ISROOT (   opaque)    (((opaque)->btpo_flags & BTP_ROOT) != 0)

Definition at line 221 of file nbtree.h.

◆ P_LEFTMOST

#define P_LEFTMOST (   opaque)    ((opaque)->btpo_prev == P_NONE)

Definition at line 218 of file nbtree.h.

◆ P_NONE

#define P_NONE   0

Definition at line 212 of file nbtree.h.

◆ P_RIGHTMOST

#define P_RIGHTMOST (   opaque)    ((opaque)->btpo_next == P_NONE)

Definition at line 219 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN

#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2

Definition at line 1157 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_LEAF_LOAD

#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5

Definition at line 1160 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_1

#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3

Definition at line 1158 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_2

#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4

Definition at line 1159 of file nbtree.h.

◆ SK_BT_DESC

#define SK_BT_DESC   (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)

Definition at line 1127 of file nbtree.h.

◆ SK_BT_INDOPTION_SHIFT

#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */

Definition at line 1126 of file nbtree.h.

◆ SK_BT_NULLS_FIRST

#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)

Definition at line 1128 of file nbtree.h.

◆ SK_BT_REQBKWD

#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */

Definition at line 1125 of file nbtree.h.

◆ SK_BT_REQFWD

#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */

Definition at line 1124 of file nbtree.h.

Typedef Documentation

◆ BTArrayKeyInfo

◆ BTCycleId

typedef uint16 BTCycleId

Definition at line 29 of file nbtree.h.

◆ BTDedupInterval

◆ BTDedupState

typedef BTDedupStateData* BTDedupState

Definition at line 893 of file nbtree.h.

◆ BTDedupStateData

◆ BTDeletedPageData

◆ BTInsertState

typedef BTInsertStateData* BTInsertState

Definition at line 835 of file nbtree.h.

◆ BTInsertStateData

◆ BTMetaPageData

◆ BTOptions

typedef struct BTOptions BTOptions

◆ BTPageOpaque

typedef BTPageOpaqueData* BTPageOpaque

Definition at line 71 of file nbtree.h.

◆ BTPageOpaqueData

◆ BTPendingFSM

typedef struct BTPendingFSM BTPendingFSM

◆ BTReadPageState

◆ BTScanInsert

typedef BTScanInsertData* BTScanInsert

Definition at line 796 of file nbtree.h.

◆ BTScanInsertData

◆ BTScanOpaque

typedef BTScanOpaqueData* BTScanOpaque

Definition at line 1081 of file nbtree.h.

◆ BTScanOpaqueData

◆ BTScanPos

typedef BTScanPosData* BTScanPos

Definition at line 997 of file nbtree.h.

◆ BTScanPosData

typedef struct BTScanPosData BTScanPosData

◆ BTScanPosItem

typedef struct BTScanPosItem BTScanPosItem

◆ BTStack

typedef BTStackData* BTStack

Definition at line 739 of file nbtree.h.

◆ BTStackData

typedef struct BTStackData BTStackData

◆ BTVacState

typedef struct BTVacState BTVacState

◆ BTVacuumPosting

typedef BTVacuumPostingData* BTVacuumPosting

Definition at line 914 of file nbtree.h.

◆ BTVacuumPostingData

Function Documentation

◆ _bt_allequalimage()

bool _bt_allequalimage ( Relation  rel,
bool  debugmessage 
)

Definition at line 5129 of file nbtutils.c.

5130 {
5131  bool allequalimage = true;
5132 
5133  /* INCLUDE indexes can never support deduplication */
5134  if (IndexRelationGetNumberOfAttributes(rel) !=
5135  IndexRelationGetNumberOfKeyAttributes(rel))
5136  return false;
5137 
5138  for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++)
5139  {
5140  Oid opfamily = rel->rd_opfamily[i];
5141  Oid opcintype = rel->rd_opcintype[i];
5142  Oid collation = rel->rd_indcollation[i];
5143  Oid equalimageproc;
5144 
5145  equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype,
5146  BTEQUALIMAGE_PROC);
5147 
5148  /*
5149  * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to
5150  * be unsafe. Otherwise, actually call proc and see what it says.
5151  */
5152  if (!OidIsValid(equalimageproc) ||
5153  !DatumGetBool(OidFunctionCall1Coll(equalimageproc, collation,
5154  ObjectIdGetDatum(opcintype))))
5155  {
5156  allequalimage = false;
5157  break;
5158  }
5159  }
5160 
5161  if (debugmessage)
5162  {
5163  if (allequalimage)
5164  elog(DEBUG1, "index \"%s\" can safely use deduplication",
5165  RelationGetRelationName(rel));
5166  else
5167  elog(DEBUG1, "index \"%s\" cannot use deduplication",
5168  RelationGetRelationName(rel));
5169  }
5170 
5171  return allequalimage;
5172 }

References BTEQUALIMAGE_PROC, DatumGetBool(), DEBUG1, elog, get_opfamily_proc(), i, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, ObjectIdGetDatum(), OidFunctionCall1Coll(), OidIsValid, RelationData::rd_indcollation, RelationData::rd_opcintype, RelationData::rd_opfamily, and RelationGetRelationName.

Referenced by _bt_leafbuild(), bt_index_check_internal(), and btbuildempty().
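
This check is why, for example, deduplication is disabled for numeric columns (no BTEQUALIMAGE_PROC is registered, since equal values can carry different display scales) and for text columns under nondeterministic collations (btvarstrequalimage() reports false for them).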

◆ _bt_allocbuf()

Buffer _bt_allocbuf ( Relation  rel,
Relation  heaprel 
)

Definition at line 869 of file nbtpage.c.

870 {
871  Buffer buf;
872  BlockNumber blkno;
873  Page page;
874 
875  Assert(heaprel != NULL);
876 
877  /*
878  * First see if the FSM knows of any free pages.
879  *
880  * We can't trust the FSM's report unreservedly; we have to check that the
881  * page is still free. (For example, an already-free page could have been
882  * re-used between the time the last VACUUM scanned it and the time the
883  * VACUUM made its FSM updates.)
884  *
885  * In fact, it's worse than that: we can't even assume that it's safe to
886  * take a lock on the reported page. If somebody else has a lock on it,
887  * or even worse our own caller does, we could deadlock. (The own-caller
888  * scenario is actually not improbable. Consider an index on a serial or
889  * timestamp column. Nearly all splits will be at the rightmost page, so
890  * it's entirely likely that _bt_split will call us while holding a lock
891  * on the page most recently acquired from FSM. A VACUUM running
892  * concurrently with the previous split could well have placed that page
893  * back in FSM.)
894  *
895  * To get around that, we ask for only a conditional lock on the reported
896  * page. If we fail, then someone else is using the page, and we may
897  * reasonably assume it's not free. (If we happen to be wrong, the worst
898  * consequence is the page will be lost to use till the next VACUUM, which
899  * is no big problem.)
900  */
901  for (;;)
902  {
903  blkno = GetFreeIndexPage(rel);
904  if (blkno == InvalidBlockNumber)
905  break;
906  buf = ReadBuffer(rel, blkno);
907  if (_bt_conditionallockbuf(rel, buf))
908  {
909  page = BufferGetPage(buf);
910 
911  /*
912  * It's possible to find an all-zeroes page in an index. For
913  * example, a backend might successfully extend the relation one
914  * page and then crash before it is able to make a WAL entry for
915  * adding the page. If we find a zeroed page then reclaim it
916  * immediately.
917  */
918  if (PageIsNew(page))
919  {
920  /* Okay to use page. Initialize and return it. */
921  _bt_pageinit(page, BufferGetPageSize(buf));
922  return buf;
923  }
924 
925  if (BTPageIsRecyclable(page, heaprel))
926  {
927  /*
928  * If we are generating WAL for Hot Standby then create a WAL
929  * record that will allow us to conflict with queries running
930  * on standby, in case they have snapshots older than safexid
931  * value
932  */
933  if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
934  {
935  xl_btree_reuse_page xlrec_reuse;
936 
937  /*
938  * Note that we don't register the buffer with the record,
939  * because this operation doesn't modify the page (that
940  * already happened, back when VACUUM deleted the page).
941  * This record only exists to provide a conflict point for
942  * Hot Standby. See record REDO routine comments.
943  */
944  xlrec_reuse.locator = rel->rd_locator;
945  xlrec_reuse.block = blkno;
946  xlrec_reuse.snapshotConflictHorizon = BTPageGetDeleteXid(page);
947  xlrec_reuse.isCatalogRel =
948  RelationIsAccessibleInLogicalDecoding(heaprel);
949 
950  XLogBeginInsert();
951  XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);
952 
953  XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
954  }
955 
956  /* Okay to use page. Re-initialize and return it. */
957  _bt_pageinit(page, BufferGetPageSize(buf));
958  return buf;
959  }
960  elog(DEBUG2, "FSM returned nonrecyclable page");
961  _bt_relbuf(rel, buf);
962  }
963  else
964  {
965  elog(DEBUG2, "FSM returned nonlockable page");
966  /* couldn't get lock, so just drop pin */
967  ReleaseBuffer(buf);
968  }
969  }
970 
971  /*
972  * Extend the relation by one page. Need to use RBM_ZERO_AND_LOCK or we
973  * risk a race condition against btvacuumscan --- see comments therein.
974  * This forces us to repeat the valgrind request that _bt_lockbuf()
975  * otherwise would make, as we can't use _bt_lockbuf() without introducing
976  * a race.
977  */
978  buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);
979  if (!RelationUsesLocalBuffers(rel))
980  VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
981 
982  /* Initialize the new page before returning it */
983  page = BufferGetPage(buf);
984  Assert(PageIsNew(page));
985  _bt_pageinit(page, BufferGetPageSize(buf));
986 
987  return buf;
988 }

References _bt_conditionallockbuf(), _bt_pageinit(), _bt_relbuf(), Assert, xl_btree_reuse_page::block, BMR_REL, BTPageGetDeleteXid(), BTPageIsRecyclable(), buf, BufferGetPage(), BufferGetPageSize(), DEBUG2, EB_LOCK_FIRST, elog, ExtendBufferedRel(), GetFreeIndexPage(), InvalidBlockNumber, xl_btree_reuse_page::isCatalogRel, xl_btree_reuse_page::locator, MAIN_FORKNUM, PageIsNew(), RelationData::rd_locator, ReadBuffer(), RelationIsAccessibleInLogicalDecoding, RelationNeedsWAL, RelationUsesLocalBuffers, ReleaseBuffer(), SizeOfBtreeReusePage, xl_btree_reuse_page::snapshotConflictHorizon, VALGRIND_MAKE_MEM_DEFINED, XLOG_BTREE_REUSE_PAGE, XLogBeginInsert(), XLogInsert(), XLogRegisterData(), and XLogStandbyInfoActive.

Referenced by _bt_getroot(), _bt_newlevel(), and _bt_split().

◆ _bt_binsrch_insert()

OffsetNumber _bt_binsrch_insert ( Relation  rel,
BTInsertState  insertstate 
)

Definition at line 471 of file nbtsearch.c.

472 {
473  BTScanInsert key = insertstate->itup_key;
474  Page page;
475  BTPageOpaque opaque;
476  OffsetNumber low,
477  high,
478  stricthigh;
479  int32 result,
480  cmpval;
481 
482  page = BufferGetPage(insertstate->buf);
483  opaque = BTPageGetOpaque(page);
484 
485  Assert(P_ISLEAF(opaque));
486  Assert(!key->nextkey);
487  Assert(insertstate->postingoff == 0);
488 
489  if (!insertstate->bounds_valid)
490  {
491  /* Start new binary search */
492  low = P_FIRSTDATAKEY(opaque);
493  high = PageGetMaxOffsetNumber(page);
494  }
495  else
496  {
497  /* Restore result of previous binary search against same page */
498  low = insertstate->low;
499  high = insertstate->stricthigh;
500  }
501 
502  /* If there are no keys on the page, return the first available slot */
503  if (unlikely(high < low))
504  {
505  /* Caller can't reuse bounds */
506  insertstate->low = InvalidOffsetNumber;
507  insertstate->stricthigh = InvalidOffsetNumber;
508  insertstate->bounds_valid = false;
509  return low;
510  }
511 
512  /*
513  * Binary search to find the first key on the page >= scan key. (nextkey
514  * is always false when inserting).
515  *
516  * The loop invariant is: all slots before 'low' are < scan key, all slots
517  * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
518  * maintained to save additional search effort for caller.
519  *
520  * We can fall out when high == low.
521  */
522  if (!insertstate->bounds_valid)
523  high++; /* establish the loop invariant for high */
524  stricthigh = high; /* high initially strictly higher */
525 
526  cmpval = 1; /* !nextkey comparison value */
527 
528  while (high > low)
529  {
530  OffsetNumber mid = low + ((high - low) / 2);
531 
532  /* We have low <= mid < high, so mid points at a real slot */
533 
534  result = _bt_compare(rel, key, page, mid);
535 
536  if (result >= cmpval)
537  low = mid + 1;
538  else
539  {
540  high = mid;
541  if (result != 0)
542  stricthigh = high;
543  }
544 
545  /*
546  * If tuple at offset located by binary search is a posting list whose
547  * TID range overlaps with caller's scantid, perform posting list
548  * binary search to set postingoff for caller. Caller must split the
549  * posting list when postingoff is set. This should happen
550  * infrequently.
551  */
552  if (unlikely(result == 0 && key->scantid != NULL))
553  {
554  /*
555  * postingoff should never be set more than once per leaf page
556  * binary search. That would mean that there are duplicate table
557  * TIDs in the index, which is never okay. Check for that here.
558  */
559  if (insertstate->postingoff != 0)
560  ereport(ERROR,
561  (errcode(ERRCODE_INDEX_CORRUPTED),
562  errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
563  ItemPointerGetBlockNumber(key->scantid),
564  ItemPointerGetOffsetNumber(key->scantid),
565  low, stricthigh,
566  BufferGetBlockNumber(insertstate->buf),
567  RelationGetRelationName(rel))));
568 
569  insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
570  }
571  }
572 
573  /*
574  * On a leaf page, a binary search always returns the first key >= scan
575  * key (at least in !nextkey case), which could be the last slot + 1. This
576  * is also the lower bound of cached search.
577  *
578  * stricthigh may also be the last slot + 1, which prevents caller from
579  * using bounds directly, but is still useful to us if we're called a
580  * second time with cached bounds (cached low will be < stricthigh when
581  * that happens).
582  */
583  insertstate->low = low;
584  insertstate->stricthigh = stricthigh;
585  insertstate->bounds_valid = true;
586 
587  return low;
588 }

References _bt_binsrch_posting(), _bt_compare(), Assert, BTInsertStateData::bounds_valid, BTPageGetOpaque, BTInsertStateData::buf, BufferGetBlockNumber(), BufferGetPage(), ereport, errcode(), errmsg_internal(), ERROR, InvalidOffsetNumber, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), BTInsertStateData::itup_key, sort-test::key, BTInsertStateData::low, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber(), BTInsertStateData::postingoff, RelationGetRelationName, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_check_unique(), _bt_findinsertloc(), and bt_rootdescend().

◆ _bt_bottomupdel_pass()

bool _bt_bottomupdel_pass ( Relation  rel,
Buffer  buf,
Relation  heapRel,
Size  newitemsz 
)

Definition at line 307 of file nbtdedup.c.

309 {
310  OffsetNumber offnum,
311  minoff,
312  maxoff;
313  Page page = BufferGetPage(buf);
314  BTPageOpaque opaque = BTPageGetOpaque(page);
315  BTDedupState state;
316  TM_IndexDeleteOp delstate;
317  bool neverdedup;
318  int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
319 
320  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
321  newitemsz += sizeof(ItemIdData);
322 
323  /* Initialize deduplication state */
324  state = (BTDedupState) palloc(sizeof(BTDedupStateData));
325  state->deduplicate = true;
326  state->nmaxitems = 0;
327  state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
328  state->base = NULL;
329  state->baseoff = InvalidOffsetNumber;
330  state->basetupsize = 0;
331  state->htids = palloc(state->maxpostingsize);
332  state->nhtids = 0;
333  state->nitems = 0;
334  state->phystupsize = 0;
335  state->nintervals = 0;
336 
337  /*
338  * Initialize tableam state that describes bottom-up index deletion
339  * operation.
340  *
341  * We'll go on to ask the tableam to search for TIDs whose index tuples we
342  * can safely delete. The tableam will search until our leaf page space
343  * target is satisfied, or until the cost of continuing with the tableam
344  * operation seems too high. It focuses its efforts on TIDs associated
345  * with duplicate index tuples that we mark "promising".
346  *
347  * This space target is a little arbitrary. The tableam must be able to
348  * keep the costs and benefits in balance. We provide the tableam with
349  * exhaustive information about what might work, without directly
350  * concerning ourselves with avoiding work during the tableam call. Our
351  * role in costing the bottom-up deletion process is strictly advisory.
352  */
353  delstate.irel = rel;
354  delstate.iblknum = BufferGetBlockNumber(buf);
355  delstate.bottomup = true;
356  delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
357  delstate.ndeltids = 0;
358  delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
359  delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
360 
361  minoff = P_FIRSTDATAKEY(opaque);
362  maxoff = PageGetMaxOffsetNumber(page);
363  for (offnum = minoff;
364  offnum <= maxoff;
365  offnum = OffsetNumberNext(offnum))
366  {
367  ItemId itemid = PageGetItemId(page, offnum);
368  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
369 
370  Assert(!ItemIdIsDead(itemid));
371 
372  if (offnum == minoff)
373  {
374  /* itup starts first pending interval */
375  _bt_dedup_start_pending(state, itup, offnum);
376  }
377  else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
378  _bt_dedup_save_htid(state, itup))
379  {
380  /* Tuple is equal; just added its TIDs to pending interval */
381  }
382  else
383  {
384  /* Finalize interval -- move its TIDs to delete state */
385  _bt_bottomupdel_finish_pending(page, state, &delstate);
386 
387  /* itup starts new pending interval */
388  _bt_dedup_start_pending(state, itup, offnum);
389  }
390  }
391  /* Finalize final interval -- move its TIDs to delete state */
392  _bt_bottomupdel_finish_pending(page, state, &delstate);
393 
394  /*
395  * We don't give up now in the event of having few (or even zero)
396  * promising tuples for the tableam because it's not up to us as the index
397  * AM to manage costs (note that the tableam might have heuristics of its
398  * own that work out what to do). We should at least avoid having our
399  * caller do a useless deduplication pass after we return in the event of
400  * zero promising tuples, though.
401  */
402  neverdedup = false;
403  if (state->nintervals == 0)
404  neverdedup = true;
405 
406  pfree(state->htids);
407  pfree(state);
408 
409  /* Ask tableam which TIDs are deletable, then physically delete them */
410  _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
411 
412  pfree(delstate.deltids);
413  pfree(delstate.status);
414 
415  /* Report "success" to caller unconditionally to avoid deduplication */
416  if (neverdedup)
417  return true;
418 
419  /* Don't dedup when we won't end up back here any time soon anyway */
420  return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
421 }

References _bt_bottomupdel_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_delitems_delete_check(), _bt_keep_natts_fast(), Assert, TM_IndexDeleteOp::bottomup, TM_IndexDeleteOp::bottomupfreespace, BTPageGetOpaque, buf, BufferGetBlockNumber(), BufferGetPage(), TM_IndexDeleteOp::deltids, TM_IndexDeleteOp::iblknum, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, TM_IndexDeleteOp::irel, ItemIdIsDead, Max, MaxTIDsPerBTreePage, TM_IndexDeleteOp::ndeltids, OffsetNumberNext, P_FIRSTDATAKEY, PageGetExactFreeSpace(), PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), palloc(), pfree(), and TM_IndexDeleteOp::status.

Referenced by _bt_delete_or_dedup_one_page().

◆ _bt_check_natts()

bool _bt_check_natts ( Relation  rel,
bool  heapkeyspace,
Page  page,
OffsetNumber  offnum 
)

Definition at line 4911 of file nbtutils.c.

4912 {
4913  int16 natts = IndexRelationGetNumberOfAttributes(rel);
4914  int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
4915  BTPageOpaque opaque = BTPageGetOpaque(page);
4916  IndexTuple itup;
4917  int tupnatts;
4918 
4919  /*
4920  * We cannot reliably test a deleted or half-dead page, since they have
4921  * dummy high keys
4922  */
4923  if (P_IGNORE(opaque))
4924  return true;
4925 
4926  Assert(offnum >= FirstOffsetNumber &&
4927  offnum <= PageGetMaxOffsetNumber(page));
4928 
4929  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
4930  tupnatts = BTreeTupleGetNAtts(itup, rel);
4931 
4932  /* !heapkeyspace indexes do not support deduplication */
4933  if (!heapkeyspace && BTreeTupleIsPosting(itup))
4934  return false;
4935 
4936  /* Posting list tuples should never have "pivot heap TID" bit set */
4937  if (BTreeTupleIsPosting(itup) &&
4938  (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
4939  BT_PIVOT_HEAP_TID_ATTR) != 0)
4940  return false;
4941 
4942  /* INCLUDE indexes do not support deduplication */
4943  if (natts != nkeyatts && BTreeTupleIsPosting(itup))
4944  return false;
4945 
4946  if (P_ISLEAF(opaque))
4947  {
4948  if (offnum >= P_FIRSTDATAKEY(opaque))
4949  {
4950  /*
4951  * Non-pivot tuple should never be explicitly marked as a pivot
4952  * tuple
4953  */
4954  if (BTreeTupleIsPivot(itup))
4955  return false;
4956 
4957  /*
4958  * Leaf tuples that are not the page high key (non-pivot tuples)
4959  * should never be truncated. (Note that tupnatts must have been
4960  * inferred, even with a posting list tuple, because only pivot
4961  * tuples store tupnatts directly.)
4962  */
4963  return tupnatts == natts;
4964  }
4965  else
4966  {
4967  /*
4968  * Rightmost page doesn't contain a page high key, so tuple was
4969  * checked above as ordinary leaf tuple
4970  */
4971  Assert(!P_RIGHTMOST(opaque));
4972 
4973  /*
4974  * !heapkeyspace high key tuple contains only key attributes. Note
4975  * that tupnatts will only have been explicitly represented in
4976  * !heapkeyspace indexes that happen to have non-key attributes.
4977  */
4978  if (!heapkeyspace)
4979  return tupnatts == nkeyatts;
4980 
4981  /* Use generic heapkeyspace pivot tuple handling */
4982  }
4983  }
4984  else /* !P_ISLEAF(opaque) */
4985  {
4986  if (offnum == P_FIRSTDATAKEY(opaque))
4987  {
4988  /*
4989  * The first tuple on any internal page (possibly the first after
4990  * its high key) is its negative infinity tuple. Negative
4991  * infinity tuples are always truncated to zero attributes. They
4992  * are a particular kind of pivot tuple.
4993  */
4994  if (heapkeyspace)
4995  return tupnatts == 0;
4996 
4997  /*
4998  * The number of attributes won't be explicitly represented if the
4999  * negative infinity tuple was generated during a page split that
5000  * occurred with a version of Postgres before v11. There must be
5001  * a problem when there is an explicit representation that is
5002  * non-zero, or when there is no explicit representation and the
5003  * tuple is evidently not a pre-pg_upgrade tuple.
5004  *
5005  * Prior to v11, downlinks always had P_HIKEY as their offset.
5006  * Accept that as an alternative indication of a valid
5007  * !heapkeyspace negative infinity tuple.
5008  */
5009  return tupnatts == 0 ||
5010  ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY;
5011  }
5012  else
5013  {
5014  /*
5015  * !heapkeyspace downlink tuple with separator key contains only
5016  * key attributes. Note that tupnatts will only have been
5017  * explicitly represented in !heapkeyspace indexes that happen to
5018  * have non-key attributes.
5019  */
5020  if (!heapkeyspace)
5021  return tupnatts == nkeyatts;
5022 
5023  /* Use generic heapkeyspace pivot tuple handling */
5024  }
5025  }
5026 
5027  /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
5028  Assert(heapkeyspace);
5029 
5030  /*
5031  * Explicit representation of the number of attributes is mandatory with
5032  * heapkeyspace index pivot tuples, regardless of whether or not there are
5033  * non-key attributes.
5034  */
5035  if (!BTreeTupleIsPivot(itup))
5036  return false;
5037 
5038  /* Pivot tuple should not use posting list representation (redundant) */
5039  if (BTreeTupleIsPosting(itup))
5040  return false;
5041 
5042  /*
5043  * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
5044  * when any other key attribute is truncated
5045  */
5046  if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
5047  return false;
5048 
5049  /*
5050  * Pivot tuple must have at least one untruncated key attribute (minus
5051  * infinity pivot tuples are the only exception). Pivot tuples can never
5052  * represent that there is a value present for a key attribute that
5053  * exceeds pg_index.indnkeyatts for the index.
5054  */
5055  return tupnatts > 0 && tupnatts <= nkeyatts;
5056 }

References Assert, BT_PIVOT_HEAP_TID_ATTR, BTPageGetOpaque, BTreeTupleGetHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPivot(), BTreeTupleIsPosting(), FirstOffsetNumber, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, ItemPointerGetOffsetNumber(), ItemPointerGetOffsetNumberNoCheck(), P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_ISLEAF, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and IndexTupleData::t_tid.

Referenced by _bt_compare(), and bt_target_page_check().

◆ _bt_check_third_page()

void _bt_check_third_page ( Relation  rel,
Relation  heap,
bool  needheaptidspace,
Page  page,
IndexTuple  newtup 
)

Definition at line 5071 of file nbtutils.c.

5073 {
5074  Size itemsz;
5075  BTPageOpaque opaque;
5076 
5077  itemsz = MAXALIGN(IndexTupleSize(newtup));
5078 
5079  /* Double check item size against limit */
5080  if (itemsz <= BTMaxItemSize(page))
5081  return;
5082 
5083  /*
5084  * Tuple is probably too large to fit on page, but it's possible that the
5085  * index uses version 2 or version 3, or that page is an internal page, in
5086  * which case a slightly higher limit applies.
5087  */
5088  if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page))
5089  return;
5090 
5091  /*
5092  * Internal page insertions cannot fail here, because that would mean that
5093  * an earlier leaf level insertion that should have failed didn't
5094  */
5095  opaque = BTPageGetOpaque(page);
5096  if (!P_ISLEAF(opaque))
5097  elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
5098  itemsz, RelationGetRelationName(rel));
5099 
5100  ereport(ERROR,
5101  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
5102  errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
5103  itemsz,
5104  needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
5105  needheaptidspace ? BTMaxItemSize(page) :
5106  BTMaxItemSizeNoHeapTid(page),
5108  errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
5109  ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)),
5110  ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)),
5111  RelationGetRelationName(heap)),
5112  errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
5113  "Consider a function index of an MD5 hash of the value, "
5114  "or use full text indexing."),
5115  errtableconstraint(rel, RelationGetRelationName(rel))));
5116 }

References BTMaxItemSize, BTMaxItemSizeNoHeapTid, BTPageGetOpaque, BTREE_NOVAC_VERSION, BTREE_VERSION, BTreeTupleGetHeapTID(), elog, ereport, errcode(), errdetail(), errhint(), errmsg(), ERROR, errtableconstraint(), IndexTupleSize, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), MAXALIGN, P_ISLEAF, and RelationGetRelationName.

Referenced by _bt_buildadd(), and _bt_findinsertloc().

◆ _bt_checkkeys()

bool _bt_checkkeys ( IndexScanDesc  scan,
BTReadPageState pstate,
bool  arrayKeys,
IndexTuple  tuple,
int  tupnatts 
)

Definition at line 3502 of file nbtutils.c.

3504 {
3505  TupleDesc tupdesc = RelationGetDescr(scan->indexRelation);
3506  BTScanOpaque so = (BTScanOpaque) scan->opaque;
3507  ScanDirection dir = pstate->dir;
3508  int ikey = 0;
3509  bool res;
3510 
3511  Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
3512 
3513  res = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc,
3514  arrayKeys, pstate->prechecked, pstate->firstmatch,
3515  &pstate->continuescan, &ikey);
3516 
3517 #ifdef USE_ASSERT_CHECKING
3518  if (!arrayKeys && so->numArrayKeys)
3519  {
3520  /*
3521  * This is a continuescan precheck call for a scan with array keys.
3522  *
3523  * Assert that the scan isn't in danger of becoming confused.
3524  */
3525  Assert(!so->scanBehind && !pstate->prechecked && !pstate->firstmatch);
3526  Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc,
3527  tupnatts, false, 0, NULL));
3528  }
3529  if (pstate->prechecked || pstate->firstmatch)
3530  {
3531  bool dcontinuescan;
3532  int dikey = 0;
3533 
3534  /*
3535  * Call relied on continuescan/firstmatch prechecks -- assert that we
3536  * get the same answer without those optimizations
3537  */
3538  Assert(res == _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc,
3539  false, false, false,
3540  &dcontinuescan, &dikey));
3541  Assert(pstate->continuescan == dcontinuescan);
3542  }
3543 #endif
3544 
3545  /*
3546  * Only one _bt_check_compare call is required in the common case where
3547  * there are no equality strategy array scan keys. Otherwise we can only
3548  * accept _bt_check_compare's answer unreservedly when it didn't set
3549  * pstate.continuescan=false.
3550  */
3551  if (!arrayKeys || pstate->continuescan)
3552  return res;
3553 
3554  /*
3555  * _bt_check_compare call set continuescan=false in the presence of
3556  * equality type array keys. This could mean that the tuple is just past
3557  * the end of matches for the current array keys.
3558  *
3559  * It's also possible that the scan is still _before_ the _start_ of
3560  * tuples matching the current set of array keys. Check for that first.
3561  */
3562  if (_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, true,
3563  ikey, NULL))
3564  {
3565  /*
3566  * Tuple is still before the start of matches according to the scan's
3567  * required array keys (according to _all_ of its required equality
3568  * strategy keys, actually).
3569  *
3570  * _bt_advance_array_keys occasionally sets so->scanBehind to signal
3571  * that the scan's current position/tuples might be significantly
3572  * behind (multiple pages behind) its current array keys. When this
3573  * happens, we need to be prepared to recover by starting a new
3574  * primitive index scan here, on our own.
3575  */
3576  Assert(!so->scanBehind ||
3577  so->keyData[0].sk_strategy == BTEqualStrategyNumber);
3578  if (unlikely(so->scanBehind) && pstate->finaltup &&
3579  _bt_tuple_before_array_skeys(scan, dir, pstate->finaltup, tupdesc,
3580  BTreeTupleGetNAtts(pstate->finaltup,
3581  scan->indexRelation),
3582  false, 0, NULL))
3583  {
3584  /* Cut our losses -- start a new primitive index scan now */
3585  pstate->continuescan = false;
3586  so->needPrimScan = true;
3587  }
3588  else
3589  {
3590  /* Override _bt_check_compare, continue primitive scan */
3591  pstate->continuescan = true;
3592 
3593  /*
3594  * We will end up here repeatedly given a group of tuples > the
3595  * previous array keys and < the now-current keys (for a backwards
3596  * scan it's just the same, though the operators swap positions).
3597  *
3598  * We must avoid allowing this linear search process to scan very
3599  * many tuples from well before the start of tuples matching the
3600  * current array keys (or from well before the point where we'll
3601  * once again have to advance the scan's array keys).
3602  *
3603  * We keep the overhead under control by speculatively "looking
3604  * ahead" to later still-unscanned items from this same leaf page.
3605  * We'll only attempt this once the number of tuples that the
3606  * linear search process has examined starts to get out of hand.
3607  */
3608  pstate->rechecks++;
3609  if (pstate->rechecks >= LOOK_AHEAD_REQUIRED_RECHECKS)
3610  {
3611  /* See if we should skip ahead within the current leaf page */
3612  _bt_checkkeys_look_ahead(scan, pstate, tupnatts, tupdesc);
3613 
3614  /*
3615  * Might have set pstate.skip to a later page offset. When
3616  * that happens then _bt_readpage caller will inexpensively
3617  * skip ahead to a later tuple from the same page (the one
3618  * just after the tuple we successfully "looked ahead" to).
3619  */
3620  }
3621  }
3622 
3623  /* This indextuple doesn't match the current qual, in any case */
3624  return false;
3625  }
3626 
3627  /*
3628  * Caller's tuple is >= the current set of array keys and other equality
3629  * constraint scan keys (or <= if this is a backwards scan). It's now
3630  * clear that we _must_ advance any required array keys in lockstep with
3631  * the scan.
3632  */
3633  return _bt_advance_array_keys(scan, pstate, tuple, tupnatts, tupdesc,
3634  ikey, true);
3635 }
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:1081
static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, bool advancenonrequired, bool prechecked, bool firstmatch, bool *continuescan, int *ikey)
Definition: nbtutils.c:3676
static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, int sktrig, bool sktrig_required)
Definition: nbtutils.c:1789
#define LOOK_AHEAD_REQUIRED_RECHECKS
Definition: nbtutils.c:32
static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, TupleDesc tupdesc, int tupnatts, bool readpagetup, int sktrig, bool *scanBehind)
Definition: nbtutils.c:1544
static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, int tupnatts, TupleDesc tupdesc)
Definition: nbtutils.c:4066
#define RelationGetDescr(relation)
Definition: rel.h:531
ScanDirection
Definition: sdir.h:25
#define BTEqualStrategyNumber
Definition: stratnum.h:31
bool firstmatch
Definition: nbtree.h:1108
bool continuescan
Definition: nbtree.h:1101
IndexTuple finaltup
Definition: nbtree.h:1092
bool prechecked
Definition: nbtree.h:1107
ScanDirection dir
Definition: nbtree.h:1089
int16 rechecks
Definition: nbtree.h:1114
bool needPrimScan
Definition: nbtree.h:1049
ScanKey keyData
Definition: nbtree.h:1045
Relation indexRelation
Definition: relscan.h:118
StrategyNumber sk_strategy
Definition: skey.h:68

References _bt_advance_array_keys(), _bt_check_compare(), _bt_checkkeys_look_ahead(), _bt_tuple_before_array_skeys(), Assert, BTEqualStrategyNumber, BTreeTupleGetNAtts, BTReadPageState::continuescan, BTReadPageState::dir, BTReadPageState::finaltup, BTReadPageState::firstmatch, IndexScanDescData::indexRelation, BTScanOpaqueData::keyData, LOOK_AHEAD_REQUIRED_RECHECKS, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, BTReadPageState::prechecked, BTReadPageState::rechecks, RelationGetDescr, res, BTScanOpaqueData::scanBehind, ScanKeyData::sk_strategy, and unlikely.

Referenced by _bt_readpage().
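
A hedged sketch of the calling pattern (readpage_sketch is hypothetical and loosely modeled on _bt_readpage(); the real caller also honors pstate->skip and saves matching items):

static void
readpage_sketch(IndexScanDesc scan, BTReadPageState *pstate, Page page,
                OffsetNumber minoff, OffsetNumber maxoff, bool arrayKeys)
{
    for (OffsetNumber off = minoff; off <= maxoff;
         off = OffsetNumberNext(off))
    {
        IndexTuple  itup = (IndexTuple) PageGetItem(page,
                                                    PageGetItemId(page, off));
        int         tupnatts = BTreeTupleGetNAtts(itup, scan->indexRelation);

        if (_bt_checkkeys(scan, pstate, arrayKeys, itup, tupnatts))
        {
            /* tuple satisfies the quals -- the real caller saves it here */
        }

        if (!pstate->continuescan)
            break;      /* no further matches on this page (or in the scan) */
    }
}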

◆ _bt_checkpage()

void _bt_checkpage ( Relation  rel,
Buffer  buf 
)

Definition at line 797 of file nbtpage.c.

798 {
799  Page page = BufferGetPage(buf);
800 
801  /*
802  * ReadBuffer verifies that every newly-read page passes
803  * PageHeaderIsValid, which means it either contains a reasonably sane
804  * page header or is all-zero. We have to defend against the all-zero
805  * case, however.
806  */
807  if (PageIsNew(page))
808  ereport(ERROR,
809  (errcode(ERRCODE_INDEX_CORRUPTED),
810  errmsg("index \"%s\" contains unexpected zero page at block %u",
811  RelationGetRelationName(rel),
812  BufferGetBlockNumber(buf)),
813  errhint("Please REINDEX it.")));
814 
815  /*
816  * Additionally check that the special area looks sane.
817  */
818  if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
819  ereport(ERROR,
820  (errcode(ERRCODE_INDEX_CORRUPTED),
821  errmsg("index \"%s\" contains corrupted page at block %u",
822  RelationGetRelationName(rel),
823  BufferGetBlockNumber(buf)),
824  errhint("Please REINDEX it.")));
825 }
static uint16 PageGetSpecialSize(Page page)
Definition: bufpage.h:316

References buf, BufferGetBlockNumber(), BufferGetPage(), ereport, errcode(), errhint(), errmsg(), ERROR, MAXALIGN, PageGetSpecialSize(), PageIsNew(), and RelationGetRelationName.

Referenced by _bt_getbuf(), _bt_relandgetbuf(), _bt_search_insert(), bt_recheck_sibling_links(), btvacuumpage(), and palloc_btree_page().
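
A hedged sketch of the usual usage pattern (read_and_check_sketch is hypothetical; _bt_getbuf() adds lock-mode handling and page recycling on top of this):

static Buffer
read_and_check_sketch(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);

    _bt_lockbuf(rel, buf, BT_READ);
    _bt_checkpage(rel, buf);    /* ereport(ERROR) on zeroed/corrupt pages */

    return buf;                 /* pinned and read-locked */
}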

◆ _bt_compare()

int32 _bt_compare ( Relation  rel,
BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)

Definition at line 685 of file nbtsearch.c.

689 {
690  TupleDesc itupdesc = RelationGetDescr(rel);
691  BTPageOpaque opaque = BTPageGetOpaque(page);
692  IndexTuple itup;
693  ItemPointer heapTid;
694  ScanKey scankey;
695  int ncmpkey;
696  int ntupatts;
697  int32 result;
698 
699  Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
700  Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
701  Assert(key->heapkeyspace || key->scantid == NULL);
702 
703  /*
704  * Force result ">" if target item is first data item on an internal page
705  * --- see NOTE above.
706  */
707  if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
708  return 1;
709 
710  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
711  ntupatts = BTreeTupleGetNAtts(itup, rel);
712 
713  /*
714  * The scan key is set up with the attribute number associated with each
715  * term in the key. It is important that, if the index is multi-key, the
716  * scan contain the first k key attributes, and that they be in order. If
717  * you think about how multi-key ordering works, you'll understand why
718  * this is.
719  *
720  * We don't test for violation of this condition here, however. The
721  * initial setup for the index scan had better have gotten it right (see
722  * _bt_first).
723  */
724 
725  ncmpkey = Min(ntupatts, key->keysz);
726  Assert(key->heapkeyspace || ncmpkey == key->keysz);
727  Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
728  scankey = key->scankeys;
729  for (int i = 1; i <= ncmpkey; i++)
730  {
731  Datum datum;
732  bool isNull;
733 
734  datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
735 
736  if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
737  {
738  if (isNull)
739  result = 0; /* NULL "=" NULL */
740  else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
741  result = -1; /* NULL "<" NOT_NULL */
742  else
743  result = 1; /* NULL ">" NOT_NULL */
744  }
745  else if (isNull) /* key is NOT_NULL and item is NULL */
746  {
747  if (scankey->sk_flags & SK_BT_NULLS_FIRST)
748  result = 1; /* NOT_NULL ">" NULL */
749  else
750  result = -1; /* NOT_NULL "<" NULL */
751  }
752  else
753  {
754  /*
755  * The sk_func needs to be passed the index value as left arg and
756  * the sk_argument as right arg (they might be of different
757  * types). Since it is convenient for callers to think of
758  * _bt_compare as comparing the scankey to the index item, we have
759  * to flip the sign of the comparison result. (Unless it's a DESC
760  * column, in which case we *don't* flip the sign.)
761  */
762  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
763  scankey->sk_collation,
764  datum,
765  scankey->sk_argument));
766 
767  if (!(scankey->sk_flags & SK_BT_DESC))
768  INVERT_COMPARE_RESULT(result);
769  }
770 
771  /* if the keys are unequal, return the difference */
772  if (result != 0)
773  return result;
774 
775  scankey++;
776  }
777 
778  /*
779  * All non-truncated attributes (other than heap TID) were found to be
780  * equal. Treat truncated attributes as minus infinity when scankey has a
781  * key attribute value that would otherwise be compared directly.
782  *
783  * Note: it doesn't matter if ntupatts includes non-key attributes;
784  * scankey won't, so explicitly excluding non-key attributes isn't
785  * necessary.
786  */
787  if (key->keysz > ntupatts)
788  return 1;
789 
790  /*
791  * Use the heap TID attribute and scantid to try to break the tie. The
792  * rules are the same as any other key attribute -- only the
793  * representation differs.
794  */
795  heapTid = BTreeTupleGetHeapTID(itup);
796  if (key->scantid == NULL)
797  {
798  /*
799  * Forward scans have a scankey that is considered greater than a
800  * truncated pivot tuple if and when the scankey has equal values for
801  * attributes up to and including the least significant untruncated
802  * attribute in tuple. Even attributes that were omitted from the
803  * scan key are considered greater than -inf truncated attributes.
804  * (See _bt_binsrch for an explanation of our backward scan behavior.)
805  *
806  * For example, if an index has the minimum two attributes (single
807  * user key attribute, plus heap TID attribute), and a page's high key
808  * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
809  * will not descend to the page to the left. The search will descend
810  * right instead. The truncated attribute in pivot tuple means that
811  * all non-pivot tuples on the page to the left are strictly < 'foo',
812  * so it isn't necessary to descend left. In other words, search
813  * doesn't have to descend left because it isn't interested in a match
814  * that has a heap TID value of -inf.
815  *
816  * Note: the heap TID part of the test ensures that scankey is being
817  * compared to a pivot tuple with one or more truncated -inf key
818  * attributes. The heap TID attribute is the last key attribute in
819  * every index, of course, but other than that it isn't special.
820  */
821  if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
822  key->heapkeyspace)
823  return 1;
824 
825  /* All provided scankey arguments found to be equal */
826  return 0;
827  }
828 
829  /*
830  * Treat truncated heap TID as minus infinity, since scankey has a key
831  * attribute value (scantid) that would otherwise be compared directly
832  */
833  Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
834  if (heapTid == NULL)
835  return 1;
836 
837  /*
838  * Scankey must be treated as equal to a posting list tuple if its scantid
839  * value falls within the range of the posting list. In all other cases
840  * there can only be a single heap TID value, which is compared directly
841  * with scantid.
842  */
843  Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
844  result = ItemPointerCompare(key->scantid, heapTid);
845  if (result <= 0 || !BTreeTupleIsPosting(itup))
846  return result;
847  else
848  {
849  result = ItemPointerCompare(key->scantid,
850  BTreeTupleGetMaxHeapTID(itup));
851  if (result > 0)
852  return 1;
853  }
854 
855  return 0;
856 }
#define Min(x, y)
Definition: c.h:1004
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1106
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1149
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:51
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: itup.h:117
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:1128
#define SK_BT_DESC
Definition: nbtree.h:1127
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:664
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:4911
uintptr_t Datum
Definition: postgres.h:64
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:202
#define SK_ISNULL
Definition: skey.h:115
int sk_flags
Definition: skey.h:66
Datum sk_argument
Definition: skey.h:72
FmgrInfo sk_func
Definition: skey.h:71
Oid sk_collation
Definition: skey.h:70
AttrNumber sk_attno
Definition: skey.h:67

References _bt_check_natts(), Assert, BTPageGetOpaque, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPosting(), DatumGetInt32(), FunctionCall2Coll(), i, index_getattr(), IndexRelationGetNumberOfKeyAttributes, INVERT_COMPARE_RESULT, ItemPointerCompare(), key, Min, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem(), PageGetItemId(), RelationGetDescr, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_binsrch_insert(), _bt_check_unique(), _bt_findinsertloc(), _bt_moveright(), _bt_search_insert(), bt_rootdescend(), bt_target_page_check(), invariant_g_offset(), invariant_l_nontarget_offset(), invariant_l_offset(), and invariant_leq_offset().
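
A hedged sketch of how the three-way result drives a within-page binary search (binsrch_sketch is hypothetical and simplified; _bt_binsrch() additionally caches bounds and handles nextkey semantics):

static OffsetNumber
binsrch_sketch(Relation rel, BTScanInsert key, Page page)
{
    BTPageOpaque opaque = BTPageGetOpaque(page);
    OffsetNumber low = P_FIRSTDATAKEY(opaque);
    OffsetNumber high = PageGetMaxOffsetNumber(page);

    /* find the first offset whose tuple is >= the insertion scan key */
    while (low <= high)
    {
        OffsetNumber mid = low + (high - low) / 2;

        if (_bt_compare(rel, key, page, mid) > 0)
            low = mid + 1;      /* scankey > tuple at mid: go right */
        else
            high = mid - 1;     /* scankey <= tuple at mid: go left */
    }

    return low;
}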

◆ _bt_conditionallockbuf()

bool _bt_conditionallockbuf ( Relation  rel,
Buffer  buf 
)

Definition at line 1093 of file nbtpage.c.

1094 {
1095  /* ConditionalLockBuffer() asserts that pin is held by this backend */
1096  if (!ConditionalLockBuffer(buf))
1097  return false;
1098 
1099  if (!RelationUsesLocalBuffers(rel))
1100  VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1101 
1102  return true;
1103 }
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:5166

References buf, BufferGetPage(), ConditionalLockBuffer(), RelationUsesLocalBuffers, and VALGRIND_MAKE_MEM_DEFINED.

Referenced by _bt_allocbuf(), and _bt_search_insert().
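
A hedged sketch of the opportunistic-locking idiom (try_fastpath_lock_sketch is hypothetical; _bt_search_insert() uses the same shape for its cached rightmost-leaf fastpath):

static bool
try_fastpath_lock_sketch(Relation rel, Buffer buf)
{
    if (!_bt_conditionallockbuf(rel, buf))
    {
        /* lock not immediately available: drop the pin and fall back to a
         * full root-to-leaf descent rather than blocking here */
        ReleaseBuffer(buf);
        return false;
    }

    _bt_checkpage(rel, buf);    /* cached block may have been recycled */
    return true;                /* caller proceeds under exclusive lock */
}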

◆ _bt_dedup_finish_pending()

Size _bt_dedup_finish_pending ( Page  newpage,
BTDedupState  state 
)

Definition at line 555 of file nbtdedup.c.

556 {
557  OffsetNumber tupoff;
558  Size tuplesz;
559  Size spacesaving;
560 
561  Assert(state->nitems > 0);
562  Assert(state->nitems <= state->nhtids);
563  Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
564 
565  tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
566  if (state->nitems == 1)
567  {
568  /* Use original, unchanged base tuple */
569  tuplesz = IndexTupleSize(state->base);
570  Assert(tuplesz == MAXALIGN(IndexTupleSize(state->base)));
571  Assert(tuplesz <= BTMaxItemSize(newpage));
572  if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
573  false, false) == InvalidOffsetNumber)
574  elog(ERROR, "deduplication failed to add tuple to page");
575 
576  spacesaving = 0;
577  }
578  else
579  {
580  IndexTuple final;
581 
582  /* Form a tuple with a posting list */
583  final = _bt_form_posting(state->base, state->htids, state->nhtids);
584  tuplesz = IndexTupleSize(final);
585  Assert(tuplesz <= state->maxpostingsize);
586 
587  /* Save final number of items for posting list */
588  state->intervals[state->nintervals].nitems = state->nitems;
589 
590  Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
591  Assert(tuplesz <= BTMaxItemSize(newpage));
592  if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
593  false) == InvalidOffsetNumber)
594  elog(ERROR, "deduplication failed to add tuple to page");
595 
596  pfree(final);
597  spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
598  /* Increment nintervals, since we wrote a new posting list tuple */
599  state->nintervals++;
600  Assert(spacesaving > 0 && spacesaving < BLCKSZ);
601  }
602 
603  /* Reset state for next pending posting list */
604  state->nhtids = 0;
605  state->nitems = 0;
606  state->phystupsize = 0;
607 
608  return spacesaving;
609 }
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:471
Pointer Item
Definition: item.h:17
IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
Definition: nbtdedup.c:864

References _bt_form_posting(), Assert, BTMaxItemSize, elog, ERROR, IndexTupleSize, InvalidOffsetNumber, MAXALIGN, OffsetNumberNext, PageAddItem, PageGetMaxOffsetNumber(), and pfree().

Referenced by _bt_dedup_pass(), and btree_xlog_dedup().

◆ _bt_dedup_pass()

void _bt_dedup_pass ( Relation  rel,
Buffer  buf,
IndexTuple  newitem,
Size  newitemsz,
bool  bottomupdedup 
)

Definition at line 58 of file nbtdedup.c.

60 {
61  OffsetNumber offnum,
62  minoff,
63  maxoff;
64  Page page = BufferGetPage(buf);
65  BTPageOpaque opaque = BTPageGetOpaque(page);
66  Page newpage;
67  BTDedupState state;
68  Size pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
69  bool singlevalstrat = false;
70  int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
71 
72  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
73  newitemsz += sizeof(ItemIdData);
74 
75  /*
76  * Initialize deduplication state.
77  *
78  * It would be possible for maxpostingsize (limit on posting list tuple
79  * size) to be set to one third of the page. However, it seems like a
80  * good idea to limit the size of posting lists to one sixth of a page.
81  * That ought to leave us with a good split point when pages full of
82  * duplicates can be split several times.
83  */
84  state = (BTDedupState) palloc(sizeof(BTDedupStateData));
85  state->deduplicate = true;
86  state->nmaxitems = 0;
87  state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
88  /* Metadata about base tuple of current pending posting list */
89  state->base = NULL;
90  state->baseoff = InvalidOffsetNumber;
91  state->basetupsize = 0;
92  /* Metadata about current pending posting list TIDs */
93  state->htids = palloc(state->maxpostingsize);
94  state->nhtids = 0;
95  state->nitems = 0;
96  /* Size of all physical tuples to be replaced by pending posting list */
97  state->phystupsize = 0;
98  /* nintervals should be initialized to zero */
99  state->nintervals = 0;
100 
101  minoff = P_FIRSTDATAKEY(opaque);
102  maxoff = PageGetMaxOffsetNumber(page);
103 
104  /*
105  * Consider applying "single value" strategy, though only if the page
106  * seems likely to be split in the near future
107  */
108  if (!bottomupdedup)
109  singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
110 
111  /*
112  * Deduplicate items from page, and write them to newpage.
113  *
114  * Copy the original page's LSN into newpage copy. This will become the
115  * updated version of the page. We need this because XLogInsert will
116  * examine the LSN and possibly dump it in a page image.
117  */
118  newpage = PageGetTempPageCopySpecial(page);
119  PageSetLSN(newpage, PageGetLSN(page));
120 
121  /* Copy high key, if any */
122  if (!P_RIGHTMOST(opaque))
123  {
124  ItemId hitemid = PageGetItemId(page, P_HIKEY);
125  Size hitemsz = ItemIdGetLength(hitemid);
126  IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid);
127 
128  if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
129  false, false) == InvalidOffsetNumber)
130  elog(ERROR, "deduplication failed to add highkey");
131  }
132 
133  for (offnum = minoff;
134  offnum <= maxoff;
135  offnum = OffsetNumberNext(offnum))
136  {
137  ItemId itemid = PageGetItemId(page, offnum);
138  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
139 
140  Assert(!ItemIdIsDead(itemid));
141 
142  if (offnum == minoff)
143  {
144  /*
145  * No previous/base tuple for the data item -- use the data item
146  * as base tuple of pending posting list
147  */
148  _bt_dedup_start_pending(state, itup, offnum);
149  }
150  else if (state->deduplicate &&
151  _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
152  _bt_dedup_save_htid(state, itup))
153  {
154  /*
155  * Tuple is equal to base tuple of pending posting list. Heap
156  * TID(s) for itup have been saved in state.
157  */
158  }
159  else
160  {
161  /*
162  * Tuple is not equal to pending posting list tuple, or
163  * _bt_dedup_save_htid() opted to not merge current item into
164  * pending posting list for some other reason (e.g., adding more
165  * TIDs would have caused posting list to exceed current
166  * maxpostingsize).
167  *
168  * If state contains pending posting list with more than one item,
169  * form new posting tuple and add it to our temp page (newpage).
170  * Else add pending interval's base tuple to the temp page as-is.
171  */
172  pagesaving += _bt_dedup_finish_pending(newpage, state);
173 
174  if (singlevalstrat)
175  {
176  /*
177  * Single value strategy's extra steps.
178  *
179  * Lower maxpostingsize for sixth and final large posting list
180  * tuple at the point where 5 maxpostingsize-capped tuples
181  * have either been formed or observed.
182  *
183  * When a sixth maxpostingsize-capped item is formed/observed,
184  * stop merging together tuples altogether. The few tuples
185  * that remain at the end of the page won't be merged together
186  * at all (at least not until after a future page split takes
187  * place, when this page's newly allocated right sibling page
188  * gets its first deduplication pass).
189  */
190  if (state->nmaxitems == 5)
191  _bt_singleval_fillfactor(page, state, newitemsz);
192  else if (state->nmaxitems == 6)
193  {
194  state->deduplicate = false;
195  singlevalstrat = false; /* won't be back here */
196  }
197  }
198 
199  /* itup starts new pending posting list */
200  _bt_dedup_start_pending(state, itup, offnum);
201  }
202  }
203 
204  /* Handle the last item */
205  pagesaving += _bt_dedup_finish_pending(newpage, state);
206 
207  /*
208  * If no items suitable for deduplication were found, newpage must be
209  * exactly the same as the original page, so just return from function.
210  *
211  * We could determine whether or not to proceed on the basis the space
212  * savings being sufficient to avoid an immediate page split instead. We
213  * don't do that because there is some small value in nbtsplitloc.c always
214  * operating against a page that is fully deduplicated (apart from
215  * newitem). Besides, most of the cost has already been paid.
216  */
217  if (state->nintervals == 0)
218  {
219  /* cannot leak memory here */
220  pfree(newpage);
221  pfree(state->htids);
222  pfree(state);
223  return;
224  }
225 
226  /*
227  * By here, it's clear that deduplication will definitely go ahead.
228  *
229  * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace
230  * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
231  * But keep things tidy.
232  */
233  if (P_HAS_GARBAGE(opaque))
234  {
235  BTPageOpaque nopaque = BTPageGetOpaque(newpage);
236 
237  nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
238  }
239 
240  START_CRIT_SECTION();
241 
242  PageRestoreTempPage(newpage, page);
243  MarkBufferDirty(buf);
244 
245  /* XLOG stuff */
246  if (RelationNeedsWAL(rel))
247  {
248  XLogRecPtr recptr;
249  xl_btree_dedup xlrec_dedup;
250 
251  xlrec_dedup.nintervals = state->nintervals;
252 
253  XLogBeginInsert();
254  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
255  XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);
256 
257  /*
258  * The intervals array is not in the buffer, but pretend that it is.
259  * When XLogInsert stores the whole buffer, the array need not be
260  * stored too.
261  */
262  XLogRegisterBufData(0, (char *) state->intervals,
263  state->nintervals * sizeof(BTDedupInterval));
264 
265  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
266 
267  PageSetLSN(page, recptr);
268  }
269 
270  END_CRIT_SECTION();
271 
272  /* Local space accounting should agree with page accounting */
273  Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
274 
275  /* cannot leak memory here */
276  pfree(state->htids);
277  pfree(state);
278 }
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2514
void PageRestoreTempPage(Page tempPage, Page oldPage)
Definition: bufpage.c:424
Page PageGetTempPageCopySpecial(Page page)
Definition: bufpage.c:402
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:391
static XLogRecPtr PageGetLSN(const char *page)
Definition: bufpage.h:386
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:182
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
#define INDEX_SIZE_MASK
Definition: itup.h:65
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state, OffsetNumber minoff, IndexTuple newitem)
Definition: nbtdedup.c:782
Size _bt_dedup_finish_pending(Page newpage, BTDedupState state)
Definition: nbtdedup.c:555
static void _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
Definition: nbtdedup.c:822
#define P_HAS_GARBAGE(opaque)
Definition: nbtree.h:226
#define BTP_HAS_GARBAGE
Definition: nbtree.h:82
#define XLOG_BTREE_DEDUP
Definition: nbtxlog.h:33
#define SizeOfBtreeDedup
Definition: nbtxlog.h:174
uint16 btpo_flags
Definition: nbtree.h:67
uint16 nintervals
Definition: nbtxlog.h:169
uint64 XLogRecPtr
Definition: xlogdefs.h:21
void XLogRegisterBufData(uint8 block_id, const char *data, uint32 len)
Definition: xloginsert.c:405
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:242
#define REGBUF_STANDARD
Definition: xloginsert.h:34

References _bt_dedup_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_do_singleval(), _bt_keep_natts_fast(), _bt_singleval_fillfactor(), Assert, BTMaxItemSize, BTP_HAS_GARBAGE, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, buf, BufferGetPage(), elog, END_CRIT_SECTION, ERROR, INDEX_SIZE_MASK, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, ItemIdGetLength, ItemIdIsDead, MarkBufferDirty(), Min, xl_btree_dedup::nintervals, OffsetNumberNext, P_FIRSTDATAKEY, P_HAS_GARBAGE, P_HIKEY, P_RIGHTMOST, PageAddItem, PageGetExactFreeSpace(), PageGetItem(), PageGetItemId(), PageGetLSN(), PageGetMaxOffsetNumber(), PageGetTempPageCopySpecial(), PageRestoreTempPage(), PageSetLSN(), palloc(), pfree(), PG_USED_FOR_ASSERTS_ONLY, REGBUF_STANDARD, RelationNeedsWAL, SizeOfBtreeDedup, START_CRIT_SECTION, XLOG_BTREE_DEDUP, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_delete_or_dedup_one_page().

◆ _bt_dedup_save_htid()

bool _bt_dedup_save_htid ( BTDedupState  state,
IndexTuple  itup 
)

Definition at line 484 of file nbtdedup.c.

485 {
486  int nhtids;
487  ItemPointer htids;
488  Size mergedtupsz;
489 
490  Assert(!BTreeTupleIsPivot(itup));
491 
492  if (!BTreeTupleIsPosting(itup))
493  {
494  nhtids = 1;
495  htids = &itup->t_tid;
496  }
497  else
498  {
499  nhtids = BTreeTupleGetNPosting(itup);
500  htids = BTreeTupleGetPosting(itup);
501  }
502 
503  /*
504  * Don't append (have caller finish pending posting list as-is) if
505  * appending heap TID(s) from itup would put us over maxpostingsize limit.
506  *
507  * This calculation needs to match the code used within _bt_form_posting()
508  * for new posting list tuples.
509  */
510  mergedtupsz = MAXALIGN(state->basetupsize +
511  (state->nhtids + nhtids) * sizeof(ItemPointerData));
512 
513  if (mergedtupsz > state->maxpostingsize)
514  {
515  /*
516  * Count this as an oversized item for single value strategy, though
517  * only when there are 50 TIDs in the final posting list tuple. This
518  * limit (which is fairly arbitrary) avoids confusion about how many
519  * 1/6 of a page tuples have been encountered/created by the current
520  * deduplication pass.
521  *
522  * Note: We deliberately don't consider which deduplication pass
523  * merged together tuples to create this item (could be a previous
524  * deduplication pass, or current pass). See _bt_do_singleval()
525  * comments.
526  */
527  if (state->nhtids > 50)
528  state->nmaxitems++;
529 
530  return false;
531  }
532 
533  /*
534  * Save heap TIDs to pending posting list tuple -- itup can be merged into
535  * pending posting list
536  */
537  state->nitems++;
538  memcpy(state->htids + state->nhtids, htids,
539  sizeof(ItemPointerData) * nhtids);
540  state->nhtids += nhtids;
541  state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
542 
543  return true;
544 }
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:518
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition: nbtree.h:537

References Assert, BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), IndexTupleSize, MAXALIGN, and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_load(), and btree_xlog_dedup().

◆ _bt_dedup_start_pending()

void _bt_dedup_start_pending ( BTDedupState  state,
IndexTuple  base,
OffsetNumber  baseoff 
)

Definition at line 433 of file nbtdedup.c.

435 {
436  Assert(state->nhtids == 0);
437  Assert(state->nitems == 0);
438  Assert(!BTreeTupleIsPivot(base));
439 
440  /*
441  * Copy heap TID(s) from new base tuple for new candidate posting list
442  * into working state's array
443  */
444  if (!BTreeTupleIsPosting(base))
445  {
446  memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
447  state->nhtids = 1;
448  state->basetupsize = IndexTupleSize(base);
449  }
450  else
451  {
452  int nposting;
453 
454  nposting = BTreeTupleGetNPosting(base);
455  memcpy(state->htids, BTreeTupleGetPosting(base),
456  sizeof(ItemPointerData) * nposting);
457  state->nhtids = nposting;
458  /* basetupsize should not include existing posting list */
459  state->basetupsize = BTreeTupleGetPostingOffset(base);
460  }
461 
462  /*
463  * Save new base tuple itself -- it'll be needed if we actually create a
464  * new posting list from new pending posting list.
465  *
466  * Must maintain physical size of all existing tuples (including line
467  * pointer overhead) so that we can calculate space savings on page.
468  */
469  state->nitems = 1;
470  state->base = base;
471  state->baseoff = baseoff;
472  state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
473  /* Also save baseoff in pending state for interval */
474  state->intervals[state->nintervals].baseoff = state->baseoff;
475 }
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:529

References Assert, BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), IndexTupleSize, MAXALIGN, and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_load(), and btree_xlog_dedup().
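
Taken together, _bt_dedup_start_pending(), _bt_dedup_save_htid(), and _bt_dedup_finish_pending() form a small protocol. A hedged sketch of one group's lifecycle (a hypothetical fragment; next_equal_tuple stands in for the page-scan logic that _bt_dedup_pass() implements inline):

Size        spacesaving = 0;

/* begin a pending posting list using a base tuple from the page */
_bt_dedup_start_pending(state, itup, offnum);

/* try to merge each following duplicate into the pending list */
while (next_equal_tuple(page, &itup, &offnum))  /* hypothetical iterator */
{
    if (!_bt_dedup_save_htid(state, itup))
        break;      /* would exceed maxpostingsize: finish the group as-is */
}

/* emit the merged posting tuple (or the unchanged base tuple) */
spacesaving += _bt_dedup_finish_pending(newpage, state);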

◆ _bt_delitems_delete_check()

void _bt_delitems_delete_check ( Relation  rel,
Buffer  buf,
Relation  heapRel,
TM_IndexDeleteOp delstate 
)

Definition at line 1513 of file nbtpage.c.

1515 {
1516  Page page = BufferGetPage(buf);
1517  TransactionId snapshotConflictHorizon;
1518  bool isCatalogRel;
1519  OffsetNumber postingidxoffnum = InvalidOffsetNumber;
1520  int ndeletable = 0,
1521  nupdatable = 0;
1522  OffsetNumber deletable[MaxIndexTuplesPerPage];
1523  BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1524 
1525  /* Use tableam interface to determine which tuples to delete first */
1526  snapshotConflictHorizon = table_index_delete_tuples(heapRel, delstate);
1527  isCatalogRel = RelationIsAccessibleInLogicalDecoding(heapRel);
1528 
1529  /* Should not WAL-log snapshotConflictHorizon unless it's required */
1530  if (!XLogStandbyInfoActive())
1531  snapshotConflictHorizon = InvalidTransactionId;
1532 
1533  /*
1534  * Construct a leaf-page-wise description of what _bt_delitems_delete()
1535  * needs to do to physically delete index tuples from the page.
1536  *
1537  * Must sort deltids array to restore leaf-page-wise order (original order
1538  * before call to tableam). This is the order that the loop expects.
1539  *
1540  * Note that deltids array might be a lot smaller now. It might even have
1541  * no entries at all (with bottom-up deletion caller), in which case there
1542  * is nothing left to do.
1543  */
1544  qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
1545  _bt_delitems_cmp);
1546  if (delstate->ndeltids == 0)
1547  {
1548  Assert(delstate->bottomup);
1549  return;
1550  }
1551 
1552  /* We definitely have to delete at least one index tuple (or one TID) */
1553  for (int i = 0; i < delstate->ndeltids; i++)
1554  {
1555  TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
1556  OffsetNumber idxoffnum = dstatus->idxoffnum;
1557  ItemId itemid = PageGetItemId(page, idxoffnum);
1558  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
1559  int nestedi,
1560  nitem;
1561  BTVacuumPosting vacposting;
1562 
1563  Assert(OffsetNumberIsValid(idxoffnum));
1564 
1565  if (idxoffnum == postingidxoffnum)
1566  {
1567  /*
1568  * This deltid entry is a TID from a posting list tuple that has
1569  * already been completely processed
1570  */
1571  Assert(BTreeTupleIsPosting(itup));
1572  Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
1573  &delstate->deltids[i].tid) < 0);
1574  Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),
1575  &delstate->deltids[i].tid) >= 0);
1576  continue;
1577  }
1578 
1579  if (!BTreeTupleIsPosting(itup))
1580  {
1581  /* Plain non-pivot tuple */
1582  Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
1583  if (dstatus->knowndeletable)
1584  deletable[ndeletable++] = idxoffnum;
1585  continue;
1586  }
1587 
1588  /*
1589  * itup is a posting list tuple whose lowest deltids entry (which may
1590  * or may not be for the first TID from itup) is considered here now.
1591  * We should process all of the deltids entries for the posting list
1592  * together now, though (not just the lowest). Remember to skip over
1593  * later itup-related entries during later iterations of outermost
1594  * loop.
1595  */
1596  postingidxoffnum = idxoffnum; /* Remember work in outermost loop */
1597  nestedi = i; /* Initialize for first itup deltids entry */
1598  vacposting = NULL; /* Describes final action for itup */
1599  nitem = BTreeTupleGetNPosting(itup);
1600  for (int p = 0; p < nitem; p++)
1601  {
1602  ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
1603  int ptidcmp = -1;
1604 
1605  /*
1606  * This nested loop reuses work across ptid TIDs taken from itup.
1607  * We take advantage of the fact that both itup's TIDs and deltids
1608  * entries (within a single itup/posting list grouping) must both
1609  * be in ascending TID order.
1610  */
1611  for (; nestedi < delstate->ndeltids; nestedi++)
1612  {
1613  TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
1614  TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
1615 
1616  /* Stop once we get past all itup related deltids entries */
1617  Assert(tdstatus->idxoffnum >= idxoffnum);
1618  if (tdstatus->idxoffnum != idxoffnum)
1619  break;
1620 
1621  /* Skip past non-deletable itup related entries up front */
1622  if (!tdstatus->knowndeletable)
1623  continue;
1624 
1625  /* Entry is first partial ptid match (or an exact match)? */
1626  ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
1627  if (ptidcmp >= 0)
1628  {
1629  /* Greater than or equal (partial or exact) match... */
1630  break;
1631  }
1632  }
1633 
1634  /* ...exact ptid match to a deletable deltids entry? */
1635  if (ptidcmp != 0)
1636  continue;
1637 
1638  /* Exact match for deletable deltids entry -- ptid gets deleted */
1639  if (vacposting == NULL)
1640  {
1641  vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1642  nitem * sizeof(uint16));
1643  vacposting->itup = itup;
1644  vacposting->updatedoffset = idxoffnum;
1645  vacposting->ndeletedtids = 0;
1646  }
1647  vacposting->deletetids[vacposting->ndeletedtids++] = p;
1648  }
1649 
1650  /* Final decision on itup, a posting list tuple */
1651 
1652  if (vacposting == NULL)
1653  {
1654  /* No TIDs to delete from itup -- do nothing */
1655  }
1656  else if (vacposting->ndeletedtids == nitem)
1657  {
1658  /* Straight delete of itup (to delete all TIDs) */
1659  deletable[ndeletable++] = idxoffnum;
1660  /* Turns out we won't need granular information */
1661  pfree(vacposting);
1662  }
1663  else
1664  {
1665  /* Delete some (but not all) TIDs from itup */
1666  Assert(vacposting->ndeletedtids > 0 &&
1667  vacposting->ndeletedtids < nitem);
1668  updatable[nupdatable++] = vacposting;
1669  }
1670  }
1671 
1672  /* Physically delete tuples (or TIDs) using deletable (or updatable) */
1673  _bt_delitems_delete(rel, buf, snapshotConflictHorizon, isCatalogRel,
1674  deletable, ndeletable, updatable, nupdatable);
1675 
1676  /* be tidy */
1677  for (int i = 0; i < nupdatable; i++)
1678  pfree(updatable[i]);
1679 }
unsigned short uint16
Definition: c.h:505
uint32 TransactionId
Definition: c.h:652
bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2)
Definition: itemptr.c:35
#define MaxIndexTuplesPerPage
Definition: itup.h:165
static void _bt_delitems_delete(Relation rel, Buffer buf, TransactionId snapshotConflictHorizon, bool isCatalogRel, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
Definition: nbtpage.c:1284
static int _bt_delitems_cmp(const void *a, const void *b)
Definition: nbtpage.c:1464
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:544
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
#define qsort(a, b, c, d)
Definition: port.h:447
uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.h:911
uint16 ndeletedtids
Definition: nbtree.h:910
IndexTuple itup
Definition: nbtree.h:906
OffsetNumber updatedoffset
Definition: nbtree.h:907
ItemPointerData tid
Definition: tableam.h:213
bool knowndeletable
Definition: tableam.h:220
OffsetNumber idxoffnum
Definition: tableam.h:219
static TransactionId table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
Definition: tableam.h:1356
#define InvalidTransactionId
Definition: transam.h:31

References _bt_delitems_cmp(), _bt_delitems_delete(), Assert, TM_IndexDeleteOp::bottomup, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), buf, BufferGetPage(), BTVacuumPostingData::deletetids, TM_IndexDeleteOp::deltids, i, TM_IndexDelete::id, TM_IndexStatus::idxoffnum, InvalidOffsetNumber, InvalidTransactionId, ItemPointerCompare(), ItemPointerEquals(), BTVacuumPostingData::itup, TM_IndexStatus::knowndeletable, MaxIndexTuplesPerPage, BTVacuumPostingData::ndeletedtids, TM_IndexDeleteOp::ndeltids, OffsetNumberIsValid, PageGetItem(), PageGetItemId(), palloc(), pfree(), qsort, RelationIsAccessibleInLogicalDecoding, TM_IndexDeleteOp::status, IndexTupleData::t_tid, table_index_delete_tuples(), TM_IndexDelete::tid, BTVacuumPostingData::updatedoffset, and XLogStandbyInfoActive.

Referenced by _bt_bottomupdel_pass(), and _bt_simpledel_pass().
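
A hedged sketch of a caller (simpledel_sketch is hypothetical and heavily condensed from _bt_simpledel_pass(); filling deltids/status from the page's LP_DEAD items is elided, and the irel/iblknum fields are assumed per current git master):

static void
simpledel_sketch(Relation rel, Buffer buf, Relation heapRel)
{
    TM_IndexDeleteOp delstate;

    delstate.irel = rel;
    delstate.iblknum = BufferGetBlockNumber(buf);
    delstate.bottomup = false;          /* simple deletion, not bottom-up */
    delstate.bottomupfreespace = 0;
    delstate.ndeltids = 0;
    delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
    delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));

    /* ... append one deltids/status entry per known-dead or promising TID ... */

    _bt_delitems_delete_check(rel, buf, heapRel, &delstate);

    pfree(delstate.deltids);
    pfree(delstate.status);
}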

◆ _bt_delitems_vacuum()

void _bt_delitems_vacuum ( Relation  rel,
Buffer  buf,
OffsetNumber deletable,
int  ndeletable,
BTVacuumPosting updatable,
int  nupdatable 
)

Definition at line 1154 of file nbtpage.c.

1157 {
1158  Page page = BufferGetPage(buf);
1159  BTPageOpaque opaque;
1160  bool needswal = RelationNeedsWAL(rel);
1161  char *updatedbuf = NULL;
1162  Size updatedbuflen = 0;
1163  OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1164 
1165  /* Shouldn't be called unless there's something to do */
1166  Assert(ndeletable > 0 || nupdatable > 0);
1167 
1168  /* Generate new version of posting lists without deleted TIDs */
1169  if (nupdatable > 0)
1170  updatedbuf = _bt_delitems_update(updatable, nupdatable,
1171  updatedoffsets, &updatedbuflen,
1172  needswal);
1173 
1174  /* No ereport(ERROR) until changes are logged */
1175  START_CRIT_SECTION();
1176 
1177  /*
1178  * Handle posting tuple updates.
1179  *
1180  * Deliberately do this before handling simple deletes. If we did it the
1181  * other way around (i.e. WAL record order -- simple deletes before
1182  * updates) then we'd have to make compensating changes to the 'updatable'
1183  * array of offset numbers.
1184  *
1185  * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
1186  * happens to already be set. It's important that we not interfere with
1187  * any future simple index tuple deletion operations.
1188  */
1189  for (int i = 0; i < nupdatable; i++)
1190  {
1191  OffsetNumber updatedoffset = updatedoffsets[i];
1192  IndexTuple itup;
1193  Size itemsz;
1194 
1195  itup = updatable[i]->itup;
1196  itemsz = MAXALIGN(IndexTupleSize(itup));
1197  if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
1198  itemsz))
1199  elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1200  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1201  }
1202 
1203  /* Now handle simple deletes of entire tuples */
1204  if (ndeletable > 0)
1205  PageIndexMultiDelete(page, deletable, ndeletable);
1206 
1207  /*
1208  * We can clear the vacuum cycle ID since this page has certainly been
1209  * processed by the current vacuum scan.
1210  */
1211  opaque = BTPageGetOpaque(page);
1212  opaque->btpo_cycleid = 0;
1213 
1214  /*
1215  * Clear the BTP_HAS_GARBAGE page flag.
1216  *
1217  * This flag indicates the presence of LP_DEAD items on the page (though
1218  * not reliably). Note that we only rely on it with pg_upgrade'd
1219  * !heapkeyspace indexes. That's why clearing it here won't usually
1220  * interfere with simple index tuple deletion.
1221  */
1222  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1223 
1224  MarkBufferDirty(buf);
1225 
1226  /* XLOG stuff */
1227  if (needswal)
1228  {
1229  XLogRecPtr recptr;
1230  xl_btree_vacuum xlrec_vacuum;
1231 
1232  xlrec_vacuum.ndeleted = ndeletable;
1233  xlrec_vacuum.nupdated = nupdatable;
1234 
1235  XLogBeginInsert();
1237  XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
1238 
1239  if (ndeletable > 0)
1240  XLogRegisterBufData(0, (char *) deletable,
1241  ndeletable * sizeof(OffsetNumber));
1242 
1243  if (nupdatable > 0)
1244  {
1245  XLogRegisterBufData(0, (char *) updatedoffsets,
1246  nupdatable * sizeof(OffsetNumber));
1247  XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1248  }
1249 
1250  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
1251 
1252  PageSetLSN(page, recptr);
1253  }
1254 
1255  END_CRIT_SECTION();
1256 
1257  /* can't leak memory here */
1258  if (updatedbuf != NULL)
1259  pfree(updatedbuf);
1260  /* free tuples allocated within _bt_delitems_update() */
1261  for (int i = 0; i < nupdatable; i++)
1262  pfree(updatable[i]->itup);
1263 }
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:1161
bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, Item newtup, Size newsize)
Definition: bufpage.c:1405
#define PANIC
Definition: elog.h:42
static char * _bt_delitems_update(BTVacuumPosting *updatable, int nupdatable, OffsetNumber *updatedoffsets, Size *updatedbuflen, bool needswal)
Definition: nbtpage.c:1405
#define SizeOfBtreeVacuum
Definition: nbtxlog.h:234
#define XLOG_BTREE_VACUUM
Definition: nbtxlog.h:39
BTCycleId btpo_cycleid
Definition: nbtree.h:68
uint16 ndeleted
Definition: nbtxlog.h:222
uint16 nupdated
Definition: nbtxlog.h:223

References _bt_delitems_update(), Assert, BTP_HAS_GARBAGE, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, buf, BufferGetBlockNumber(), BufferGetPage(), elog, END_CRIT_SECTION, i, IndexTupleSize, BTVacuumPostingData::itup, MarkBufferDirty(), MAXALIGN, MaxIndexTuplesPerPage, xl_btree_vacuum::ndeleted, xl_btree_vacuum::nupdated, PageIndexMultiDelete(), PageIndexTupleOverwrite(), PageSetLSN(), PANIC, pfree(), REGBUF_STANDARD, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeVacuum, START_CRIT_SECTION, XLOG_BTREE_VACUUM, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by btvacuumpage().
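
A hedged sketch of the calling convention (a hypothetical fragment; btvacuumpage() fills these arrays while examining the page's tuples against the VACUUM callback):

OffsetNumber    deletable[MaxIndexTuplesPerPage];
BTVacuumPosting updatable[MaxIndexTuplesPerPage];
int             ndeletable = 0;
int             nupdatable = 0;

/* ... record fully dead tuples in deletable[], and posting list tuples
 * that only lose some TIDs (as BTVacuumPosting) in updatable[] ... */

if (ndeletable > 0 || nupdatable > 0)
    _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
                        updatable, nupdatable);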

◆ _bt_doinsert()

bool _bt_doinsert ( Relation  rel,
IndexTuple  itup,
IndexUniqueCheck  checkUnique,
bool  indexUnchanged,
Relation  heapRel 
)

Definition at line 102 of file nbtinsert.c.

105 {
106  bool is_unique = false;
107  BTInsertStateData insertstate;
108  BTScanInsert itup_key;
109  BTStack stack;
110  bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
111 
112  /* we need an insertion scan key to do our search, so build one */
113  itup_key = _bt_mkscankey(rel, itup);
114 
115  if (checkingunique)
116  {
117  if (!itup_key->anynullkeys)
118  {
119  /* No (heapkeyspace) scantid until uniqueness established */
120  itup_key->scantid = NULL;
121  }
122  else
123  {
124  /*
125  * Scan key for new tuple contains NULL key values. Bypass
126  * checkingunique steps. They are unnecessary because core code
127  * considers NULL unequal to every value, including NULL.
128  *
129  * This optimization avoids O(N^2) behavior within the
130  * _bt_findinsertloc() heapkeyspace path when a unique index has a
131  * large number of "duplicates" with NULL key values.
132  */
133  checkingunique = false;
134  /* Tuple is unique in the sense that core code cares about */
135  Assert(checkUnique != UNIQUE_CHECK_EXISTING);
136  is_unique = true;
137  }
138  }
139 
140  /*
141  * Fill in the BTInsertState working area, to track the current page and
142  * position within the page to insert on.
143  *
144  * Note that itemsz is passed down to lower level code that deals with
145  * inserting the item. It must be MAXALIGN()'d. This ensures that space
146  * accounting code consistently considers the alignment overhead that we
147  * expect PageAddItem() will add later. (Actually, index_form_tuple() is
148  * already conservative about alignment, but we don't rely on that from
149  * this distance. Besides, preserving the "true" tuple size in index
150  * tuple headers for the benefit of nbtsplitloc.c might happen someday.
151  * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
152  */
153  insertstate.itup = itup;
154  insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
155  insertstate.itup_key = itup_key;
156  insertstate.bounds_valid = false;
157  insertstate.buf = InvalidBuffer;
158  insertstate.postingoff = 0;
159 
160 search:
161 
162  /*
163  * Find and lock the leaf page that the tuple should be added to by
164  * searching from the root page. insertstate.buf will hold a buffer that
165  * is locked in exclusive mode afterwards.
166  */
167  stack = _bt_search_insert(rel, heapRel, &insertstate);
168 
169  /*
170  * checkingunique inserts are not allowed to go ahead when two tuples with
171  * equal key attribute values would be visible to new MVCC snapshots once
172  * the xact commits. Check for conflicts in the locked page/buffer (if
173  * needed) here.
174  *
175  * It might be necessary to check a page to the right in _bt_check_unique,
176  * though that should be very rare. In practice the first page the value
177  * could be on (with scantid omitted) is almost always also the only page
178  * that a matching tuple might be found on. This is due to the behavior
179  * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
180  * only be allowed to cross a page boundary when there is no candidate
181  * leaf page split point that avoids it. Also, _bt_check_unique can use
182  * the leaf page high key to determine that there will be no duplicates on
183  * the right sibling without actually visiting it (it uses the high key in
184  * cases where the new item happens to belong at the far right of the leaf
185  * page).
186  *
187  * NOTE: obviously, _bt_check_unique can only detect keys that are already
188  * in the index; so it cannot defend against concurrent insertions of the
189  * same key. We protect against that by means of holding a write lock on
190  * the first page the value could be on, with omitted/-inf value for the
191  * implicit heap TID tiebreaker attribute. Any other would-be inserter of
192  * the same key must acquire a write lock on the same page, so only one
193  * would-be inserter can be making the check at one time. Furthermore,
194  * once we are past the check we hold write locks continuously until we
195  * have performed our insertion, so no later inserter can fail to see our
196  * insertion. (This requires some care in _bt_findinsertloc.)
197  *
198  * If we must wait for another xact, we release the lock while waiting,
199  * and then must perform a new search.
200  *
201  * For a partial uniqueness check, we don't wait for the other xact. Just
202  * let the tuple in and return false for possibly non-unique, or true for
203  * definitely unique.
204  */
205  if (checkingunique)
206  {
207  TransactionId xwait;
208  uint32 speculativeToken;
209 
210  xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
211  &is_unique, &speculativeToken);
212 
213  if (unlikely(TransactionIdIsValid(xwait)))
214  {
215  /* Have to wait for the other guy ... */
216  _bt_relbuf(rel, insertstate.buf);
217  insertstate.buf = InvalidBuffer;
218 
219  /*
220  * If it's a speculative insertion, wait for it to finish (ie. to
221  * go ahead with the insertion, or kill the tuple). Otherwise
222  * wait for the transaction to finish as usual.
223  */
224  if (speculativeToken)
225  SpeculativeInsertionWait(xwait, speculativeToken);
226  else
227  XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
228 
229  /* start over... */
230  if (stack)
231  _bt_freestack(stack);
232  goto search;
233  }
234 
235  /* Uniqueness is established -- restore heap tid as scantid */
236  if (itup_key->heapkeyspace)
237  itup_key->scantid = &itup->t_tid;
238  }
239 
240  if (checkUnique != UNIQUE_CHECK_EXISTING)
241  {
242  OffsetNumber newitemoff;
243 
244  /*
245  * The only conflict predicate locking cares about for indexes is when
246  * an index tuple insert conflicts with an existing lock. We don't
247  * know the actual page we're going to insert on for sure just yet in
248  * checkingunique and !heapkeyspace cases, but it's okay to use the
249  * first page the value could be on (with scantid omitted) instead.
250  */
251  CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
252 
253  /*
254  * Do the insertion. Note that insertstate contains cached binary
255  * search bounds established within _bt_check_unique when insertion is
256  * checkingunique.
257  */
258  newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
259  indexUnchanged, stack, heapRel);
260  _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
261  stack, itup, insertstate.itemsz, newitemoff,
262  insertstate.postingoff, false);
263  }
264  else
265  {
266  /* just release the buffer */
267  _bt_relbuf(rel, insertstate.buf);
268  }
269 
270  /* be tidy */
271  if (stack)
272  _bt_freestack(stack);
273  pfree(itup_key);
274 
275  return is_unique;
276 }
unsigned int uint32
Definition: c.h:506
@ UNIQUE_CHECK_NO
Definition: genam.h:117
@ UNIQUE_CHECK_EXISTING
Definition: genam.h:120
void XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, XLTW_Oper oper)
Definition: lmgr.c:657
void SpeculativeInsertionWait(TransactionId xid, uint32 token)
Definition: lmgr.c:814
@ XLTW_InsertIndex
Definition: lmgr.h:31
static BTStack _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
Definition: nbtinsert.c:317
static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, bool indexUnchanged, BTStack stack, Relation heapRel)
Definition: nbtinsert.c:815
static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, Size itemsz, OffsetNumber newitemoff, int postingoff, bool split_only_page)
Definition: nbtinsert.c:1105
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition: nbtinsert.c:408
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:221
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:129
void CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno)
Definition: predicate.c:4321
IndexTuple itup
Definition: nbtree.h:811
ItemPointer scantid
Definition: nbtree.h:791
bool heapkeyspace
Definition: nbtree.h:786
bool anynullkeys
Definition: nbtree.h:788
#define TransactionIdIsValid(xid)
Definition: transam.h:41

References _bt_check_unique(), _bt_findinsertloc(), _bt_freestack(), _bt_insertonpg(), _bt_mkscankey(), _bt_relbuf(), _bt_search_insert(), BTScanInsertData::anynullkeys, Assert, BTInsertStateData::bounds_valid, BTInsertStateData::buf, BufferGetBlockNumber(), CheckForSerializableConflictIn(), BTScanInsertData::heapkeyspace, IndexTupleSize, InvalidBuffer, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, MAXALIGN, pfree(), BTInsertStateData::postingoff, BTScanInsertData::scantid, SpeculativeInsertionWait(), IndexTupleData::t_tid, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_NO, unlikely, XactLockTableWait(), and XLTW_InsertIndex.

Referenced by btinsert().
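
A hedged sketch of the caller's side (insert_sketch is hypothetical, simplified from btinsert(); aminsert plumbing and error handling are omitted):

static bool
insert_sketch(Relation rel, Datum *values, bool *isnull,
              ItemPointer ht_ctid, Relation heapRel,
              IndexUniqueCheck checkUnique, bool indexUnchanged)
{
    IndexTuple  itup;
    bool        result;

    /* form the index tuple; its t_tid carries the heap TID to index */
    itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
    itup->t_tid = *ht_ctid;

    result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);

    pfree(itup);
    return result;
}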

◆ _bt_end_vacuum()

void _bt_end_vacuum ( Relation  rel)

Definition at line 4473 of file nbtutils.c.

4474 {
4475  int i;
4476 
4477  LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
4478 
4479  /* Find the array entry */
4480  for (i = 0; i < btvacinfo->num_vacuums; i++)
4481  {
4482  BTOneVacInfo *vac = &btvacinfo->vacuums[i];
4483 
4484  if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
4485  vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
4486  {
4487  /* Remove it by shifting down the last entry */
4488  *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
4489  btvacinfo->num_vacuums--;
4490  break;
4491  }
4492  }
4493 
4494  LWLockRelease(BtreeVacuumLock);
4495 }
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1168
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1781
@ LW_EXCLUSIVE
Definition: lwlock.h:114
static BTVacInfo * btvacinfo
Definition: nbtutils.c:4369
LockRelId relid
Definition: nbtutils.c:4357
int num_vacuums
Definition: nbtutils.c:4364
BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtutils.c:4366
LockRelId lockRelId
Definition: rel.h:46
Oid relId
Definition: rel.h:40
Oid dbId
Definition: rel.h:41
LockInfoData rd_lockInfo
Definition: rel.h:114

References btvacinfo, LockRelId::dbId, i, LockInfoData::lockRelId, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTVacInfo::num_vacuums, RelationData::rd_lockInfo, BTOneVacInfo::relid, LockRelId::relId, and BTVacInfo::vacuums.

Referenced by _bt_end_vacuum_callback(), and btbulkdelete().

◆ _bt_end_vacuum_callback()

void _bt_end_vacuum_callback ( int  code,
Datum  arg 
)

Definition at line 4501 of file nbtutils.c.

4502 {
4503  _bt_end_vacuum((Relation) DatumGetPointer(arg));
4504 }
void _bt_end_vacuum(Relation rel)
Definition: nbtutils.c:4473
void * arg
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312

References _bt_end_vacuum(), arg, and DatumGetPointer().

Referenced by btbulkdelete().
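
A hedged sketch of how the start/end pair is kept exception-safe (a hypothetical fragment after the pattern in btbulkdelete(); PG_ENSURE_ERROR_CLEANUP arms the callback so an ERROR cannot leak the shared-memory entry):

BTCycleId   cycleid;

/* ensure the vacuum-cycle entry is removed even on error */
PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
{
    cycleid = _bt_start_vacuum(rel);

    /* ... btvacuumscan() would do the actual index scan here ... */
}
PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
_bt_end_vacuum(rel);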

◆ _bt_findsplitloc()

OffsetNumber _bt_findsplitloc ( Relation  rel,
Page  origpage,
OffsetNumber  newitemoff,
Size  newitemsz,
IndexTuple  newitem,
bool newitemonleft 
)

Definition at line 129 of file nbtsplitloc.c.

135 {
136  BTPageOpaque opaque;
137  int leftspace,
138  rightspace,
139  olddataitemstotal,
140  olddataitemstoleft,
141  perfectpenalty,
142  leaffillfactor;
143  FindSplitData state;
144  FindSplitStrat strategy;
145  ItemId itemid;
146  OffsetNumber offnum,
147  maxoff,
148  firstrightoff;
149  double fillfactormult;
150  bool usemult;
151  SplitPoint leftpage,
152  rightpage;
153 
154  opaque = BTPageGetOpaque(origpage);
155  maxoff = PageGetMaxOffsetNumber(origpage);
156 
157  /* Total free space available on a btree page, after fixed overhead */
158  leftspace = rightspace =
159  PageGetPageSize(origpage) - SizeOfPageHeaderData -
160  MAXALIGN(sizeof(BTPageOpaqueData));
161 
162  /* The right page will have the same high key as the old page */
163  if (!P_RIGHTMOST(opaque))
164  {
165  itemid = PageGetItemId(origpage, P_HIKEY);
166  rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
167  sizeof(ItemIdData));
168  }
169 
170  /* Count up total space in data items before actually scanning 'em */
171  olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(origpage);
172  leaffillfactor = BTGetFillFactor(rel);
173 
174  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
175  newitemsz += sizeof(ItemIdData);
176  state.rel = rel;
177  state.origpage = origpage;
178  state.newitem = newitem;
179  state.newitemsz = newitemsz;
180  state.is_leaf = P_ISLEAF(opaque);
181  state.is_rightmost = P_RIGHTMOST(opaque);
182  state.leftspace = leftspace;
183  state.rightspace = rightspace;
184  state.olddataitemstotal = olddataitemstotal;
185  state.minfirstrightsz = SIZE_MAX;
186  state.newitemoff = newitemoff;
187 
188  /* newitem cannot be a posting list item */
189  Assert(!BTreeTupleIsPosting(newitem));
190 
191  /*
192  * nsplits should never exceed maxoff because there will be at most as
193  * many candidate split points as there are points _between_ tuples, once
194  * you imagine that the new item is already on the original page (the
195  * final number of splits may be slightly lower because not all points
196  * between tuples will be legal).
197  */
198  state.maxsplits = maxoff;
199  state.splits = palloc(sizeof(SplitPoint) * state.maxsplits);
200  state.nsplits = 0;
201 
202  /*
203  * Scan through the data items and calculate space usage for a split at
204  * each possible position
205  */
206  olddataitemstoleft = 0;
207 
208  for (offnum = P_FIRSTDATAKEY(opaque);
209  offnum <= maxoff;
210  offnum = OffsetNumberNext(offnum))
211  {
212  Size itemsz;
213 
214  itemid = PageGetItemId(origpage, offnum);
215  itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
216 
217  /*
218  * When item offset number is not newitemoff, neither side of the
219  * split can be newitem. Record a split after the previous data item
220  * from original page, but before the current data item from original
221  * page. (_bt_recsplitloc() will reject the split when there are no
222  * previous items, which we rely on.)
223  */
224  if (offnum < newitemoff)
225  _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
226  else if (offnum > newitemoff)
227  _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
228  else
229  {
230  /*
231  * Record a split after all "offnum < newitemoff" original page
232  * data items, but before newitem
233  */
234  _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
235 
236  /*
237  * Record a split after newitem, but before data item from
238  * original page at offset newitemoff/current offset
239  */
240  _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
241  }
242 
243  olddataitemstoleft += itemsz;
244  }
245 
246  /*
247  * Record a split after all original page data items, but before newitem.
248  * (Though only when it's possible that newitem will end up alone on new
249  * right page.)
250  */
251  Assert(olddataitemstoleft == olddataitemstotal);
252  if (newitemoff > maxoff)
253  _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0);
254 
255  /*
256  * I believe it is not possible to fail to find a feasible split, but just
257  * in case ...
258  */
259  if (state.nsplits == 0)
260  elog(ERROR, "could not find a feasible split point for index \"%s\"",
261  RelationGetRelationName(rel));
262 
263  /*
264  * Start search for a split point among list of legal split points. Give
265  * primary consideration to equalizing available free space in each half
266  * of the split initially (start with default strategy), while applying
267  * rightmost and split-after-new-item optimizations where appropriate.
268  * Either of the two other fallback strategies may be required for cases
269  * with a large number of duplicates around the original/space-optimal
270  * split point.
271  *
272  * Default strategy gives some weight to suffix truncation in deciding a
273  * split point on leaf pages. It attempts to select a split point where a
274  * distinguishing attribute appears earlier in the new high key for the
275  * left side of the split, in order to maximize the number of trailing
276  * attributes that can be truncated away. Only candidate split points
277  * that imply an acceptable balance of free space on each side are
278  * considered. See _bt_defaultinterval().
279  */
280  if (!state.is_leaf)
281  {
282  /* fillfactormult only used on rightmost page */
283  usemult = state.is_rightmost;
284  fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0;
285  }
286  else if (state.is_rightmost)
287  {
288  /* Rightmost leaf page -- fillfactormult always used */
289  usemult = true;
290  fillfactormult = leaffillfactor / 100.0;
291  }
292  else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
293  {
294  /*
295  * New item inserted at rightmost point among a localized grouping on
296  * a leaf page -- apply "split after new item" optimization, either by
297  * applying leaf fillfactor multiplier, or by choosing the exact split
298  * point that leaves newitem as lastleft. (usemult is set for us.)
299  */
300  if (usemult)
301  {
302  /* fillfactormult should be set based on leaf fillfactor */
303  fillfactormult = leaffillfactor / 100.0;
304  }
305  else
306  {
307  /* find precise split point after newitemoff */
308  for (int i = 0; i < state.nsplits; i++)
309  {
310  SplitPoint *split = state.splits + i;
311 
312  if (split->newitemonleft &&
313  newitemoff == split->firstrightoff)
314  {
315  pfree(state.splits);
316  *newitemonleft = true;
317  return newitemoff;
318  }
319  }
320 
321  /*
322  * Cannot legally split after newitemoff; proceed with split
323  * without using fillfactor multiplier. This is defensive, and
324  * should never be needed in practice.
325  */
326  fillfactormult = 0.50;
327  }
328  }
329  else
330  {
331  /* Other leaf page. 50:50 page split. */
332  usemult = false;
333  /* fillfactormult not used, but be tidy */
334  fillfactormult = 0.50;
335  }
336 
337  /*
338  * Save leftmost and rightmost splits for page before original ordinal
339  * sort order is lost by delta/fillfactormult sort
340  */
341  leftpage = state.splits[0];
342  rightpage = state.splits[state.nsplits - 1];
343 
344  /* Give split points a fillfactormult-wise delta, and sort on deltas */
345  _bt_deltasortsplits(&state, fillfactormult, usemult);
346 
347  /* Determine split interval for default strategy */
348  state.interval = _bt_defaultinterval(&state);
349 
350  /*
351  * Determine if default strategy/split interval will produce a
352  * sufficiently distinguishing split, or if we should change strategies.
353  * Alternative strategies change the range of split points that are
354  * considered acceptable (split interval), and possibly change
355  * fillfactormult, in order to deal with pages with a large number of
356  * duplicates gracefully.
357  *
358  * Pass low and high splits for the entire page (actually, they're for an
359  * imaginary version of the page that includes newitem). These are used
360  * when the initial split interval encloses split points that are full of
361  * duplicates, and we need to consider if it's even possible to avoid
362  * appending a heap TID.
363  */
364  perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);
365 
366  if (strategy == SPLIT_DEFAULT)
367  {
368  /*
369  * Default strategy worked out (always works out with internal page).
370  * Original split interval still stands.
371  */
372  }
373 
374  /*
375  * Many duplicates strategy is used when a heap TID would otherwise be
376  * appended, but the page isn't completely full of logical duplicates.
377  *
378  * The split interval is widened to include all legal candidate split
379  * points. There might be as few as two distinct values in the whole-page
380  * split interval, though it's also possible that most of the values on
381  * the page are unique. The final split point will either be to the
382  * immediate left or to the immediate right of the group of duplicate
383  * tuples that enclose the first/delta-optimal split point (perfect
384  * penalty was set so that the lowest delta split point that avoids
385  * appending a heap TID will be chosen). Maximizing the number of
386  * attributes that can be truncated away is not a goal of the many
387  * duplicates strategy.
388  *
389  * Single value strategy is used when it is impossible to avoid appending
390  * a heap TID. It arranges to leave the left page very full. This
391  * maximizes space utilization in cases where tuples with the same
392  * attribute values span many pages. Newly inserted duplicates will tend
393  * to have higher heap TID values, so we'll end up splitting to the right
394  * consistently. (Single value strategy is harmless though not
395  * particularly useful with !heapkeyspace indexes.)
396  */
397  else if (strategy == SPLIT_MANY_DUPLICATES)
398  {
399  Assert(state.is_leaf);
400  /* Shouldn't try to truncate away extra user attributes */
401  Assert(perfectpenalty ==
402  IndexRelationGetNumberOfKeyAttributes(state.rel));
403  /* No need to resort splits -- no change in fillfactormult/deltas */
404  state.interval = state.nsplits;
405  }
406  else if (strategy == SPLIT_SINGLE_VALUE)
407  {
408  Assert(state.is_leaf);
409  /* Split near the end of the page */
410  usemult = true;
411  fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
412  /* Resort split points with new delta */
413  _bt_deltasortsplits(&state, fillfactormult, usemult);
414  /* Appending a heap TID is unavoidable, so interval of 1 is fine */
415  state.interval = 1;
416  }
417 
418  /*
419  * Search among acceptable split points (using final split interval) for
420  * the entry that has the lowest penalty, and is therefore expected to
421  * maximize fan-out. Sets *newitemonleft for us.
422  */
423  firstrightoff = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
424  strategy);
425  pfree(state.splits);
426 
427  return firstrightoff;
428 }
#define BTREE_SINGLEVAL_FILLFACTOR
Definition: nbtree.h:202
#define BTGetFillFactor(relation)
Definition: nbtree.h:1138
#define BTREE_NONLEAF_FILLFACTOR
Definition: nbtree.h:201
static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult, bool usemult)
Definition: nbtsplitloc.c:566
static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage, SplitPoint *rightpage, FindSplitStrat *strategy)
Definition: nbtsplitloc.c:934
static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, int leaffillfactor, bool *usemult)
Definition: nbtsplitloc.c:630
static void _bt_recsplitloc(FindSplitData *state, OffsetNumber firstrightoff, bool newitemonleft, int olddataitemstoleft, Size firstrightofforigpagetuplesz)
Definition: nbtsplitloc.c:449
FindSplitStrat
Definition: nbtsplitloc.c:21
@ SPLIT_DEFAULT
Definition: nbtsplitloc.c:23
@ SPLIT_MANY_DUPLICATES
Definition: nbtsplitloc.c:24
@ SPLIT_SINGLE_VALUE
Definition: nbtsplitloc.c:25
static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft, FindSplitStrat strategy)
Definition: nbtsplitloc.c:788
static int _bt_defaultinterval(FindSplitData *state)
Definition: nbtsplitloc.c:876
bool newitemonleft
Definition: nbtsplitloc.c:37
OffsetNumber firstrightoff
Definition: nbtsplitloc.c:36

References _bt_afternewitemoff(), _bt_bestsplitloc(), _bt_defaultinterval(), _bt_deltasortsplits(), _bt_recsplitloc(), _bt_strategy(), Assert, BTGetFillFactor, BTPageGetOpaque, BTREE_NONLEAF_FILLFACTOR, BTREE_SINGLEVAL_FILLFACTOR, BTreeTupleIsPosting(), elog, ERROR, SplitPoint::firstrightoff, i, IndexRelationGetNumberOfKeyAttributes, ItemIdGetLength, MAXALIGN, SplitPoint::newitemonleft, OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_RIGHTMOST, PageGetExactFreeSpace(), PageGetItemId(), PageGetMaxOffsetNumber(), PageGetPageSize(), palloc(), pfree(), RelationGetRelationName, SizeOfPageHeaderData, SPLIT_DEFAULT, SPLIT_MANY_DUPLICATES, and SPLIT_SINGLE_VALUE.

Referenced by _bt_split().
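
For intuition, the "fillfactormult-wise delta" that _bt_deltasortsplits() assigns before sorting reduces to a weighted free-space difference. A standalone sketch of just that weighting (hypothetical helper name; leftfree and rightfree stand for the bytes that would remain free on each half for one candidate split):

    #include <math.h>
    #include <stdbool.h>

    /*
     * Sketch: delta for one candidate split point.  A delta of zero means
     * the candidate hits the target exactly: a 50:50 free-space balance
     * when usemult is false, or the fillfactormult-weighted target (e.g.
     * 0.90 for a rightmost leaf split) when usemult is true.  Candidates
     * are then sorted by ascending delta.
     */
    static double
    split_delta(int leftfree, int rightfree, double fillfactormult,
                bool usemult)
    {
        double  delta;

        if (usemult)
            delta = fillfactormult * leftfree -
                (1.0 - fillfactormult) * rightfree;
        else
            delta = (double) leftfree - (double) rightfree;

        return fabs(delta);
    }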

◆ _bt_finish_split()

void _bt_finish_split ( Relation  rel,
Relation  heaprel,
Buffer  lbuf,
BTStack  stack 
)

Definition at line 2241 of file nbtinsert.c.

2242 {
2243  Page lpage = BufferGetPage(lbuf);
2244  BTPageOpaque lpageop = BTPageGetOpaque(lpage);
2245  Buffer rbuf;
2246  Page rpage;
2247  BTPageOpaque rpageop;
2248  bool wasroot;
2249  bool wasonly;
2250 
2251  Assert(P_INCOMPLETE_SPLIT(lpageop));
2252  Assert(heaprel != NULL);
2253 
2254  /* Lock right sibling, the one missing the downlink */
2255  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2256  rpage = BufferGetPage(rbuf);
2257  rpageop = BTPageGetOpaque(rpage);
2258 
2259  /* Could this be a root split? */
2260  if (!stack)
2261  {
2262  Buffer metabuf;
2263  Page metapg;
2264  BTMetaPageData *metad;
2265 
2266  /* acquire lock on the metapage */
2267  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2268  metapg = BufferGetPage(metabuf);
2269  metad = BTPageGetMeta(metapg);
2270 
2271  wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
2272 
2273  _bt_relbuf(rel, metabuf);
2274  }
2275  else
2276  wasroot = false;
2277 
2278  /* Was this the only page on the level before split? */
2279  wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2280 
2281  elog(DEBUG1, "finishing incomplete split of %u/%u",
2282  BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2283 
2284  _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
2285 }
static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf, Buffer rbuf, BTStack stack, bool isroot, bool isonly)
Definition: nbtinsert.c:2099
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:845
#define BTPageGetMeta(p)
Definition: nbtree.h:121
#define P_LEFTMOST(opaque)
Definition: nbtree.h:218
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:227
#define BTREE_METAPAGE
Definition: nbtree.h:148
#define BT_WRITE
Definition: nbtree.h:720
BlockNumber btm_root
Definition: nbtree.h:107
BlockNumber btpo_next
Definition: nbtree.h:65

References _bt_getbuf(), _bt_insert_parent(), _bt_relbuf(), Assert, BT_WRITE, BTMetaPageData::btm_root, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTREE_METAPAGE, BufferGetBlockNumber(), BufferGetPage(), DEBUG1, elog, P_INCOMPLETE_SPLIT, P_LEFTMOST, and P_RIGHTMOST.

Referenced by _bt_getstackbuf(), _bt_moveright(), and _bt_stepright().

◆ _bt_first()

bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 879 of file nbtsearch.c.

880 {
881  Relation rel = scan->indexRelation;
882  BTScanOpaque so = (BTScanOpaque) scan->opaque;
883  Buffer buf;
884  BTStack stack;
885  OffsetNumber offnum;
886  StrategyNumber strat;
887  BTScanInsertData inskey;
888  ScanKey startKeys[INDEX_MAX_KEYS];
889  ScanKeyData notnullkeys[INDEX_MAX_KEYS];
890  int keysz = 0;
891  int i;
892  bool status;
893  StrategyNumber strat_total;
894  BTScanPosItem *currItem;
895  BlockNumber blkno;
896 
897  Assert(!BTScanPosIsValid(so->currPos));
898 
899  pgstat_count_index_scan(rel);
900 
901  /*
902  * Examine the scan keys and eliminate any redundant keys; also mark the
903  * keys that must be matched to continue the scan.
904  */
905  _bt_preprocess_keys(scan);
906 
907  /*
908  * Quit now if _bt_preprocess_keys() discovered that the scan keys can
909  * never be satisfied (eg, x == 1 AND x > 2).
910  */
911  if (!so->qual_ok)
912  {
913  _bt_parallel_done(scan);
914  return false;
915  }
916 
917  /*
918  * For parallel scans, get the starting page from shared state. If the
919  * scan has not started, proceed to find the first leaf page in the usual
920  * way while keeping other participating processes waiting. If the scan
921  * has already begun, use the page number from the shared structure.
922  *
923  * When a parallel scan has another primitive index scan scheduled, a
924  * parallel worker will seize the scan for that purpose now. This is
925  * similar to the case where the top-level scan hasn't started.
926  */
927  if (scan->parallel_scan != NULL)
928  {
929  status = _bt_parallel_seize(scan, &blkno, true);
930 
931  /*
932  * Initialize arrays (when _bt_parallel_seize didn't already set up
933  * the next primitive index scan)
934  */
935  if (so->numArrayKeys && !so->needPrimScan)
936  _bt_start_array_keys(scan, dir);
937 
938  if (!status)
939  return false;
940  else if (blkno == P_NONE)
941  {
942  _bt_parallel_done(scan);
943  return false;
944  }
945  else if (blkno != InvalidBlockNumber)
946  {
947  if (!_bt_parallel_readpage(scan, blkno, dir))
948  return false;
949  goto readcomplete;
950  }
951  }
952  else if (so->numArrayKeys && !so->needPrimScan)
953  {
954  /*
955  * First _bt_first call (for current btrescan) without parallelism.
956  *
957  * Initialize arrays, and the corresponding scan keys that were just
958  * output by _bt_preprocess_keys.
959  */
960  _bt_start_array_keys(scan, dir);
961  }
962 
963  /*----------
964  * Examine the scan keys to discover where we need to start the scan.
965  *
966  * We want to identify the keys that can be used as starting boundaries;
967  * these are =, >, or >= keys for a forward scan or =, <, <= keys for
968  * a backwards scan. We can use keys for multiple attributes so long as
969  * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
970  * a > or < boundary or find an attribute with no boundary (which can be
971  * thought of as the same as "> -infinity"), we can't use keys for any
972  * attributes to its right, because it would break our simplistic notion
973  * of what initial positioning strategy to use.
974  *
975  * When the scan keys include cross-type operators, _bt_preprocess_keys
976  * may not be able to eliminate redundant keys; in such cases we will
977  * arbitrarily pick a usable one for each attribute. This is correct
978  * but possibly not optimal behavior. (For example, with keys like
979  * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
980  * x=5 would be more efficient.) Since the situation only arises given
981  * a poorly-worded query plus an incomplete opfamily, live with it.
982  *
983  * When both equality and inequality keys appear for a single attribute
984  * (again, only possible when cross-type operators appear), we *must*
985  * select one of the equality keys for the starting point, because
986  * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
987  * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
988  * start at x=4, we will fail and stop before reaching x=10. If multiple
989  * equality quals survive preprocessing, however, it doesn't matter which
990  * one we use --- by definition, they are either redundant or
991  * contradictory.
992  *
993  * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
994  * If the index stores nulls at the end of the index we'll be starting
995  * from, and we have no boundary key for the column (which means the key
996  * we deduced NOT NULL from is an inequality key that constrains the other
997  * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
998  * use as a boundary key. If we didn't do this, we might find ourselves
999  * traversing a lot of null entries at the start of the scan.
1000  *
1001  * In this loop, row-comparison keys are treated the same as keys on their
1002  * first (leftmost) columns. We'll add on lower-order columns of the row
1003  * comparison below, if possible.
1004  *
1005  * The selected scan keys (at most one per index column) are remembered by
1006  * storing their addresses into the local startKeys[] array.
1007  *
1008  * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
1009  * the next primitive index scan (for scans with array keys) based in part
1010  * on an understanding of how it'll enable us to reposition the scan.
1011  * They're directly aware of how we'll sometimes cons up an explicit
1012  * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a
1013  * symmetric "deduce NOT NULL" rule of their own. This allows top-level
1014  * scans to skip large groups of NULLs through repeated deductions about
1015  * key strictness (for a required inequality key) and whether NULLs in the
1016  * key's index column are stored last or first (relative to non-NULLs).
1017  * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
1018  * need to be kept in sync.
1019  *----------
1020  */
1021  strat_total = BTEqualStrategyNumber;
1022  if (so->numberOfKeys > 0)
1023  {
1024  AttrNumber curattr;
1025  ScanKey chosen;
1026  ScanKey impliesNN;
1027  ScanKey cur;
1028 
1029  /*
1030  * chosen is the so-far-chosen key for the current attribute, if any.
1031  * We don't cast the decision in stone until we reach keys for the
1032  * next attribute.
1033  */
1034  curattr = 1;
1035  chosen = NULL;
1036  /* Also remember any scankey that implies a NOT NULL constraint */
1037  impliesNN = NULL;
1038 
1039  /*
1040  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
1041  * pass to handle after-last-key processing. Actual exit from the
1042  * loop is at one of the "break" statements below.
1043  */
1044  for (cur = so->keyData, i = 0;; cur++, i++)
1045  {
1046  if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1047  {
1048  /*
1049  * Done looking at keys for curattr. If we didn't find a
1050  * usable boundary key, see if we can deduce a NOT NULL key.
1051  */
1052  if (chosen == NULL && impliesNN != NULL &&
1053  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1054  ScanDirectionIsForward(dir) :
1055  ScanDirectionIsBackward(dir)))
1056  {
1057  /* Yes, so build the key in notnullkeys[keysz] */
1058  chosen = &notnullkeys[keysz];
1059  ScanKeyEntryInitialize(chosen,
1060  (SK_SEARCHNOTNULL | SK_ISNULL |
1061  (impliesNN->sk_flags &
1062  (SK_BT_DESC | SK_BT_NULLS_FIRST))),
1063  curattr,
1064  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1065  BTGreaterStrategyNumber :
1066  BTLessStrategyNumber),
1067  InvalidOid,
1068  InvalidOid,
1069  InvalidOid,
1070  (Datum) 0);
1071  }
1072 
1073  /*
1074  * If we still didn't find a usable boundary key, quit; else
1075  * save the boundary key pointer in startKeys.
1076  */
1077  if (chosen == NULL)
1078  break;
1079  startKeys[keysz++] = chosen;
1080 
1081  /*
1082  * Adjust strat_total, and quit if we have stored a > or <
1083  * key.
1084  */
1085  strat = chosen->sk_strategy;
1086  if (strat != BTEqualStrategyNumber)
1087  {
1088  strat_total = strat;
1089  if (strat == BTGreaterStrategyNumber ||
1090  strat == BTLessStrategyNumber)
1091  break;
1092  }
1093 
1094  /*
1095  * Done if that was the last attribute, or if next key is not
1096  * in sequence (implying no boundary key is available for the
1097  * next attribute).
1098  */
1099  if (i >= so->numberOfKeys ||
1100  cur->sk_attno != curattr + 1)
1101  break;
1102 
1103  /*
1104  * Reset for next attr.
1105  */
1106  curattr = cur->sk_attno;
1107  chosen = NULL;
1108  impliesNN = NULL;
1109  }
1110 
1111  /*
1112  * Can we use this key as a starting boundary for this attr?
1113  *
1114  * If not, does it imply a NOT NULL constraint? (Because
1115  * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1116  * *any* inequality key works for that; we need not test.)
1117  */
1118  switch (cur->sk_strategy)
1119  {
1120  case BTLessStrategyNumber:
1121  case BTLessEqualStrategyNumber:
1122  if (chosen == NULL)
1123  {
1124  if (ScanDirectionIsBackward(dir))
1125  chosen = cur;
1126  else
1127  impliesNN = cur;
1128  }
1129  break;
1130  case BTEqualStrategyNumber:
1131  /* override any non-equality choice */
1132  chosen = cur;
1133  break;
1134  case BTGreaterEqualStrategyNumber:
1135  case BTGreaterStrategyNumber:
1136  if (chosen == NULL)
1137  {
1138  if (ScanDirectionIsForward(dir))
1139  chosen = cur;
1140  else
1141  impliesNN = cur;
1142  }
1143  break;
1144  }
1145  }
1146  }
1147 
1148  /*
1149  * If we found no usable boundary keys, we have to start from one end of
1150  * the tree. Walk down that edge to the first or last key, and scan from
1151  * there.
1152  */
1153  if (keysz == 0)
1154  {
1155  bool match;
1156 
1157  match = _bt_endpoint(scan, dir);
1158 
1159  if (!match)
1160  {
1161  /* No match, so mark (parallel) scan finished */
1162  _bt_parallel_done(scan);
1163  }
1164 
1165  return match;
1166  }
1167 
1168  /*
1169  * We want to start the scan somewhere within the index. Set up an
1170  * insertion scankey we can use to search for the boundary point we
1171  * identified above. The insertion scankey is built using the keys
1172  * identified by startKeys[]. (Remaining insertion scankey fields are
1173  * initialized after initial-positioning strategy is finalized.)
1174  */
1175  Assert(keysz <= INDEX_MAX_KEYS);
1176  for (i = 0; i < keysz; i++)
1177  {
1178  ScanKey cur = startKeys[i];
1179 
1180  Assert(cur->sk_attno == i + 1);
1181 
1182  if (cur->sk_flags & SK_ROW_HEADER)
1183  {
1184  /*
1185  * Row comparison header: look to the first row member instead.
1186  *
1187  * The member scankeys are already in insertion format (ie, they
1188  * have sk_func = 3-way-comparison function), but we have to watch
1189  * out for nulls, which _bt_preprocess_keys didn't check. A null
1190  * in the first row member makes the condition unmatchable, just
1191  * like qual_ok = false.
1192  */
1193  ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1194 
1195  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1196  if (subkey->sk_flags & SK_ISNULL)
1197  {
1198  _bt_parallel_done(scan);
1199  return false;
1200  }
1201  memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1202 
1203  /*
1204  * If the row comparison is the last positioning key we accepted,
1205  * try to add additional keys from the lower-order row members.
1206  * (If we accepted independent conditions on additional index
1207  * columns, we use those instead --- doesn't seem worth trying to
1208  * determine which is more restrictive.) Note that this is OK
1209  * even if the row comparison is of ">" or "<" type, because the
1210  * condition applied to all but the last row member is effectively
1211  * ">=" or "<=", and so the extra keys don't break the positioning
1212  * scheme. But, by the same token, if we aren't able to use all
1213  * the row members, then the part of the row comparison that we
1214  * did use has to be treated as just a ">=" or "<=" condition, and
1215  * so we'd better adjust strat_total accordingly.
1216  */
1217  if (i == keysz - 1)
1218  {
1219  bool used_all_subkeys = false;
1220 
1221  Assert(!(subkey->sk_flags & SK_ROW_END));
1222  for (;;)
1223  {
1224  subkey++;
1225  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1226  if (subkey->sk_attno != keysz + 1)
1227  break; /* out-of-sequence, can't use it */
1228  if (subkey->sk_strategy != cur->sk_strategy)
1229  break; /* wrong direction, can't use it */
1230  if (subkey->sk_flags & SK_ISNULL)
1231  break; /* can't use null keys */
1232  Assert(keysz < INDEX_MAX_KEYS);
1233  memcpy(inskey.scankeys + keysz, subkey,
1234  sizeof(ScanKeyData));
1235  keysz++;
1236  if (subkey->sk_flags & SK_ROW_END)
1237  {
1238  used_all_subkeys = true;
1239  break;
1240  }
1241  }
1242  if (!used_all_subkeys)
1243  {
1244  switch (strat_total)
1245  {
1246  case BTLessStrategyNumber:
1247  strat_total = BTLessEqualStrategyNumber;
1248  break;
1249  case BTGreaterStrategyNumber:
1250  strat_total = BTGreaterEqualStrategyNumber;
1251  break;
1252  }
1253  }
1254  break; /* done with outer loop */
1255  }
1256  }
1257  else
1258  {
1259  /*
1260  * Ordinary comparison key. Transform the search-style scan key
1261  * to an insertion scan key by replacing the sk_func with the
1262  * appropriate btree comparison function.
1263  *
1264  * If scankey operator is not a cross-type comparison, we can use
1265  * the cached comparison function; otherwise gotta look it up in
1266  * the catalogs. (That can't lead to infinite recursion, since no
1267  * indexscan initiated by syscache lookup will use cross-data-type
1268  * operators.)
1269  *
1270  * We support the convention that sk_subtype == InvalidOid means
1271  * the opclass input type; this is a hack to simplify life for
1272  * ScanKeyInit().
1273  */
1274  if (cur->sk_subtype == rel->rd_opcintype[i] ||
1275  cur->sk_subtype == InvalidOid)
1276  {
1277  FmgrInfo *procinfo;
1278 
1279  procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1280  ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1281  cur->sk_flags,
1282  cur->sk_attno,
1283  InvalidStrategy,
1284  cur->sk_subtype,
1285  cur->sk_collation,
1286  procinfo,
1287  cur->sk_argument);
1288  }
1289  else
1290  {
1291  RegProcedure cmp_proc;
1292 
1293  cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1294  rel->rd_opcintype[i],
1295  cur->sk_subtype,
1296  BTORDER_PROC);
1297  if (!RegProcedureIsValid(cmp_proc))
1298  elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1299  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1300  cur->sk_attno, RelationGetRelationName(rel));
1301  ScanKeyEntryInitialize(inskey.scankeys + i,
1302  cur->sk_flags,
1303  cur->sk_attno,
1304  InvalidStrategy,
1305  cur->sk_subtype,
1306  cur->sk_collation,
1307  cmp_proc,
1308  cur->sk_argument);
1309  }
1310  }
1311  }
1312 
1313  /*----------
1314  * Examine the selected initial-positioning strategy to determine exactly
1315  * where we need to start the scan, and set flag variables to control the
1316  * initial descent by _bt_search (and our _bt_binsrch call for the leaf
1317  * page _bt_search returns).
1318  *----------
1319  */
1320  _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1321  inskey.anynullkeys = false; /* unused */
1322  inskey.scantid = NULL;
1323  inskey.keysz = keysz;
1324  switch (strat_total)
1325  {
1326  case BTLessStrategyNumber:
1327 
1328  inskey.nextkey = false;
1329  inskey.backward = true;
1330  break;
1331 
1332  case BTLessEqualStrategyNumber:
1333 
1334  inskey.nextkey = true;
1335  inskey.backward = true;
1336  break;
1337 
1338  case BTEqualStrategyNumber:
1339 
1340  /*
1341  * If a backward scan was specified, need to start with last equal
1342  * item not first one.
1343  */
1344  if (ScanDirectionIsBackward(dir))
1345  {
1346  /*
1347  * This is the same as the <= strategy
1348  */
1349  inskey.nextkey = true;
1350  inskey.backward = true;
1351  }
1352  else
1353  {
1354  /*
1355  * This is the same as the >= strategy
1356  */
1357  inskey.nextkey = false;
1358  inskey.backward = false;
1359  }
1360  break;
1361 
1362  case BTGreaterEqualStrategyNumber:
1363 
1364  /*
1365  * Find first item >= scankey
1366  */
1367  inskey.nextkey = false;
1368  inskey.backward = false;
1369  break;
1370 
1371  case BTGreaterStrategyNumber:
1372 
1373  /*
1374  * Find first item > scankey
1375  */
1376  inskey.nextkey = true;
1377  inskey.backward = false;
1378  break;
1379 
1380  default:
1381  /* can't get here, but keep compiler quiet */
1382  elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1383  return false;
1384  }
1385 
1386  /*
1387  * Use the manufactured insertion scan key to descend the tree and
1388  * position ourselves on the target leaf page.
1389  */
1390  Assert(ScanDirectionIsBackward(dir) == inskey.backward);
1391  stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
1392 
1393  /* don't need to keep the stack around... */
1394  _bt_freestack(stack);
1395 
1396  if (!BufferIsValid(buf))
1397  {
1398  /*
1399  * We only get here if the index is completely empty. Lock relation
1400  * because nothing finer to lock exists. Without a buffer lock, it's
1401  * possible for another transaction to insert data between
1402  * _bt_search() and PredicateLockRelation(). We have to try again
1403  * after taking the relation-level predicate lock, to close a narrow
1404  * window where we wouldn't scan concurrently inserted tuples, but the
1405  * writer wouldn't see our predicate lock.
1406  */
1407  if (IsolationIsSerializable())
1408  {
1409  PredicateLockRelation(rel, scan->xs_snapshot);
1410  stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
1411  _bt_freestack(stack);
1412  }
1413 
1414  if (!BufferIsValid(buf))
1415  {
1416  /*
1417  * Mark parallel scan as done, so that all the workers can finish
1418  * their scan.
1419  */
1420  _bt_parallel_done(scan);
1421  BTScanPosInvalidate(so->currPos);
1422  return false;
1423  }
1424  }
1425 
1426  PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
1427 
1428  _bt_initialize_more_data(so, dir);
1429 
1430  /* position to the precise item on the page */
1431  offnum = _bt_binsrch(rel, &inskey, buf);
1432  Assert(!BTScanPosIsValid(so->currPos));
1433  so->currPos.buf = buf;
1434 
1435  /*
1436  * Now load data from the first page of the scan.
1437  *
1438  * If inskey.nextkey = false and inskey.backward = false, offnum is
1439  * positioned at the first non-pivot tuple >= inskey.scankeys.
1440  *
1441  * If inskey.nextkey = false and inskey.backward = true, offnum is
1442  * positioned at the last non-pivot tuple < inskey.scankeys.
1443  *
1444  * If inskey.nextkey = true and inskey.backward = false, offnum is
1445  * positioned at the first non-pivot tuple > inskey.scankeys.
1446  *
1447  * If inskey.nextkey = true and inskey.backward = true, offnum is
1448  * positioned at the last non-pivot tuple <= inskey.scankeys.
1449  *
1450  * It's possible that _bt_binsrch returned an offnum that is out of bounds
1451  * for the page. For example, when inskey is both < the leaf page's high
1452  * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
1453  */
1454  if (!_bt_readpage(scan, dir, offnum, true))
1455  {
1456  /*
1457  * There's no actually-matching data on this page. Try to advance to
1458  * the next page. Return false if there's no matching data at all.
1459  */
1460  _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
1461  if (!_bt_steppage(scan, dir))
1462  return false;
1463  }
1464  else
1465  {
1466  /* We have at least one item to return as scan's first item */
1467  _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1468  }
1469 
1470 readcomplete:
1471  /* OK, itemIndex says what to return */
1472  currItem = &so->currPos.items[so->currPos.itemIndex];
1473  scan->xs_heaptid = currItem->heapTid;
1474  if (scan->xs_want_itup)
1475  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1476 
1477  return true;
1478 }
int16 AttrNumber
Definition: attnum.h:21
#define RegProcedureIsValid(p)
Definition: c.h:777
regproc RegProcedure
Definition: c.h:650
struct cursor * cur
Definition: ecpg.c:28
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:860
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition: nbtpage.c:739
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1070
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:732
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
Definition: nbtree.c:605
#define BTORDER_PROC
Definition: nbtree.h:707
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:1016
#define P_NONE
Definition: nbtree.h:212
#define BT_READ
Definition: nbtree.h:719
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:1022
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:2350
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:64
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:340
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2576
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2043
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition: nbtsearch.c:99
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool firstPage)
Definition: nbtsearch.c:1559
static void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
Definition: nbtsearch.c:2666
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
Definition: nbtutils.c:1343
void _bt_preprocess_keys(IndexScanDesc scan)
Definition: nbtutils.c:2546
#define INDEX_MAX_KEYS
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:658
#define InvalidOid
Definition: postgres_ext.h:36
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2584
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2561
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
#define ScanDirectionIsForward(direction)
Definition: sdir.h:64
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:50
#define SK_ROW_HEADER
Definition: skey.h:117
#define SK_ROW_MEMBER
Definition: skey.h:118
#define SK_SEARCHNOTNULL
Definition: skey.h:122
#define SK_ROW_END
Definition: skey.h:119
ScanKeyData * ScanKey
Definition: skey.h:75
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
#define InvalidStrategy
Definition: stratnum.h:24
#define BTLessStrategyNumber
Definition: stratnum.h:29
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
BTScanPosData currPos
Definition: nbtree.h:1077
char * currTuples
Definition: nbtree.h:1064
Buffer buf
Definition: nbtree.h:953
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:994
int itemIndex
Definition: nbtree.h:992
Definition: fmgr.h:57
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:166
IndexTuple xs_itup
Definition: relscan.h:142
ItemPointerData xs_heaptid
Definition: relscan.h:147
struct SnapshotData * xs_snapshot
Definition: relscan.h:119
#define IsolationIsSerializable()
Definition: xact.h:52

References _bt_binsrch(), _bt_drop_lock_and_maybe_pin(), _bt_endpoint(), _bt_freestack(), _bt_initialize_more_data(), _bt_metaversion(), _bt_parallel_done(), _bt_parallel_readpage(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readpage(), _bt_search(), _bt_start_array_keys(), _bt_steppage(), _bt_unlockbuf(), Assert, BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosInvalidate, BTScanPosIsValid, buf, BTScanPosData::buf, BufferGetBlockNumber(), BufferIsValid(), cur, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, DatumGetPointer(), elog, ERROR, get_opfamily_proc(), i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, InvalidBlockNumber, InvalidOid, InvalidStrategy, IsolationIsSerializable, BTScanPosData::itemIndex, BTScanPosData::items, BTScanOpaqueData::keyData, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numberOfKeys, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockPage(), PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().
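
The strat_total switch above amounts to a small truth table: nextkey selects whether the descent stops at the first tuple strictly beyond the boundary value, and backward selects which end of a run of equal keys to land on. A condensed standalone restatement (hypothetical helper; the strategy constants are the stratnum.h values shown above):

    #include "postgres.h"
    #include "access/stratnum.h"

    /*
     * Sketch: translate the chosen initial-positioning strategy into the
     * (nextkey, backward) pair that _bt_first() stores in the insertion
     * scan key before calling _bt_search().
     */
    static void
    positioning_flags(StrategyNumber strat, bool scan_backward,
                      bool *nextkey, bool *backward)
    {
        switch (strat)
        {
            case BTLessStrategyNumber:          /* last item <  key */
                *nextkey = false;
                *backward = true;
                break;
            case BTLessEqualStrategyNumber:     /* last item <= key */
                *nextkey = true;
                *backward = true;
                break;
            case BTEqualStrategyNumber:
                /* backward scans want the last equal item, forward the first */
                *nextkey = scan_backward;
                *backward = scan_backward;
                break;
            case BTGreaterEqualStrategyNumber:  /* first item >= key */
                *nextkey = false;
                *backward = false;
                break;
            case BTGreaterStrategyNumber:       /* first item >  key */
                *nextkey = true;
                *backward = false;
                break;
            default:
                break;          /* _bt_first() treats this as an error */
        }
    }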

◆ _bt_form_posting()

IndexTuple _bt_form_posting ( IndexTuple  base,
ItemPointer  htids,
int  nhtids 
)

Definition at line 864 of file nbtdedup.c.

865 {
866  uint32 keysize,
867  newsize;
868  IndexTuple itup;
869 
870  if (BTreeTupleIsPosting(base))
871  keysize = BTreeTupleGetPostingOffset(base);
872  else
873  keysize = IndexTupleSize(base);
874 
875  Assert(!BTreeTupleIsPivot(base));
876  Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
877  Assert(keysize == MAXALIGN(keysize));
878 
879  /* Determine final size of new tuple */
880  if (nhtids > 1)
881  newsize = MAXALIGN(keysize +
882  nhtids * sizeof(ItemPointerData));
883  else
884  newsize = keysize;
885 
886  Assert(newsize <= INDEX_SIZE_MASK);
887  Assert(newsize == MAXALIGN(newsize));
888 
889  /* Allocate memory using palloc0() (matches index_form_tuple()) */
890  itup = palloc0(newsize);
891  memcpy(itup, base, keysize);
892  itup->t_info &= ~INDEX_SIZE_MASK;
893  itup->t_info |= newsize;
894  if (nhtids > 1)
895  {
896  /* Form posting list tuple */
897  BTreeTupleSetPosting(itup, nhtids, keysize);
898  memcpy(BTreeTupleGetPosting(itup), htids,
899  sizeof(ItemPointerData) * nhtids);
900  Assert(_bt_posting_valid(itup));
901  }
902  else
903  {
904  /* Form standard non-pivot tuple */
905  itup->t_info &= ~INDEX_ALT_TID_MASK;
906  ItemPointerCopy(htids, &itup->t_tid);
907  Assert(ItemPointerIsValid(&itup->t_tid));
908  }
909 
910  return itup;
911 }
#define PG_UINT16_MAX
Definition: c.h:587
static void ItemPointerCopy(const ItemPointerData *fromPointer, ItemPointerData *toPointer)
Definition: itemptr.h:172
static bool ItemPointerIsValid(const ItemPointerData *pointer)
Definition: itemptr.h:83
void * palloc0(Size size)
Definition: mcxt.c:1347
static void BTreeTupleSetPosting(IndexTuple itup, uint16 nhtids, int postingoffset)
Definition: nbtree.h:504
#define INDEX_ALT_TID_MASK
Definition: nbtree.h:459
unsigned short t_info
Definition: itup.h:49

References Assert, BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTreeTupleSetPosting(), INDEX_ALT_TID_MASK, INDEX_SIZE_MASK, IndexTupleSize, ItemPointerCopy(), ItemPointerIsValid(), MAXALIGN, palloc0(), PG_UINT16_MAX, IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_dedup_finish_pending(), _bt_sort_dedup_finish_pending(), and bt_posting_plain_tuple().
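
As a usage sketch: deduplication collapses a run of equal-keyed tuples into one posting list tuple. The htids array is assumed to hold unique heap TIDs in ascending order, which the function asserts via _bt_posting_valid() (hypothetical helper name below):

    #include "postgres.h"
    #include "access/nbtree.h"

    /*
     * Sketch: merge a base tuple's key with the accumulated heap TIDs of
     * its duplicates.  With nhtids == 1 the result is a plain non-pivot
     * tuple; with more, a posting list tuple.  Caller pfree()s the result.
     */
    static IndexTuple
    merge_duplicates(IndexTuple base, ItemPointer htids, int nhtids)
    {
        Assert(nhtids >= 1);    /* TIDs must be unique and ascending */
        return _bt_form_posting(base, htids, nhtids);
    }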

◆ _bt_freestack()

void _bt_freestack ( BTStack  stack)

Definition at line 221 of file nbtutils.c.

222 {
223  BTStack ostack;
224 
225  while (stack != NULL)
226  {
227  ostack = stack;
228  stack = stack->bts_parent;
229  pfree(ostack);
230  }
231 }
struct BTStackData * bts_parent
Definition: nbtree.h:736

References BTStackData::bts_parent, and pfree().

Referenced by _bt_doinsert(), _bt_first(), and bt_rootdescend().
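
A minimal sketch of the usual lifecycle, mirroring _bt_first() above: read-only descents discard the parent stack immediately, since only insertions that might split a page ever walk back up it (hypothetical helper name, assuming a prebuilt insertion scan key):

    #include "postgres.h"
    #include "access/nbtree.h"

    /* Sketch: descend to a leaf, then free the parent-page stack that
     * read-only callers never need. */
    static Buffer
    descend_readonly(Relation rel, BTScanInsert inskey)
    {
        Buffer      buf;
        BTStack     stack = _bt_search(rel, NULL, inskey, &buf, BT_READ);

        _bt_freestack(stack);   /* handles stack == NULL too */
        return buf;             /* InvalidBuffer if the index is empty */
    }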

◆ _bt_get_endpoint()

Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost 
)

Definition at line 2495 of file nbtsearch.c.

2496 {
2497  Buffer buf;
2498  Page page;
2499  BTPageOpaque opaque;
2500  OffsetNumber offnum;
2501  BlockNumber blkno;
2502  IndexTuple itup;
2503 
2504  /*
2505  * If we are looking for a leaf page, okay to descend from fast root;
2506  * otherwise better descend from true root. (There is no point in being
2507  * smarter about intermediate levels.)
2508  */
2509  if (level == 0)
2510  buf = _bt_getroot(rel, NULL, BT_READ);
2511  else
2512  buf = _bt_gettrueroot(rel);
2513 
2514  if (!BufferIsValid(buf))
2515  return InvalidBuffer;
2516 
2517  page = BufferGetPage(buf);
2518  opaque = BTPageGetOpaque(page);
2519 
2520  for (;;)
2521  {
2522  /*
2523  * If we landed on a deleted page, step right to find a live page
2524  * (there must be one). Also, if we want the rightmost page, step
2525  * right if needed to get to it (this could happen if the page split
2526  * since we obtained a pointer to it).
2527  */
2528  while (P_IGNORE(opaque) ||
2529  (rightmost && !P_RIGHTMOST(opaque)))
2530  {
2531  blkno = opaque->btpo_next;
2532  if (blkno == P_NONE)
2533  elog(ERROR, "fell off the end of index \"%s\"",
2534  RelationGetRelationName(rel));
2535  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2536  page = BufferGetPage(buf);
2537  opaque = BTPageGetOpaque(page);
2538  }
2539 
2540  /* Done? */
2541  if (opaque->btpo_level == level)
2542  break;
2543  if (opaque->btpo_level < level)
2544  ereport(ERROR,
2545  (errcode(ERRCODE_INDEX_CORRUPTED),
2546  errmsg_internal("btree level %u not found in index \"%s\"",
2547  level, RelationGetRelationName(rel))));
2548 
2549  /* Descend to leftmost or rightmost child page */
2550  if (rightmost)
2551  offnum = PageGetMaxOffsetNumber(page);
2552  else
2553  offnum = P_FIRSTDATAKEY(opaque);
2554 
2555  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2556  blkno = BTreeTupleGetDownLink(itup);
2557 
2558  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2559  page = BufferGetPage(buf);
2560  opaque = BTPageGetOpaque(page);
2561  }
2562 
2563  return buf;
2564 }
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1003
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:580
Buffer _bt_getroot(Relation rel, Relation heaprel, int access)
Definition: nbtpage.c:344
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:556
uint32 btpo_level
Definition: nbtree.h:66

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), buf, BufferGetPage(), BufferIsValid(), elog, ereport, errcode(), errmsg_internal(), ERROR, InvalidBuffer, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and RelationGetRelationName.

Referenced by _bt_endpoint(), and _bt_insert_parent().
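
A usage sketch modeled on _bt_endpoint(): level 0 asks for a leaf page, and rightmost selects which end of that level to land on (hypothetical helper name):

    #include "postgres.h"
    #include "access/nbtree.h"

    /* Sketch: pin and read-lock the leftmost leaf page of an index. */
    static Buffer
    leftmost_leaf_page(Relation rel)
    {
        /* pass rightmost = true for the other end of the leaf level */
        Buffer      buf = _bt_get_endpoint(rel, 0, false);

        /* InvalidBuffer means the index is completely empty */
        return buf;
    }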

◆ _bt_getbuf()

Buffer _bt_getbuf ( Relation  rel,
BlockNumber  blkno,
int  access 
)

Definition at line 845 of file nbtpage.c.

846 {
847  Buffer buf;
848 
849  Assert(BlockNumberIsValid(blkno));
850 
851  /* Read an existing block of the relation */
852  buf = ReadBuffer(rel, blkno);
853  _bt_lockbuf(rel, buf, access);
854  _bt_checkpage(rel, buf);
855 
856  return buf;
857 }
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:797
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1039
short access
Definition: preproc-type.c:36

References _bt_checkpage(), _bt_lockbuf(), Assert, BlockNumberIsValid(), buf, and ReadBuffer().

Referenced by _bt_finish_split(), _bt_getroot(), _bt_getrootheight(), _bt_getstackbuf(), _bt_gettrueroot(), _bt_insertonpg(), _bt_killitems(), _bt_leftsib_splitflag(), _bt_metaversion(), _bt_moveright(), _bt_newlevel(), _bt_pagedel(), _bt_readnextpage(), _bt_rightsib_halfdeadflag(), _bt_set_cleanup_info(), _bt_split(), _bt_unlink_halfdead_page(), _bt_vacuum_needs_cleanup(), and _bt_walk_left().
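
_bt_getbuf() is normally paired with _bt_relbuf(); a minimal sketch (hypothetical helper, assuming blkno came from a downlink or sibling pointer and so refers to an existing block):

    #include "postgres.h"
    #include "access/nbtree.h"

    /* Sketch: inspect a known btree page under a read lock, then release. */
    static bool
    page_is_leaf(Relation rel, BlockNumber blkno)
    {
        Buffer          buf = _bt_getbuf(rel, blkno, BT_READ);
        Page            page = BufferGetPage(buf);
        BTPageOpaque    opaque = BTPageGetOpaque(page);
        bool            leaf = P_ISLEAF(opaque);

        _bt_relbuf(rel, buf);   /* releases lock and pin together */
        return leaf;
    }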

◆ _bt_getroot()

Buffer _bt_getroot ( Relation  rel,
Relation  heaprel,
int  access 
)

Definition at line 344 of file nbtpage.c.

345 {
346  Buffer metabuf;
347  Buffer rootbuf;
348  Page rootpage;
349  BTPageOpaque rootopaque;
350  BlockNumber rootblkno;
351  uint32 rootlevel;
352  BTMetaPageData *metad;
353 
354  Assert(access == BT_READ || heaprel != NULL);
355 
356  /*
357  * Try to use previously-cached metapage data to find the root. This
358  * normally saves one buffer access per index search, which is a very
359  * helpful savings in bufmgr traffic and hence contention.
360  */
361  if (rel->rd_amcache != NULL)
362  {
363  metad = (BTMetaPageData *) rel->rd_amcache;
364  /* We shouldn't have cached it if any of these fail */
365  Assert(metad->btm_magic == BTREE_MAGIC);
366  Assert(metad->btm_version >= BTREE_MIN_VERSION);
367  Assert(metad->btm_version <= BTREE_VERSION);
368  Assert(!metad->btm_allequalimage ||
369  metad->btm_version > BTREE_NOVAC_VERSION);
370  Assert(metad->btm_root != P_NONE);
371 
372  rootblkno = metad->btm_fastroot;
373  Assert(rootblkno != P_NONE);
374  rootlevel = metad->btm_fastlevel;
375 
376  rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
377  rootpage = BufferGetPage(rootbuf);
378  rootopaque = BTPageGetOpaque(rootpage);
379 
380  /*
381  * Since the cache might be stale, we check the page more carefully
382  * here than normal. We *must* check that it's not deleted. If it's
383  * not alone on its level, then we reject too --- this may be overly
384  * paranoid but better safe than sorry. Note we don't check P_ISROOT,
385  * because that's not set in a "fast root".
386  */
387  if (!P_IGNORE(rootopaque) &&
388  rootopaque->btpo_level == rootlevel &&
389  P_LEFTMOST(rootopaque) &&
390  P_RIGHTMOST(rootopaque))
391  {
392  /* OK, accept cached page as the root */
393  return rootbuf;
394  }
395  _bt_relbuf(rel, rootbuf);
396  /* Cache is stale, throw it away */
397  if (rel->rd_amcache)
398  pfree(rel->rd_amcache);
399  rel->rd_amcache = NULL;
400  }
401 
402  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
403  metad = _bt_getmeta(rel, metabuf);
404 
405  /* if no root page initialized yet, do it */
406  if (metad->btm_root == P_NONE)
407  {
408  Page metapg;
409 
410  /* If access = BT_READ, caller doesn't want us to create root yet */
411  if (access == BT_READ)
412  {
413  _bt_relbuf(rel, metabuf);
414  return InvalidBuffer;
415  }
416 
417  /* trade in our read lock for a write lock */
418  _bt_unlockbuf(rel, metabuf);
419  _bt_lockbuf(rel, metabuf, BT_WRITE);
420 
421  /*
422  * Race condition: if someone else initialized the metadata between
423  * the time we released the read lock and acquired the write lock, we
424  * must avoid doing it again.
425  */
426  if (metad->btm_root != P_NONE)
427  {
428  /*
429  * Metadata initialized by someone else. In order to guarantee no
430  * deadlocks, we have to release the metadata page and start all
431  * over again. (Is that really true? But it's hardly worth trying
432  * to optimize this case.)
433  */
434  _bt_relbuf(rel, metabuf);
435  return _bt_getroot(rel, heaprel, access);
436  }
437 
438  /*
439  * Get, initialize, write, and leave a lock of the appropriate type on
440  * the new root page. Since this is the first page in the tree, it's
441  * a leaf as well as the root.
442  */
443  rootbuf = _bt_allocbuf(rel, heaprel);
444  rootblkno = BufferGetBlockNumber(rootbuf);
445  rootpage = BufferGetPage(rootbuf);
446  rootopaque = BTPageGetOpaque(rootpage);
447  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
448  rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
449  rootopaque->btpo_level = 0;
450  rootopaque->btpo_cycleid = 0;
451  /* Get raw page pointer for metapage */
452  metapg = BufferGetPage(metabuf);
453 
454  /* NO ELOG(ERROR) till meta is updated */
456 
457  /* upgrade metapage if needed */
458  if (metad->btm_version < BTREE_NOVAC_VERSION)
459  _bt_upgrademetapage(metapg);
460 
461  metad->btm_root = rootblkno;
462  metad->btm_level = 0;
463  metad->btm_fastroot = rootblkno;
464  metad->btm_fastlevel = 0;
465  metad->btm_last_cleanup_num_delpages = 0;
466  metad->btm_last_cleanup_num_heap_tuples = -1.0;
467 
468  MarkBufferDirty(rootbuf);
469  MarkBufferDirty(metabuf);
470 
471  /* XLOG stuff */
472  if (RelationNeedsWAL(rel))
473  {
474  xl_btree_newroot xlrec;
475  XLogRecPtr recptr;
476  xl_btree_metadata md;
477 
478  XLogBeginInsert();
479  XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
480  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
481 
482  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
483  md.version = metad->btm_version;
484  md.root = rootblkno;
485  md.level = 0;
486  md.fastroot = rootblkno;
487  md.fastlevel = 0;
488  md.last_cleanup_num_delpages = 0;
489  md.allequalimage = metad->btm_allequalimage;
490 
491  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
492 
493  xlrec.rootblk = rootblkno;
494  xlrec.level = 0;
495 
496  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
497 
498  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
499 
500  PageSetLSN(rootpage, recptr);
501  PageSetLSN(metapg, recptr);
502  }
503 
504  END_CRIT_SECTION();
505 
506  /*
507  * swap root write lock for read lock. There is no danger of anyone
508  * else accessing the new root page while it's unlocked, since no one
509  * else knows where it is yet.
510  */
511  _bt_unlockbuf(rel, rootbuf);
512  _bt_lockbuf(rel, rootbuf, BT_READ);
513 
514  /* okay, metadata is correct, release lock on it without caching */
515  _bt_relbuf(rel, metabuf);
516  }
517  else
518  {
519  rootblkno = metad->btm_fastroot;
520  Assert(rootblkno != P_NONE);
521  rootlevel = metad->btm_fastlevel;
522 
523  /*
524  * Cache the metapage data for next time
525  */
526  rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
527  sizeof(BTMetaPageData));
528  memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
529 
530  /*
531  * We are done with the metapage; arrange to release it via first
532  * _bt_relandgetbuf call
533  */
534  rootbuf = metabuf;
535 
536  for (;;)
537  {
538  rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
539  rootpage = BufferGetPage(rootbuf);
540  rootopaque = BTPageGetOpaque(rootpage);
541 
542  if (!P_IGNORE(rootopaque))
543  break;
544 
545  /* it's dead, Jim. step right one page */
546  if (P_RIGHTMOST(rootopaque))
547  elog(ERROR, "no live root page found in index \"%s\"",
548  RelationGetRelationName(rel));
549  rootblkno = rootopaque->btpo_next;
550  }
551 
552  if (rootopaque->btpo_level != rootlevel)
553  elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
554  rootblkno, RelationGetRelationName(rel),
555  rootopaque->btpo_level, rootlevel);
556  }
557 
558  /*
559  * By here, we have a pin and read lock on the root page, and no lock set
560  * on the metadata page. Return the root page's buffer.
561  */
562  return rootbuf;
563 }
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1181
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:107
Buffer _bt_allocbuf(Relation rel, Relation heaprel)
Definition: nbtpage.c:869
static BTMetaPageData * _bt_getmeta(Relation rel, Buffer metabuf)
Definition: nbtpage.c:142
#define BTREE_MIN_VERSION
Definition: nbtree.h:151
#define BTP_LEAF
Definition: nbtree.h:76
#define BTREE_MAGIC
Definition: nbtree.h:149
#define BTP_ROOT
Definition: nbtree.h:77
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:347
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:37
uint32 btm_last_cleanup_num_delpages
Definition: nbtree.h:114
uint32 btm_level
Definition: nbtree.h:108
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:116
BlockNumber btm_fastroot
Definition: nbtree.h:109
uint32 btm_version
Definition: nbtree.h:106
uint32 btm_magic
Definition: nbtree.h:105
bool btm_allequalimage
Definition: nbtree.h:118
uint32 btm_fastlevel
Definition: nbtree.h:110
BlockNumber btpo_prev
Definition: nbtree.h:64
void * rd_amcache
Definition: rel.h:229
MemoryContext rd_indexcxt
Definition: rel.h:204
uint32 level
Definition: nbtxlog.h:50
uint32 version
Definition: nbtxlog.h:48
bool allequalimage
Definition: nbtxlog.h:54
BlockNumber fastroot
Definition: nbtxlog.h:51
uint32 fastlevel
Definition: nbtxlog.h:52
BlockNumber root
Definition: nbtxlog.h:49
uint32 last_cleanup_num_delpages
Definition: nbtxlog.h:53
uint32 level
Definition: nbtxlog.h:344
BlockNumber rootblk
Definition: nbtxlog.h:343
#define REGBUF_WILL_INIT
Definition: xloginsert.h:33

References _bt_allocbuf(), _bt_getbuf(), _bt_getmeta(), _bt_lockbuf(), _bt_relandgetbuf(), _bt_relbuf(), _bt_unlockbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert, BT_READ, BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_LEAF, BTP_ROOT, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, BufferGetBlockNumber(), BufferGetPage(), elog, END_CRIT_SECTION, ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, InvalidBuffer, xl_btree_metadata::last_cleanup_num_delpages, xl_btree_metadata::level, xl_btree_newroot::level, MarkBufferDirty(), MemoryContextAlloc(), P_IGNORE, P_LEFTMOST, P_NONE, P_RIGHTMOST, PageSetLSN(), pfree(), RelationData::rd_amcache, RelationData::rd_indexcxt, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, xl_btree_metadata::root, xl_btree_newroot::rootblk, SizeOfBtreeNewroot, START_CRIT_SECTION, xl_btree_metadata::version, XLOG_BTREE_NEWROOT, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_get_endpoint(), and _bt_search().
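
A condensed sketch of how read-only callers such as _bt_search() consume this: with BT_READ no root is ever created, so an empty index must be handled (hypothetical helper name):

    #include "postgres.h"
    #include "access/nbtree.h"

    /* Sketch: fetch the fast root, pinned and read-locked, for a descent. */
    static Buffer
    grab_root_for_read(Relation rel)
    {
        Buffer      rootbuf = _bt_getroot(rel, NULL, BT_READ);

        if (!BufferIsValid(rootbuf))
        {
            /* completely empty index: nothing to descend into */
            return InvalidBuffer;
        }
        return rootbuf;
    }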

◆ _bt_getrootheight()

int _bt_getrootheight ( Relation  rel)

Definition at line 675 of file nbtpage.c.

676 {
677  BTMetaPageData *metad;
678 
679  if (rel->rd_amcache == NULL)
680  {
681  Buffer metabuf;
682 
683  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
684  metad = _bt_getmeta(rel, metabuf);
685 
686  /*
687  * If there's no root page yet, _bt_getroot() doesn't expect a cache
688  * to be made, so just stop here and report the index height is zero.
689  * (XXX perhaps _bt_getroot() should be changed to allow this case.)
690  */
691  if (metad->btm_root == P_NONE)
692  {
693  _bt_relbuf(rel, metabuf);
694  return 0;
695  }
696 
697  /*
698  * Cache the metapage data for next time
 699  */
 700  rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
 701  sizeof(BTMetaPageData));
702  memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
703  _bt_relbuf(rel, metabuf);
704  }
705 
706  /* Get cached page */
707  metad = (BTMetaPageData *) rel->rd_amcache;
708  /* We shouldn't have cached it if any of these fail */
 709  Assert(metad->btm_magic == BTREE_MAGIC);
 710  Assert(metad->btm_version >= BTREE_MIN_VERSION);
 711  Assert(metad->btm_version <= BTREE_VERSION);
 712  Assert(!metad->btm_allequalimage ||
 713  metad->btm_version > BTREE_NOVAC_VERSION);
 714  Assert(metad->btm_fastroot != P_NONE);
715 
716  return metad->btm_fastlevel;
717 }

References _bt_getbuf(), _bt_getmeta(), _bt_relbuf(), Assert, BT_READ, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, MemoryContextAlloc(), P_NONE, RelationData::rd_amcache, and RelationData::rd_indexcxt.

Referenced by _bt_insertonpg(), and get_relation_info().
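
As the "Referenced by" list suggests, one consumer is the planner's get_relation_info(). A hedged sketch of that kind of usage follows; the wrapper name estimate_descent_pages is invented, and the costing rule shown is an illustrative assumption, not the planner's actual formula.

#include "access/nbtree.h"

/* Rough sketch: one page visited per descent level, plus the leaf page.
 * _bt_getrootheight() returns 0 for an empty or single-page index. */
static int
estimate_descent_pages(Relation btree_index_rel)
{
    return _bt_getrootheight(btree_index_rel) + 1;
}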

◆ _bt_getstackbuf()

Buffer _bt_getstackbuf ( Relation  rel,
Relation  heaprel,
BTStack  stack,
BlockNumber  child 
)

Definition at line 2319 of file nbtinsert.c.

2320 {
2321  BlockNumber blkno;
2322  OffsetNumber start;
2323 
2324  blkno = stack->bts_blkno;
2325  start = stack->bts_offset;
2326 
2327  for (;;)
2328  {
2329  Buffer buf;
2330  Page page;
2331  BTPageOpaque opaque;
2332 
2333  buf = _bt_getbuf(rel, blkno, BT_WRITE);
2334  page = BufferGetPage(buf);
2335  opaque = BTPageGetOpaque(page);
2336 
2337  Assert(heaprel != NULL);
2338  if (P_INCOMPLETE_SPLIT(opaque))
2339  {
2340  _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
2341  continue;
2342  }
2343 
2344  if (!P_IGNORE(opaque))
2345  {
2346  OffsetNumber offnum,
2347  minoff,
2348  maxoff;
2349  ItemId itemid;
2350  IndexTuple item;
2351 
2352  minoff = P_FIRSTDATAKEY(opaque);
2353  maxoff = PageGetMaxOffsetNumber(page);
2354 
2355  /*
2356  * start = InvalidOffsetNumber means "search the whole page". We
2357  * need this test anyway due to possibility that page has a high
2358  * key now when it didn't before.
2359  */
2360  if (start < minoff)
2361  start = minoff;
2362 
2363  /*
2364  * Need this check too, to guard against possibility that page
2365  * split since we visited it originally.
2366  */
2367  if (start > maxoff)
2368  start = OffsetNumberNext(maxoff);
2369 
2370  /*
2371  * These loops will check every item on the page --- but in an
2372  * order that's attuned to the probability of where it actually
2373  * is. Scan to the right first, then to the left.
2374  */
2375  for (offnum = start;
2376  offnum <= maxoff;
2377  offnum = OffsetNumberNext(offnum))
2378  {
2379  itemid = PageGetItemId(page, offnum);
2380  item = (IndexTuple) PageGetItem(page, itemid);
2381 
2382  if (BTreeTupleGetDownLink(item) == child)
2383  {
2384  /* Return accurate pointer to where link is now */
2385  stack->bts_blkno = blkno;
2386  stack->bts_offset = offnum;
2387  return buf;
2388  }
2389  }
2390 
2391  for (offnum = OffsetNumberPrev(start);
2392  offnum >= minoff;
2393  offnum = OffsetNumberPrev(offnum))
2394  {
2395  itemid = PageGetItemId(page, offnum);
2396  item = (IndexTuple) PageGetItem(page, itemid);
2397 
2398  if (BTreeTupleGetDownLink(item) == child)
2399  {
2400  /* Return accurate pointer to where link is now */
2401  stack->bts_blkno = blkno;
2402  stack->bts_offset = offnum;
2403  return buf;
2404  }
2405  }
2406  }
2407 
2408  /*
2409  * The item we're looking for moved right at least one page.
2410  *
2411  * Lehman and Yao couple/chain locks when moving right here, which we
2412  * can avoid. See nbtree/README.
2413  */
2414  if (P_RIGHTMOST(opaque))
2415  {
2416  _bt_relbuf(rel, buf);
2417  return InvalidBuffer;
2418  }
2419  blkno = opaque->btpo_next;
2420  start = InvalidOffsetNumber;
2421  _bt_relbuf(rel, buf);
2422  }
2423 }
void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:2241
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
BlockNumber bts_blkno
Definition: nbtree.h:734
OffsetNumber bts_offset
Definition: nbtree.h:735

References _bt_finish_split(), _bt_getbuf(), _bt_relbuf(), Assert, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, buf, BufferGetPage(), InvalidBuffer, InvalidOffsetNumber, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and start.

Referenced by _bt_insert_parent(), and _bt_lock_subtree_parent().
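
A hedged sketch of the calling pattern used by _bt_insert_parent(): after a page split, re-locate the parent page whose downlink points at the split page, getting back a write-locked buffer. The wrapper name relock_parent_for_split is hypothetical and error handling is abbreviated.

#include "access/nbtree.h"

static Buffer
relock_parent_for_split(Relation rel, Relation heaprel,
                        BTStack stack, BlockNumber child)
{
    /* On success the buffer is write-locked, and stack->bts_blkno and
     * stack->bts_offset have been updated to the downlink's current
     * location, which may have moved right since _bt_search() ran. */
    Buffer  pbuf = _bt_getstackbuf(rel, heaprel, stack, child);

    if (pbuf == InvalidBuffer)
        elog(ERROR, "failed to re-find parent key in index \"%s\"",
             RelationGetRelationName(rel));
    return pbuf;
}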

◆ _bt_gettrueroot()

Buffer _bt_gettrueroot ( Relation  rel)

Definition at line 580 of file nbtpage.c.

581 {
582  Buffer metabuf;
583  Page metapg;
584  BTPageOpaque metaopaque;
585  Buffer rootbuf;
586  Page rootpage;
587  BTPageOpaque rootopaque;
588  BlockNumber rootblkno;
589  uint32 rootlevel;
590  BTMetaPageData *metad;
591 
592  /*
593  * We don't try to use cached metapage data here, since (a) this path is
594  * not performance-critical, and (b) if we are here it suggests our cache
595  * is out-of-date anyway. In light of point (b), it's probably safest to
596  * actively flush any cached metapage info.
597  */
598  if (rel->rd_amcache)
599  pfree(rel->rd_amcache);
600  rel->rd_amcache = NULL;
601 
602  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
603  metapg = BufferGetPage(metabuf);
604  metaopaque = BTPageGetOpaque(metapg);
605  metad = BTPageGetMeta(metapg);
606 
607  if (!P_ISMETA(metaopaque) ||
608  metad->btm_magic != BTREE_MAGIC)
609  ereport(ERROR,
610  (errcode(ERRCODE_INDEX_CORRUPTED),
611  errmsg("index \"%s\" is not a btree",
612  RelationGetRelationName(rel))));
613 
614  if (metad->btm_version < BTREE_MIN_VERSION ||
615  metad->btm_version > BTREE_VERSION)
616  ereport(ERROR,
617  (errcode(ERRCODE_INDEX_CORRUPTED),
 618  errmsg("version mismatch in index \"%s\": file version %d, "
 619  "current version %d, minimal supported version %d",
 620  RelationGetRelationName(rel),
 621  metad->btm_version, BTREE_VERSION,
 622  BTREE_MIN_VERSION)));
623  /* if no root page initialized yet, fail */
624  if (metad->btm_root == P_NONE)
625  {
626  _bt_relbuf(rel, metabuf);
627  return InvalidBuffer;
628  }
629 
630  rootblkno = metad->btm_root;
631  rootlevel = metad->btm_level;
632 
633  /*
634  * We are done with the metapage; arrange to release it via first
635  * _bt_relandgetbuf call
636  */
637  rootbuf = metabuf;
638 
639  for (;;)
640  {
641  rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
642  rootpage = BufferGetPage(rootbuf);
643  rootopaque = BTPageGetOpaque(rootpage);
644 
645  if (!P_IGNORE(rootopaque))
646  break;
647 
648  /* it's dead, Jim. step right one page */
649  if (P_RIGHTMOST(rootopaque))
 650  elog(ERROR, "no live root page found in index \"%s\"",
 651  RelationGetRelationName(rel));
 652  rootblkno = rootopaque->btpo_next;
653  }
654 
655  if (rootopaque->btpo_level != rootlevel)
656  elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
657  rootblkno, RelationGetRelationName(rel),
658  rootopaque->btpo_level, rootlevel);
659 
660  return rootbuf;
661 }
#define P_ISMETA(opaque)
Definition: nbtree.h:223

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_VERSION, BufferGetPage(), elog, ereport, errcode(), errmsg(), ERROR, InvalidBuffer, P_IGNORE, P_ISMETA, P_NONE, P_RIGHTMOST, pfree(), RelationData::rd_amcache, and RelationGetRelationName.

Referenced by _bt_get_endpoint().
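
A minimal sketch, modeled loosely on the _bt_get_endpoint() caller, of consulting the true root rather than the cached fast root, e.g. before walking down an extreme edge of the tree. The helper name true_root_level is invented for illustration.

#include "access/nbtree.h"

static uint32
true_root_level(Relation rel)
{
    Buffer  rootbuf = _bt_gettrueroot(rel);
    uint32  level;

    if (!BufferIsValid(rootbuf))
        return 0;               /* no root page initialized yet */

    level = BTPageGetOpaque(BufferGetPage(rootbuf))->btpo_level;
    _bt_relbuf(rel, rootbuf);   /* release the read lock and pin */
    return level;
}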

◆ _bt_initmetapage()

void _bt_initmetapage ( Page  page,
BlockNumber  rootbknum,
uint32  level,
bool  allequalimage 
)

Definition at line 67 of file nbtpage.c.

69 {
70  BTMetaPageData *metad;
71  BTPageOpaque metaopaque;
72 
73  _bt_pageinit(page, BLCKSZ);
74 
75  metad = BTPageGetMeta(page);
76  metad->btm_magic = BTREE_MAGIC;
77  metad->btm_version = BTREE_VERSION;
78  metad->btm_root = rootbknum;
79  metad->btm_level = level;
80  metad->btm_fastroot = rootbknum;
 81  metad->btm_fastlevel = level;
 82  metad->btm_last_cleanup_num_delpages = 0;
 83  metad->btm_last_cleanup_num_heap_tuples = -1.0;
 84  metad->btm_allequalimage = allequalimage;
85 
86  metaopaque = BTPageGetOpaque(page);
87  metaopaque->btpo_flags = BTP_META;
88 
89  /*
90  * Set pd_lower just past the end of the metadata. This is essential,
91  * because without doing so, metadata will be lost if xlog.c compresses
 92  * the page.
93