PostgreSQL Source Code git master
nbtree.h File Reference
#include "access/amapi.h"
#include "access/itup.h"
#include "access/sdir.h"
#include "access/tableam.h"
#include "access/xlogreader.h"
#include "catalog/pg_am_d.h"
#include "catalog/pg_index.h"
#include "lib/stringinfo.h"
#include "storage/bufmgr.h"
#include "storage/shm_toc.h"
(Include dependency graphs, showing which files this header includes and which files include it, are not reproduced in this text rendering.)

Data Structures

struct  BTPageOpaqueData
 
struct  BTMetaPageData
 
struct  BTDeletedPageData
 
struct  BTPendingFSM
 
struct  BTVacState
 
struct  BTStackData
 
struct  BTScanInsertData
 
struct  BTInsertStateData
 
struct  BTDedupInterval
 
struct  BTDedupStateData
 
struct  BTVacuumPostingData
 
struct  BTScanPosItem
 
struct  BTScanPosData
 
struct  BTArrayKeyInfo
 
struct  BTScanOpaqueData
 
struct  BTReadPageState
 
struct  BTOptions
 

Macros

#define BTPageGetOpaque(page)   ((BTPageOpaque) PageGetSpecialPointer(page))
 
#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */
 
#define BTP_ROOT   (1 << 1) /* root page (has no parent) */
 
#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */
 
#define BTP_META   (1 << 3) /* meta-page */
 
#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */
 
#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */
 
#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples (deprecated) */
 
#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */
 
#define BTP_HAS_FULLXID   (1 << 8) /* contains BTDeletedPageData */
 
#define MAX_BT_CYCLE_ID   0xFF7F
 
#define BTPageGetMeta(p)    ((BTMetaPageData *) PageGetContents(p))
 
#define BTREE_METAPAGE   0 /* first page is meta */
 
#define BTREE_MAGIC   0x053162 /* magic number in metapage */
 
#define BTREE_VERSION   4 /* current version number */
 
#define BTREE_MIN_VERSION   2 /* minimum supported version */
 
#define BTREE_NOVAC_VERSION   3 /* version with all meta fields set */
 
#define BTMaxItemSize(page)
 
#define BTMaxItemSizeNoHeapTid(page)
 
#define MaxTIDsPerBTreePage
 
#define BTREE_MIN_FILLFACTOR   10
 
#define BTREE_DEFAULT_FILLFACTOR   90
 
#define BTREE_NONLEAF_FILLFACTOR   70
 
#define BTREE_SINGLEVAL_FILLFACTOR   96
 
#define P_NONE   0
 
#define P_LEFTMOST(opaque)   ((opaque)->btpo_prev == P_NONE)
 
#define P_RIGHTMOST(opaque)   ((opaque)->btpo_next == P_NONE)
 
#define P_ISLEAF(opaque)   (((opaque)->btpo_flags & BTP_LEAF) != 0)
 
#define P_ISROOT(opaque)   (((opaque)->btpo_flags & BTP_ROOT) != 0)
 
#define P_ISDELETED(opaque)   (((opaque)->btpo_flags & BTP_DELETED) != 0)
 
#define P_ISMETA(opaque)   (((opaque)->btpo_flags & BTP_META) != 0)
 
#define P_ISHALFDEAD(opaque)   (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)
 
#define P_IGNORE(opaque)   (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
 
#define P_HAS_GARBAGE(opaque)   (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
 
#define P_INCOMPLETE_SPLIT(opaque)   (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
 
#define P_HAS_FULLXID(opaque)   (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
 
#define P_HIKEY   ((OffsetNumber) 1)
 
#define P_FIRSTKEY   ((OffsetNumber) 2)
 
#define P_FIRSTDATAKEY(opaque)   (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
 
#define INDEX_ALT_TID_MASK   INDEX_AM_RESERVED_BIT
 
#define BT_OFFSET_MASK   0x0FFF
 
#define BT_STATUS_OFFSET_MASK   0xF000
 
#define BT_PIVOT_HEAP_TID_ATTR   0x1000
 
#define BT_IS_POSTING   0x2000
 
#define BTreeTupleGetNAtts(itup, rel)
 
#define BTCommuteStrategyNumber(strat)   (BTMaxStrategyNumber + 1 - (strat))
 
#define BTORDER_PROC   1
 
#define BTSORTSUPPORT_PROC   2
 
#define BTINRANGE_PROC   3
 
#define BTEQUALIMAGE_PROC   4
 
#define BTOPTIONS_PROC   5
 
#define BTNProcs   5
 
#define BT_READ   BUFFER_LOCK_SHARE
 
#define BT_WRITE   BUFFER_LOCK_EXCLUSIVE
 
#define BTScanPosIsPinned(scanpos)
 
#define BTScanPosUnpin(scanpos)
 
#define BTScanPosUnpinIfPinned(scanpos)
 
#define BTScanPosIsValid(scanpos)
 
#define BTScanPosInvalidate(scanpos)
 
#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */
 
#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */
 
#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */
 
#define SK_BT_DESC   (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
 
#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
 
#define BTGetFillFactor(relation)
 
#define BTGetTargetPageFreeSpace(relation)    (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)
 
#define BTGetDeduplicateItems(relation)
 
#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4
 
#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5
 
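
The page-flag macros above are normally used together: code obtains a page's
special space with BTPageGetOpaque() and then tests btpo_flags. A minimal
illustrative sketch, not part of nbtree.h (the helper name is hypothetical,
and the caller is assumed to hold at least a shared lock on the buffer):

#include "postgres.h"
#include "access/nbtree.h"
#include "storage/bufmgr.h"

static bool
btpage_is_usable_leaf(Buffer buf)
{
    Page         page = BufferGetPage(buf);
    BTPageOpaque opaque = BTPageGetOpaque(page);

    /* Skip deleted and half-dead pages, as index scans do via P_IGNORE() */
    if (P_IGNORE(opaque))
        return false;

    /* Only leaf pages hold tuples that point into the heap */
    return P_ISLEAF(opaque);
}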

Typedefs

typedef uint16 BTCycleId
 
typedef struct BTPageOpaqueData BTPageOpaqueData
 
typedef BTPageOpaqueData * BTPageOpaque
 
typedef struct BTMetaPageData BTMetaPageData
 
typedef struct BTDeletedPageData BTDeletedPageData
 
typedef struct BTPendingFSM BTPendingFSM
 
typedef struct BTVacState BTVacState
 
typedef struct BTStackData BTStackData
 
typedef BTStackData * BTStack
 
typedef struct BTScanInsertData BTScanInsertData
 
typedef BTScanInsertData * BTScanInsert
 
typedef struct BTInsertStateData BTInsertStateData
 
typedef BTInsertStateData * BTInsertState
 
typedef struct BTDedupInterval BTDedupInterval
 
typedef struct BTDedupStateData BTDedupStateData
 
typedef BTDedupStateData * BTDedupState
 
typedef struct BTVacuumPostingData BTVacuumPostingData
 
typedef BTVacuumPostingData * BTVacuumPosting
 
typedef struct BTScanPosItem BTScanPosItem
 
typedef struct BTScanPosData BTScanPosData
 
typedef BTScanPosData * BTScanPos
 
typedef struct BTArrayKeyInfo BTArrayKeyInfo
 
typedef struct BTScanOpaqueData BTScanOpaqueData
 
typedef BTScanOpaqueData * BTScanOpaque
 
typedef struct BTReadPageState BTReadPageState
 
typedef struct BTOptions BTOptions
 

Functions

static void BTPageSetDeleted (Page page, FullTransactionId safexid)
 
static FullTransactionId BTPageGetDeleteXid (Page page)
 
static bool BTPageIsRecyclable (Page page, Relation heaprel)
 
 StaticAssertDecl (BT_OFFSET_MASK >= INDEX_MAX_KEYS, "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS")
 
static bool BTreeTupleIsPivot (IndexTuple itup)
 
static bool BTreeTupleIsPosting (IndexTuple itup)
 
static void BTreeTupleSetPosting (IndexTuple itup, uint16 nhtids, int postingoffset)
 
static uint16 BTreeTupleGetNPosting (IndexTuple posting)
 
static uint32 BTreeTupleGetPostingOffset (IndexTuple posting)
 
static ItemPointer BTreeTupleGetPosting (IndexTuple posting)
 
static ItemPointer BTreeTupleGetPostingN (IndexTuple posting, int n)
 
static BlockNumber BTreeTupleGetDownLink (IndexTuple pivot)
 
static void BTreeTupleSetDownLink (IndexTuple pivot, BlockNumber blkno)
 
static void BTreeTupleSetNAtts (IndexTuple itup, uint16 nkeyatts, bool heaptid)
 
static BlockNumber BTreeTupleGetTopParent (IndexTuple leafhikey)
 
static void BTreeTupleSetTopParent (IndexTuple leafhikey, BlockNumber blkno)
 
static ItemPointer BTreeTupleGetHeapTID (IndexTuple itup)
 
static ItemPointer BTreeTupleGetMaxHeapTID (IndexTuple itup)
 
void btbuildempty (Relation index)
 
bool btinsert (Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo)
 
IndexScanDesc btbeginscan (Relation rel, int nkeys, int norderbys)
 
Size btestimateparallelscan (int nkeys, int norderbys)
 
void btinitparallelscan (void *target)
 
bool btgettuple (IndexScanDesc scan, ScanDirection dir)
 
int64 btgetbitmap (IndexScanDesc scan, TIDBitmap *tbm)
 
void btrescan (IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
 
void btparallelrescan (IndexScanDesc scan)
 
void btendscan (IndexScanDesc scan)
 
void btmarkpos (IndexScanDesc scan)
 
void btrestrpos (IndexScanDesc scan)
 
IndexBulkDeleteResult * btbulkdelete (IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
 
IndexBulkDeleteResult * btvacuumcleanup (IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
bool btcanreturn (Relation index, int attno)
 
int btgettreeheight (Relation rel)
 
bool _bt_parallel_seize (IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
 
void _bt_parallel_release (IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
 
void _bt_parallel_done (IndexScanDesc scan)
 
void _bt_parallel_primscan_schedule (IndexScanDesc scan, BlockNumber curr_page)
 
void _bt_dedup_pass (Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, bool bottomupdedup)
 
bool _bt_bottomupdel_pass (Relation rel, Buffer buf, Relation heapRel, Size newitemsz)
 
void _bt_dedup_start_pending (BTDedupState state, IndexTuple base, OffsetNumber baseoff)
 
bool _bt_dedup_save_htid (BTDedupState state, IndexTuple itup)
 
Size _bt_dedup_finish_pending (Page newpage, BTDedupState state)
 
IndexTuple _bt_form_posting (IndexTuple base, ItemPointer htids, int nhtids)
 
void _bt_update_posting (BTVacuumPosting vacposting)
 
IndexTuple _bt_swap_posting (IndexTuple newitem, IndexTuple oposting, int postingoff)
 
bool _bt_doinsert (Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
 
void _bt_finish_split (Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
 
Buffer _bt_getstackbuf (Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
 
OffsetNumber _bt_findsplitloc (Relation rel, Page origpage, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
 
void _bt_initmetapage (Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
 
bool _bt_vacuum_needs_cleanup (Relation rel)
 
void _bt_set_cleanup_info (Relation rel, BlockNumber num_delpages)
 
void _bt_upgrademetapage (Page page)
 
Buffer _bt_getroot (Relation rel, Relation heaprel, int access)
 
Buffer _bt_gettrueroot (Relation rel)
 
int _bt_getrootheight (Relation rel)
 
void _bt_metaversion (Relation rel, bool *heapkeyspace, bool *allequalimage)
 
void _bt_checkpage (Relation rel, Buffer buf)
 
Buffer _bt_getbuf (Relation rel, BlockNumber blkno, int access)
 
Buffer _bt_allocbuf (Relation rel, Relation heaprel)
 
Buffer _bt_relandgetbuf (Relation rel, Buffer obuf, BlockNumber blkno, int access)
 
void _bt_relbuf (Relation rel, Buffer buf)
 
void _bt_lockbuf (Relation rel, Buffer buf, int access)
 
void _bt_unlockbuf (Relation rel, Buffer buf)
 
bool _bt_conditionallockbuf (Relation rel, Buffer buf)
 
void _bt_upgradelockbufcleanup (Relation rel, Buffer buf)
 
void _bt_pageinit (Page page, Size size)
 
void _bt_delitems_vacuum (Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
 
void _bt_delitems_delete_check (Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
 
void _bt_pagedel (Relation rel, Buffer leafbuf, BTVacState *vstate)
 
void _bt_pendingfsm_init (Relation rel, BTVacState *vstate, bool cleanuponly)
 
void _bt_pendingfsm_finalize (Relation rel, BTVacState *vstate)
 
void _bt_preprocess_keys (IndexScanDesc scan)
 
BTStack _bt_search (Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
 
OffsetNumber _bt_binsrch_insert (Relation rel, BTInsertState insertstate)
 
int32 _bt_compare (Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost)
 
BTScanInsert _bt_mkscankey (Relation rel, IndexTuple itup)
 
void _bt_freestack (BTStack stack)
 
bool _bt_start_prim_scan (IndexScanDesc scan, ScanDirection dir)
 
int _bt_binsrch_array_skey (FmgrInfo *orderproc, bool cur_elem_trig, ScanDirection dir, Datum tupdatum, bool tupnull, BTArrayKeyInfo *array, ScanKey cur, int32 *set_elem_result)
 
void _bt_start_array_keys (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_checkkeys (IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, IndexTuple tuple, int tupnatts)
 
bool _bt_oppodir_checkkeys (IndexScanDesc scan, ScanDirection dir, IndexTuple finaltup)
 
void _bt_killitems (IndexScanDesc scan)
 
BTCycleId _bt_vacuum_cycleid (Relation rel)
 
BTCycleId _bt_start_vacuum (Relation rel)
 
void _bt_end_vacuum (Relation rel)
 
void _bt_end_vacuum_callback (int code, Datum arg)
 
Size BTreeShmemSize (void)
 
void BTreeShmemInit (void)
 
bytea * btoptions (Datum reloptions, bool validate)
 
bool btproperty (Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull)
 
char * btbuildphasename (int64 phasenum)
 
IndexTuple _bt_truncate (Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
 
int _bt_keep_natts_fast (Relation rel, IndexTuple lastleft, IndexTuple firstright)
 
bool _bt_check_natts (Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
 
void _bt_check_third_page (Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
 
bool _bt_allequalimage (Relation rel, bool debugmessage)
 
bool btvalidate (Oid opclassoid)
 
void btadjustmembers (Oid opfamilyoid, Oid opclassoid, List *operators, List *functions)
 
IndexBuildResult * btbuild (Relation heap, Relation index, struct IndexInfo *indexInfo)
 
void _bt_parallel_build_main (dsm_segment *seg, shm_toc *toc)
 

Macro Definition Documentation

◆ BT_IS_POSTING

#define BT_IS_POSTING   0x2000

Definition at line 466 of file nbtree.h.

◆ BT_OFFSET_MASK

#define BT_OFFSET_MASK   0x0FFF

Definition at line 462 of file nbtree.h.

◆ BT_PIVOT_HEAP_TID_ATTR

#define BT_PIVOT_HEAP_TID_ATTR   0x1000

Definition at line 465 of file nbtree.h.

◆ BT_READ

#define BT_READ   BUFFER_LOCK_SHARE

Definition at line 724 of file nbtree.h.

◆ BT_STATUS_OFFSET_MASK

#define BT_STATUS_OFFSET_MASK   0xF000

Definition at line 463 of file nbtree.h.

◆ BT_WRITE

#define BT_WRITE   BUFFER_LOCK_EXCLUSIVE

Definition at line 725 of file nbtree.h.

◆ BTCommuteStrategyNumber

#define BTCommuteStrategyNumber (   strat)    (BTMaxStrategyNumber + 1 - (strat))

Definition at line 685 of file nbtree.h.

◆ BTEQUALIMAGE_PROC

#define BTEQUALIMAGE_PROC   4

Definition at line 715 of file nbtree.h.

◆ BTGetDeduplicateItems

#define BTGetDeduplicateItems (   relation)
Value:
(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
relation->rd_rel->relam == BTREE_AM_OID), \
((relation)->rd_options ? \
((BTOptions *) (relation)->rd_options)->deduplicate_items : true))

Definition at line 1141 of file nbtree.h.

◆ BTGetFillFactor

#define BTGetFillFactor (   relation)
Value:
(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
relation->rd_rel->relam == BTREE_AM_OID), \
(relation)->rd_options ? \
((BTOptions *) (relation)->rd_options)->fillfactor : \
BTREE_DEFAULT_FILLFACTOR)

Definition at line 1133 of file nbtree.h.

◆ BTGetTargetPageFreeSpace

#define BTGetTargetPageFreeSpace (   relation)     (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)

Definition at line 1139 of file nbtree.h.

◆ BTINRANGE_PROC

#define BTINRANGE_PROC   3

Definition at line 714 of file nbtree.h.

◆ BTMaxItemSize

#define BTMaxItemSize (   page)
Value:
(MAXALIGN_DOWN((PageGetPageSize(page) - \
                MAXALIGN(SizeOfPageHeaderData + \
                         3*sizeof(ItemIdData)) - \
                MAXALIGN(sizeof(BTPageOpaqueData))) / 3) - \
                MAXALIGN(sizeof(ItemPointerData)))

Definition at line 164 of file nbtree.h.
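
Worked example (illustrative, not from the header): with the default 8192-byte
BLCKSZ and 8-byte maximum alignment, the limit works out as follows.

/*
 * BTMaxItemSize on an 8 kB page, step by step:
 *
 *   PageGetPageSize(page)                            8192
 *   - MAXALIGN(SizeOfPageHeaderData + 3 line ptrs)    -40   (24 + 3*4 = 36 -> 40)
 *   - MAXALIGN(sizeof(BTPageOpaqueData))              -16
 *   = 8136;  divided by 3 = 2712;  MAXALIGN_DOWN keeps 2712
 *   - MAXALIGN(sizeof(ItemPointerData))                -8   (6 -> 8)
 *   = 2704 bytes per index tuple
 *
 * This is the "1/3 of a buffer page" limit that _bt_check_third_page()'s
 * error hint (below) refers to.
 */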

◆ BTMaxItemSizeNoHeapTid

#define BTMaxItemSizeNoHeapTid (   page)
Value:
MAXALIGN_DOWN((PageGetPageSize(page) - \
               MAXALIGN(SizeOfPageHeaderData + \
                        3*sizeof(ItemIdData)) - \
               MAXALIGN(sizeof(BTPageOpaqueData))) / 3)

Definition at line 169 of file nbtree.h.

◆ BTNProcs

#define BTNProcs   5

Definition at line 717 of file nbtree.h.

◆ BTOPTIONS_PROC

#define BTOPTIONS_PROC   5

Definition at line 716 of file nbtree.h.

◆ BTORDER_PROC

#define BTORDER_PROC   1

Definition at line 712 of file nbtree.h.

◆ BTP_DELETED

#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */

Definition at line 78 of file nbtree.h.

◆ BTP_HALF_DEAD

#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */

Definition at line 80 of file nbtree.h.

◆ BTP_HAS_FULLXID

#define BTP_HAS_FULLXID   (1 << 8) /* contains BTDeletedPageData */

Definition at line 84 of file nbtree.h.

◆ BTP_HAS_GARBAGE

#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples (deprecated) */

Definition at line 82 of file nbtree.h.

◆ BTP_INCOMPLETE_SPLIT

#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */

Definition at line 83 of file nbtree.h.

◆ BTP_LEAF

#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */

Definition at line 76 of file nbtree.h.

◆ BTP_META

#define BTP_META   (1 << 3) /* meta-page */

Definition at line 79 of file nbtree.h.

◆ BTP_ROOT

#define BTP_ROOT   (1 << 1) /* root page (has no parent) */

Definition at line 77 of file nbtree.h.

◆ BTP_SPLIT_END

#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */

Definition at line 81 of file nbtree.h.

◆ BTPageGetMeta

#define BTPageGetMeta (   p)     ((BTMetaPageData *) PageGetContents(p))

Definition at line 121 of file nbtree.h.

◆ BTPageGetOpaque

#define BTPageGetOpaque (   page)    ((BTPageOpaque) PageGetSpecialPointer(page))

Definition at line 73 of file nbtree.h.

◆ BTREE_DEFAULT_FILLFACTOR

#define BTREE_DEFAULT_FILLFACTOR   90

Definition at line 200 of file nbtree.h.

◆ BTREE_MAGIC

#define BTREE_MAGIC   0x053162 /* magic number in metapage */

Definition at line 149 of file nbtree.h.

◆ BTREE_METAPAGE

#define BTREE_METAPAGE   0 /* first page is meta */

Definition at line 148 of file nbtree.h.

◆ BTREE_MIN_FILLFACTOR

#define BTREE_MIN_FILLFACTOR   10

Definition at line 199 of file nbtree.h.

◆ BTREE_MIN_VERSION

#define BTREE_MIN_VERSION   2 /* minimum supported version */

Definition at line 151 of file nbtree.h.

◆ BTREE_NONLEAF_FILLFACTOR

#define BTREE_NONLEAF_FILLFACTOR   70

Definition at line 201 of file nbtree.h.

◆ BTREE_NOVAC_VERSION

#define BTREE_NOVAC_VERSION   3 /* version with all meta fields set */

Definition at line 152 of file nbtree.h.

◆ BTREE_SINGLEVAL_FILLFACTOR

#define BTREE_SINGLEVAL_FILLFACTOR   96

Definition at line 202 of file nbtree.h.

◆ BTREE_VERSION

#define BTREE_VERSION   4 /* current version number */

Definition at line 150 of file nbtree.h.

◆ BTreeTupleGetNAtts

#define BTreeTupleGetNAtts (   itup,
  rel 
)
Value:
( \
  (BTreeTupleIsPivot(itup)) ? \
  ( \
    ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_OFFSET_MASK \
  ) \
  : \
  IndexRelationGetNumberOfAttributes(rel) \
)

Definition at line 577 of file nbtree.h.

◆ BTScanPosInvalidate

#define BTScanPosInvalidate (   scanpos)
Value:
do { \
(scanpos).buf = InvalidBuffer; \
(scanpos).currPage = InvalidBlockNumber; \
} while (0)

Definition at line 1021 of file nbtree.h.

◆ BTScanPosIsPinned

#define BTScanPosIsPinned (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BufferIsValid((scanpos).buf) \
)

Definition at line 998 of file nbtree.h.

◆ BTScanPosIsValid

#define BTScanPosIsValid (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BlockNumberIsValid((scanpos).currPage) \
)

Definition at line 1015 of file nbtree.h.

◆ BTScanPosUnpin

#define BTScanPosUnpin (   scanpos)
Value:
do { \
ReleaseBuffer((scanpos).buf); \
(scanpos).buf = InvalidBuffer; \
} while (0)

Definition at line 1004 of file nbtree.h.

◆ BTScanPosUnpinIfPinned

#define BTScanPosUnpinIfPinned (   scanpos)
Value:
do { \
if (BTScanPosIsPinned(scanpos)) \
BTScanPosUnpin(scanpos); \
} while (0)

Definition at line 1009 of file nbtree.h.
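
Taken together, these BTScanPos macros manage a scan position's buffer pin and
validity. A minimal sketch of the cleanup sequence (the helper name is
hypothetical; btrescan() performs the equivalent steps on so->currPos):

static void
release_scan_position(BTScanOpaque so)
{
    /* Drop the buffer pin, but only if one is still held */
    BTScanPosUnpinIfPinned(so->currPos);

    /* Mark the position unusable until the next _bt_first() */
    BTScanPosInvalidate(so->currPos);
}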

◆ BTSORTSUPPORT_PROC

#define BTSORTSUPPORT_PROC   2

Definition at line 713 of file nbtree.h.

◆ INDEX_ALT_TID_MASK

#define INDEX_ALT_TID_MASK   INDEX_AM_RESERVED_BIT

Definition at line 459 of file nbtree.h.

◆ MAX_BT_CYCLE_ID

#define MAX_BT_CYCLE_ID   0xFF7F

Definition at line 93 of file nbtree.h.

◆ MaxTIDsPerBTreePage

#define MaxTIDsPerBTreePage
Value:
(int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \
sizeof(ItemPointerData))

Definition at line 185 of file nbtree.h.
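
Worked example (illustrative): with the default 8192-byte BLCKSZ, the limit
evaluates to 1358 heap TIDs per leaf page.

/*
 * MaxTIDsPerBTreePage on an 8 kB page:
 *
 *   BLCKSZ                        8192
 *   - SizeOfPageHeaderData         -24
 *   - sizeof(BTPageOpaqueData)     -16
 *   = 8152;  / sizeof(ItemPointerData) (6) = 1358, truncated to int
 */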

◆ P_FIRSTDATAKEY

#define P_FIRSTDATAKEY (   opaque)    (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)

Definition at line 369 of file nbtree.h.

◆ P_FIRSTKEY

#define P_FIRSTKEY   ((OffsetNumber) 2)

Definition at line 368 of file nbtree.h.

◆ P_HAS_FULLXID

#define P_HAS_FULLXID (   opaque)    (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)

Definition at line 228 of file nbtree.h.

◆ P_HAS_GARBAGE

#define P_HAS_GARBAGE (   opaque)    (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)

Definition at line 226 of file nbtree.h.

◆ P_HIKEY

#define P_HIKEY   ((OffsetNumber) 1)

Definition at line 367 of file nbtree.h.

◆ P_IGNORE

#define P_IGNORE (   opaque)    (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)

Definition at line 225 of file nbtree.h.

◆ P_INCOMPLETE_SPLIT

#define P_INCOMPLETE_SPLIT (   opaque)    (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)

Definition at line 227 of file nbtree.h.

◆ P_ISDELETED

#define P_ISDELETED (   opaque)    (((opaque)->btpo_flags & BTP_DELETED) != 0)

Definition at line 222 of file nbtree.h.

◆ P_ISHALFDEAD

#define P_ISHALFDEAD (   opaque)    (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)

Definition at line 224 of file nbtree.h.

◆ P_ISLEAF

#define P_ISLEAF (   opaque)    (((opaque)->btpo_flags & BTP_LEAF) != 0)

Definition at line 220 of file nbtree.h.

◆ P_ISMETA

#define P_ISMETA (   opaque)    (((opaque)->btpo_flags & BTP_META) != 0)

Definition at line 223 of file nbtree.h.

◆ P_ISROOT

#define P_ISROOT (   opaque)    (((opaque)->btpo_flags & BTP_ROOT) != 0)

Definition at line 221 of file nbtree.h.

◆ P_LEFTMOST

#define P_LEFTMOST (   opaque)    ((opaque)->btpo_prev == P_NONE)

Definition at line 218 of file nbtree.h.

◆ P_NONE

#define P_NONE   0

Definition at line 212 of file nbtree.h.

◆ P_RIGHTMOST

#define P_RIGHTMOST (   opaque)    ((opaque)->btpo_next == P_NONE)

Definition at line 219 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN

#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2

Definition at line 1152 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_LEAF_LOAD

#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5

Definition at line 1155 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_1

#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3

Definition at line 1153 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_2

#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4

Definition at line 1154 of file nbtree.h.

◆ SK_BT_DESC

#define SK_BT_DESC   (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)

Definition at line 1122 of file nbtree.h.

◆ SK_BT_INDOPTION_SHIFT

#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */

Definition at line 1121 of file nbtree.h.

◆ SK_BT_NULLS_FIRST

#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)

Definition at line 1123 of file nbtree.h.

◆ SK_BT_REQBKWD

#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */

Definition at line 1120 of file nbtree.h.

◆ SK_BT_REQFWD

#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */

Definition at line 1119 of file nbtree.h.

Typedef Documentation

◆ BTArrayKeyInfo

typedef struct BTArrayKeyInfo BTArrayKeyInfo

◆ BTCycleId

typedef uint16 BTCycleId

Definition at line 29 of file nbtree.h.

◆ BTDedupInterval

typedef struct BTDedupInterval BTDedupInterval

◆ BTDedupState

typedef BTDedupStateData* BTDedupState

Definition at line 898 of file nbtree.h.

◆ BTDedupStateData

typedef struct BTDedupStateData BTDedupStateData

◆ BTDeletedPageData

typedef struct BTDeletedPageData BTDeletedPageData

◆ BTInsertState

typedef BTInsertStateData* BTInsertState

Definition at line 840 of file nbtree.h.

◆ BTInsertStateData

typedef struct BTInsertStateData BTInsertStateData

◆ BTMetaPageData

typedef struct BTMetaPageData BTMetaPageData

◆ BTOptions

typedef struct BTOptions BTOptions

◆ BTPageOpaque

typedef BTPageOpaqueData* BTPageOpaque

Definition at line 71 of file nbtree.h.

◆ BTPageOpaqueData

typedef struct BTPageOpaqueData BTPageOpaqueData

◆ BTPendingFSM

typedef struct BTPendingFSM BTPendingFSM

◆ BTReadPageState

typedef struct BTReadPageState BTReadPageState

◆ BTScanInsert

typedef BTScanInsertData* BTScanInsert

Definition at line 801 of file nbtree.h.

◆ BTScanInsertData

typedef struct BTScanInsertData BTScanInsertData

◆ BTScanOpaque

typedef BTScanOpaqueData* BTScanOpaque

Definition at line 1078 of file nbtree.h.

◆ BTScanOpaqueData

typedef struct BTScanOpaqueData BTScanOpaqueData

◆ BTScanPos

typedef BTScanPosData* BTScanPos

Definition at line 996 of file nbtree.h.

◆ BTScanPosData

typedef struct BTScanPosData BTScanPosData

◆ BTScanPosItem

typedef struct BTScanPosItem BTScanPosItem

◆ BTStack

typedef BTStackData* BTStack

Definition at line 744 of file nbtree.h.

◆ BTStackData

typedef struct BTStackData BTStackData

◆ BTVacState

typedef struct BTVacState BTVacState

◆ BTVacuumPosting

typedef BTVacuumPostingData* BTVacuumPosting

Definition at line 919 of file nbtree.h.

◆ BTVacuumPostingData

typedef struct BTVacuumPostingData BTVacuumPostingData

Function Documentation

◆ _bt_allequalimage()

bool _bt_allequalimage ( Relation  rel,
bool  debugmessage 
)

Definition at line 3297 of file nbtutils.c.

3298{
3299 bool allequalimage = true;
3300
3301 /* INCLUDE indexes can never support deduplication */
3302 if (IndexRelationGetNumberOfAttributes(rel) !=
3303 IndexRelationGetNumberOfKeyAttributes(rel))
3304 return false;
3305
3306 for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++)
3307 {
3308 Oid opfamily = rel->rd_opfamily[i];
3309 Oid opcintype = rel->rd_opcintype[i];
3310 Oid collation = rel->rd_indcollation[i];
3311 Oid equalimageproc;
3312
3313 equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype,
3314 BTEQUALIMAGE_PROC);
3315
3316 /*
3317 * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to
3318 * be unsafe. Otherwise, actually call proc and see what it says.
3319 */
3320 if (!OidIsValid(equalimageproc) ||
3321 !DatumGetBool(OidFunctionCall1Coll(equalimageproc, collation,
3322 ObjectIdGetDatum(opcintype))))
3323 {
3324 allequalimage = false;
3325 break;
3326 }
3327 }
3328
3329 if (debugmessage)
3330 {
3331 if (allequalimage)
3332 elog(DEBUG1, "index \"%s\" can safely use deduplication",
3333 RelationGetRelationName(rel));
3334 else
3335 elog(DEBUG1, "index \"%s\" cannot use deduplication",
3336 RelationGetRelationName(rel));
3337 }
3338
3339 return allequalimage;
3340}

References BTEQUALIMAGE_PROC, DatumGetBool(), DEBUG1, elog, get_opfamily_proc(), i, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, ObjectIdGetDatum(), OidFunctionCall1Coll(), OidIsValid, RelationData::rd_indcollation, RelationData::rd_opcintype, RelationData::rd_opfamily, and RelationGetRelationName.

Referenced by _bt_leafbuild(), bt_index_check_internal(), and btbuildempty().
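
A sketch of the typical call pattern, roughly following btbuildempty() (one of
the callers listed above; treating the metapage as a plain palloc'd page is a
simplification):

Page metapage = (Page) palloc(BLCKSZ);

/*
 * Decide once, at index build time, whether deduplication is safe, then
 * record the decision durably in the metapage (P_NONE: no root page yet).
 */
_bt_initmetapage(metapage, P_NONE, 0,
                 _bt_allequalimage(index, false));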

◆ _bt_allocbuf()

Buffer _bt_allocbuf ( Relation  rel,
Relation  heaprel 
)

Definition at line 869 of file nbtpage.c.

870{
871 Buffer buf;
872 BlockNumber blkno;
873 Page page;
874
875 Assert(heaprel != NULL);
876
877 /*
878 * First see if the FSM knows of any free pages.
879 *
880 * We can't trust the FSM's report unreservedly; we have to check that the
881 * page is still free. (For example, an already-free page could have been
882 * re-used between the time the last VACUUM scanned it and the time the
883 * VACUUM made its FSM updates.)
884 *
885 * In fact, it's worse than that: we can't even assume that it's safe to
886 * take a lock on the reported page. If somebody else has a lock on it,
887 * or even worse our own caller does, we could deadlock. (The own-caller
888 * scenario is actually not improbable. Consider an index on a serial or
889 * timestamp column. Nearly all splits will be at the rightmost page, so
890 * it's entirely likely that _bt_split will call us while holding a lock
891 * on the page most recently acquired from FSM. A VACUUM running
892 * concurrently with the previous split could well have placed that page
893 * back in FSM.)
894 *
895 * To get around that, we ask for only a conditional lock on the reported
896 * page. If we fail, then someone else is using the page, and we may
897 * reasonably assume it's not free. (If we happen to be wrong, the worst
898 * consequence is the page will be lost to use till the next VACUUM, which
899 * is no big problem.)
900 */
901 for (;;)
902 {
903 blkno = GetFreeIndexPage(rel);
904 if (blkno == InvalidBlockNumber)
905 break;
906 buf = ReadBuffer(rel, blkno);
907 if (_bt_conditionallockbuf(rel, buf))
908 {
909 page = BufferGetPage(buf);
910
911 /*
912 * It's possible to find an all-zeroes page in an index. For
913 * example, a backend might successfully extend the relation one
914 * page and then crash before it is able to make a WAL entry for
915 * adding the page. If we find a zeroed page then reclaim it
916 * immediately.
917 */
918 if (PageIsNew(page))
919 {
920 /* Okay to use page. Initialize and return it. */
921 _bt_pageinit(page, BufferGetPageSize(buf));
922 return buf;
923 }
924
925 if (BTPageIsRecyclable(page, heaprel))
926 {
927 /*
928 * If we are generating WAL for Hot Standby then create a WAL
929 * record that will allow us to conflict with queries running
930 * on standby, in case they have snapshots older than safexid
931 * value
932 */
933 if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
934 {
935 xl_btree_reuse_page xlrec_reuse;
936
937 /*
938 * Note that we don't register the buffer with the record,
939 * because this operation doesn't modify the page (that
940 * already happened, back when VACUUM deleted the page).
941 * This record only exists to provide a conflict point for
942 * Hot Standby. See record REDO routine comments.
943 */
944 xlrec_reuse.locator = rel->rd_locator;
945 xlrec_reuse.block = blkno;
946 xlrec_reuse.snapshotConflictHorizon = BTPageGetDeleteXid(page);
947 xlrec_reuse.isCatalogRel =
948 RelationIsAccessibleInLogicalDecoding(heaprel);
949
950 XLogBeginInsert();
951 XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);
952
953 XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
954 }
955
956 /* Okay to use page. Re-initialize and return it. */
957 _bt_pageinit(page, BufferGetPageSize(buf));
958 return buf;
959 }
960 elog(DEBUG2, "FSM returned nonrecyclable page");
961 _bt_relbuf(rel, buf);
962 }
963 else
964 {
965 elog(DEBUG2, "FSM returned nonlockable page");
966 /* couldn't get lock, so just drop pin */
967 ReleaseBuffer(buf);
968 }
969 }
970
971 /*
972 * Extend the relation by one page. Need to use RBM_ZERO_AND_LOCK or we
973 * risk a race condition against btvacuumscan --- see comments therein.
974 * This forces us to repeat the valgrind request that _bt_lockbuf()
975 * otherwise would make, as we can't use _bt_lockbuf() without introducing
976 * a race.
977 */
978 buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);
979 if (!RelationUsesLocalBuffers(rel))
980 VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
981
982 /* Initialize the new page before returning it */
983 page = BufferGetPage(buf);
984 Assert(PageIsNew(page));
985 _bt_pageinit(page, BufferGetPageSize(buf));
986
987 return buf;
988}

References _bt_conditionallockbuf(), _bt_pageinit(), _bt_relbuf(), Assert, xl_btree_reuse_page::block, BMR_REL, BTPageGetDeleteXid(), BTPageIsRecyclable(), buf, BufferGetPage(), BufferGetPageSize(), DEBUG2, EB_LOCK_FIRST, elog, ExtendBufferedRel(), GetFreeIndexPage(), InvalidBlockNumber, xl_btree_reuse_page::isCatalogRel, xl_btree_reuse_page::locator, MAIN_FORKNUM, PageIsNew(), RelationData::rd_locator, ReadBuffer(), RelationIsAccessibleInLogicalDecoding, RelationNeedsWAL, RelationUsesLocalBuffers, ReleaseBuffer(), SizeOfBtreeReusePage, xl_btree_reuse_page::snapshotConflictHorizon, VALGRIND_MAKE_MEM_DEFINED, XLOG_BTREE_REUSE_PAGE, XLogBeginInsert(), XLogInsert(), XLogRegisterData(), and XLogStandbyInfoActive.

Referenced by _bt_getroot(), _bt_newlevel(), and _bt_split().
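
A sketch of the caller side, roughly as _bt_split() obtains its new right
sibling page (WAL logging and error handling omitted; the variable names are
hypothetical):

Buffer       rbuf = _bt_allocbuf(rel, heaprel);   /* pinned, write-locked, initialized */
Page         rightpage = BufferGetPage(rbuf);
BTPageOpaque ropaque = BTPageGetOpaque(rightpage);

/* ... set btpo_flags and sibling links, move tuples onto the new page ... */

_bt_relbuf(rel, rbuf);      /* unlock and unpin when finished */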

◆ _bt_binsrch_array_skey()

int _bt_binsrch_array_skey ( FmgrInfo orderproc,
bool  cur_elem_trig,
ScanDirection  dir,
Datum  tupdatum,
bool  tupnull,
BTArrayKeyInfo array,
ScanKey  cur,
int32 set_elem_result 
)

Definition at line 271 of file nbtutils.c.

276{
277 int low_elem = 0,
278 mid_elem = -1,
279 high_elem = array->num_elems - 1,
280 result = 0;
281 Datum arrdatum;
282
283 Assert(cur->sk_flags & SK_SEARCHARRAY);
284 Assert(cur->sk_strategy == BTEqualStrategyNumber);
285
286 if (cur_elem_trig)
287 {
288 Assert(!ScanDirectionIsNoMovement(dir));
289 Assert(cur->sk_flags & SK_BT_REQFWD);
290
291 /*
292 * When the scan key that triggered array advancement is a required
293 * array scan key, it is now certain that the current array element
294 * (plus all prior elements relative to the current scan direction)
295 * cannot possibly be at or ahead of the corresponding tuple value.
296 * (_bt_checkkeys must have called _bt_tuple_before_array_skeys, which
297 * makes sure this is true as a condition of advancing the arrays.)
298 *
299 * This makes it safe to exclude array elements up to and including
300 * the former-current array element from our search.
301 *
302 * Separately, when array advancement was triggered by a required scan
303 * key, the array element immediately after the former-current element
304 * is often either an exact tupdatum match, or a "close by" near-match
305 * (a near-match tupdatum is one whose key space falls _between_ the
306 * former-current and new-current array elements). We'll detect both
307 * cases via an optimistic comparison of the new search lower bound
308 * (or new search upper bound in the case of backwards scans).
309 */
310 if (ScanDirectionIsForward(dir))
311 {
312 low_elem = array->cur_elem + 1; /* old cur_elem exhausted */
313
314 /* Compare prospective new cur_elem (also the new lower bound) */
315 if (high_elem >= low_elem)
316 {
317 arrdatum = array->elem_values[low_elem];
318 result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
319 arrdatum, cur);
320
321 if (result <= 0)
322 {
323 /* Optimistic comparison optimization worked out */
324 *set_elem_result = result;
325 return low_elem;
326 }
327 mid_elem = low_elem;
328 low_elem++; /* this cur_elem exhausted, too */
329 }
330
331 if (high_elem < low_elem)
332 {
333 /* Caller needs to perform "beyond end" array advancement */
334 *set_elem_result = 1;
335 return high_elem;
336 }
337 }
338 else
339 {
340 high_elem = array->cur_elem - 1; /* old cur_elem exhausted */
341
342 /* Compare prospective new cur_elem (also the new upper bound) */
343 if (high_elem >= low_elem)
344 {
345 arrdatum = array->elem_values[high_elem];
346 result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
347 arrdatum, cur);
348
349 if (result >= 0)
350 {
351 /* Optimistic comparison optimization worked out */
352 *set_elem_result = result;
353 return high_elem;
354 }
355 mid_elem = high_elem;
356 high_elem--; /* this cur_elem exhausted, too */
357 }
358
359 if (high_elem < low_elem)
360 {
361 /* Caller needs to perform "beyond end" array advancement */
362 *set_elem_result = -1;
363 return low_elem;
364 }
365 }
366 }
367
368 while (high_elem > low_elem)
369 {
370 mid_elem = low_elem + ((high_elem - low_elem) / 2);
371 arrdatum = array->elem_values[mid_elem];
372
373 result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
374 arrdatum, cur);
375
376 if (result == 0)
377 {
378 /*
379 * It's safe to quit as soon as we see an equal array element.
380 * This often saves an extra comparison or two...
381 */
382 low_elem = mid_elem;
383 break;
384 }
385
386 if (result > 0)
387 low_elem = mid_elem + 1;
388 else
389 high_elem = mid_elem;
390 }
391
392 /*
393 * ...but our caller also cares about how its searched-for tuple datum
394 * compares to the low_elem datum. Must always set *set_elem_result with
395 * the result of that comparison specifically.
396 */
397 if (low_elem != mid_elem)
398 result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
399 array->elem_values[low_elem], cur);
400
401 *set_elem_result = result;
402
403 return low_elem;
404}

References _bt_compare_array_skey(), Assert, BTEqualStrategyNumber, cur, BTArrayKeyInfo::cur_elem, BTArrayKeyInfo::elem_values, BTArrayKeyInfo::num_elems, ScanDirectionIsForward, ScanDirectionIsNoMovement, SK_BT_REQFWD, and SK_SEARCHARRAY.

Referenced by _bt_advance_array_keys(), and _bt_compare_array_scankey_args().

◆ _bt_binsrch_insert()

OffsetNumber _bt_binsrch_insert ( Relation  rel,
BTInsertState  insertstate 
)

Definition at line 474 of file nbtsearch.c.

475{
476 BTScanInsert key = insertstate->itup_key;
477 Page page;
478 BTPageOpaque opaque;
479 OffsetNumber low,
480 high,
481 stricthigh;
482 int32 result,
483 cmpval;
484
485 page = BufferGetPage(insertstate->buf);
486 opaque = BTPageGetOpaque(page);
487
488 Assert(P_ISLEAF(opaque));
489 Assert(!key->nextkey);
490 Assert(insertstate->postingoff == 0);
491
492 if (!insertstate->bounds_valid)
493 {
494 /* Start new binary search */
495 low = P_FIRSTDATAKEY(opaque);
496 high = PageGetMaxOffsetNumber(page);
497 }
498 else
499 {
500 /* Restore result of previous binary search against same page */
501 low = insertstate->low;
502 high = insertstate->stricthigh;
503 }
504
505 /* If there are no keys on the page, return the first available slot */
506 if (unlikely(high < low))
507 {
508 /* Caller can't reuse bounds */
509 insertstate->low = InvalidOffsetNumber;
510 insertstate->stricthigh = InvalidOffsetNumber;
511 insertstate->bounds_valid = false;
512 return low;
513 }
514
515 /*
516 * Binary search to find the first key on the page >= scan key. (nextkey
517 * is always false when inserting).
518 *
519 * The loop invariant is: all slots before 'low' are < scan key, all slots
520 * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
521 * maintained to save additional search effort for caller.
522 *
523 * We can fall out when high == low.
524 */
525 if (!insertstate->bounds_valid)
526 high++; /* establish the loop invariant for high */
527 stricthigh = high; /* high initially strictly higher */
528
529 cmpval = 1; /* !nextkey comparison value */
530
531 while (high > low)
532 {
533 OffsetNumber mid = low + ((high - low) / 2);
534
535 /* We have low <= mid < high, so mid points at a real slot */
536
537 result = _bt_compare(rel, key, page, mid);
538
539 if (result >= cmpval)
540 low = mid + 1;
541 else
542 {
543 high = mid;
544 if (result != 0)
545 stricthigh = high;
546 }
547
548 /*
549 * If tuple at offset located by binary search is a posting list whose
550 * TID range overlaps with caller's scantid, perform posting list
551 * binary search to set postingoff for caller. Caller must split the
552 * posting list when postingoff is set. This should happen
553 * infrequently.
554 */
555 if (unlikely(result == 0 && key->scantid != NULL))
556 {
557 /*
558 * postingoff should never be set more than once per leaf page
559 * binary search. That would mean that there are duplicate table
560 * TIDs in the index, which is never okay. Check for that here.
561 */
562 if (insertstate->postingoff != 0)
563 ereport(ERROR,
564 (errcode(ERRCODE_INDEX_CORRUPTED),
565 errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
566 ItemPointerGetBlockNumber(key->scantid),
567 ItemPointerGetOffsetNumber(key->scantid),
568 low, stricthigh,
569 BufferGetBlockNumber(insertstate->buf),
570 RelationGetRelationName(rel))));
571
572 insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
573 }
574 }
575
576 /*
577 * On a leaf page, a binary search always returns the first key >= scan
578 * key (at least in !nextkey case), which could be the last slot + 1. This
579 * is also the lower bound of cached search.
580 *
581 * stricthigh may also be the last slot + 1, which prevents caller from
582 * using bounds directly, but is still useful to us if we're called a
583 * second time with cached bounds (cached low will be < stricthigh when
584 * that happens).
585 */
586 insertstate->low = low;
587 insertstate->stricthigh = stricthigh;
588 insertstate->bounds_valid = true;
589
590 return low;
591}

References _bt_binsrch_posting(), _bt_compare(), Assert, BTInsertStateData::bounds_valid, BTPageGetOpaque, BTInsertStateData::buf, BufferGetBlockNumber(), BufferGetPage(), ereport, errcode(), errmsg_internal(), ERROR, InvalidOffsetNumber, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), BTInsertStateData::itup_key, key, BTInsertStateData::low, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber(), BTInsertStateData::postingoff, RelationGetRelationName, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_check_unique(), _bt_findinsertloc(), and bt_rootdescend().
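
The loop invariant documented in the body above can be studied outside of
PostgreSQL. A self-contained, illustrative equivalent over a plain sorted
array, returning the first index whose element is >= key (the "low" that the
real function returns):

static int
first_geq(const int *arr, int nelems, int key)
{
    int     low = 0;
    int     high = nelems;      /* one past the last slot */

    /* Invariant: arr[i] < key for i < low; arr[i] >= key for i >= high */
    while (high > low)
    {
        int     mid = low + (high - low) / 2;

        if (arr[mid] < key)
            low = mid + 1;
        else
            high = mid;
    }
    return low;                 /* == high; equals nelems if no element is >= key */
}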

◆ _bt_bottomupdel_pass()

bool _bt_bottomupdel_pass ( Relation  rel,
Buffer  buf,
Relation  heapRel,
Size  newitemsz 
)

Definition at line 307 of file nbtdedup.c.

309{
310 OffsetNumber offnum,
311 minoff,
312 maxoff;
313 Page page = BufferGetPage(buf);
314 BTPageOpaque opaque = BTPageGetOpaque(page);
315 BTDedupState state;
316 TM_IndexDeleteOp delstate;
317 bool neverdedup;
318 int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
319
320 /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
321 newitemsz += sizeof(ItemIdData);
322
323 /* Initialize deduplication state */
324 state = (BTDedupState) palloc(sizeof(BTDedupStateData));
325 state->deduplicate = true;
326 state->nmaxitems = 0;
327 state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
328 state->base = NULL;
329 state->baseoff = InvalidOffsetNumber;
330 state->basetupsize = 0;
331 state->htids = palloc(state->maxpostingsize);
332 state->nhtids = 0;
333 state->nitems = 0;
334 state->phystupsize = 0;
335 state->nintervals = 0;
336
337 /*
338 * Initialize tableam state that describes bottom-up index deletion
339 * operation.
340 *
341 * We'll go on to ask the tableam to search for TIDs whose index tuples we
342 * can safely delete. The tableam will search until our leaf page space
343 * target is satisfied, or until the cost of continuing with the tableam
344 * operation seems too high. It focuses its efforts on TIDs associated
345 * with duplicate index tuples that we mark "promising".
346 *
347 * This space target is a little arbitrary. The tableam must be able to
348 * keep the costs and benefits in balance. We provide the tableam with
349 * exhaustive information about what might work, without directly
350 * concerning ourselves with avoiding work during the tableam call. Our
351 * role in costing the bottom-up deletion process is strictly advisory.
352 */
353 delstate.irel = rel;
354 delstate.iblknum = BufferGetBlockNumber(buf);
355 delstate.bottomup = true;
356 delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
357 delstate.ndeltids = 0;
358 delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
359 delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
360
361 minoff = P_FIRSTDATAKEY(opaque);
362 maxoff = PageGetMaxOffsetNumber(page);
363 for (offnum = minoff;
364 offnum <= maxoff;
365 offnum = OffsetNumberNext(offnum))
366 {
367 ItemId itemid = PageGetItemId(page, offnum);
368 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
369
370 Assert(!ItemIdIsDead(itemid));
371
372 if (offnum == minoff)
373 {
374 /* itup starts first pending interval */
375 _bt_dedup_start_pending(state, itup, offnum);
376 }
377 else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
378 _bt_dedup_save_htid(state, itup))
379 {
380 /* Tuple is equal; just added its TIDs to pending interval */
381 }
382 else
383 {
384 /* Finalize interval -- move its TIDs to delete state */
385 _bt_bottomupdel_finish_pending(page, state, &delstate);
386
387 /* itup starts new pending interval */
388 _bt_dedup_start_pending(state, itup, offnum);
389 }
390 }
391 /* Finalize final interval -- move its TIDs to delete state */
392 _bt_bottomupdel_finish_pending(page, state, &delstate);
393
394 /*
395 * We don't give up now in the event of having few (or even zero)
396 * promising tuples for the tableam because it's not up to us as the index
397 * AM to manage costs (note that the tableam might have heuristics of its
398 * own that work out what to do). We should at least avoid having our
399 * caller do a useless deduplication pass after we return in the event of
400 * zero promising tuples, though.
401 */
402 neverdedup = false;
403 if (state->nintervals == 0)
404 neverdedup = true;
405
406 pfree(state->htids);
407 pfree(state);
408
409 /* Ask tableam which TIDs are deletable, then physically delete them */
410 _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
411
412 pfree(delstate.deltids);
413 pfree(delstate.status);
414
415 /* Report "success" to caller unconditionally to avoid deduplication */
416 if (neverdedup)
417 return true;
418
419 /* Don't dedup when we won't end up back here any time soon anyway */
420 return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
421}

References _bt_bottomupdel_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_delitems_delete_check(), _bt_keep_natts_fast(), Assert, TM_IndexDeleteOp::bottomup, TM_IndexDeleteOp::bottomupfreespace, BTPageGetOpaque, buf, BufferGetBlockNumber(), BufferGetPage(), TM_IndexDeleteOp::deltids, TM_IndexDeleteOp::iblknum, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, TM_IndexDeleteOp::irel, ItemIdIsDead, Max, MaxTIDsPerBTreePage, TM_IndexDeleteOp::ndeltids, OffsetNumberNext, P_FIRSTDATAKEY, PageGetExactFreeSpace(), PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), palloc(), pfree(), and TM_IndexDeleteOp::status.

Referenced by _bt_delete_or_dedup_one_page().

◆ _bt_check_natts()

bool _bt_check_natts ( Relation  rel,
bool  heapkeyspace,
Page  page,
OffsetNumber  offnum 
)

Definition at line 3079 of file nbtutils.c.

3080{
3081 int16 natts = IndexRelationGetNumberOfAttributes(rel);
3082 int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
3083 BTPageOpaque opaque = BTPageGetOpaque(page);
3084 IndexTuple itup;
3085 int tupnatts;
3086
3087 /*
3088 * We cannot reliably test a deleted or half-dead page, since they have
3089 * dummy high keys
3090 */
3091 if (P_IGNORE(opaque))
3092 return true;
3093
3094 Assert(offnum >= FirstOffsetNumber &&
3095 offnum <= PageGetMaxOffsetNumber(page));
3096
3097 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
3098 tupnatts = BTreeTupleGetNAtts(itup, rel);
3099
3100 /* !heapkeyspace indexes do not support deduplication */
3101 if (!heapkeyspace && BTreeTupleIsPosting(itup))
3102 return false;
3103
3104 /* Posting list tuples should never have "pivot heap TID" bit set */
3105 if (BTreeTupleIsPosting(itup) &&
3106 (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
3107 BT_PIVOT_HEAP_TID_ATTR) != 0)
3108 return false;
3109
3110 /* INCLUDE indexes do not support deduplication */
3111 if (natts != nkeyatts && BTreeTupleIsPosting(itup))
3112 return false;
3113
3114 if (P_ISLEAF(opaque))
3115 {
3116 if (offnum >= P_FIRSTDATAKEY(opaque))
3117 {
3118 /*
3119 * Non-pivot tuple should never be explicitly marked as a pivot
3120 * tuple
3121 */
3122 if (BTreeTupleIsPivot(itup))
3123 return false;
3124
3125 /*
3126 * Leaf tuples that are not the page high key (non-pivot tuples)
3127 * should never be truncated. (Note that tupnatts must have been
3128 * inferred, even with a posting list tuple, because only pivot
3129 * tuples store tupnatts directly.)
3130 */
3131 return tupnatts == natts;
3132 }
3133 else
3134 {
3135 /*
3136 * Rightmost page doesn't contain a page high key, so tuple was
3137 * checked above as ordinary leaf tuple
3138 */
3139 Assert(!P_RIGHTMOST(opaque));
3140
3141 /*
3142 * !heapkeyspace high key tuple contains only key attributes. Note
3143 * that tupnatts will only have been explicitly represented in
3144 * !heapkeyspace indexes that happen to have non-key attributes.
3145 */
3146 if (!heapkeyspace)
3147 return tupnatts == nkeyatts;
3148
3149 /* Use generic heapkeyspace pivot tuple handling */
3150 }
3151 }
3152 else /* !P_ISLEAF(opaque) */
3153 {
3154 if (offnum == P_FIRSTDATAKEY(opaque))
3155 {
3156 /*
3157 * The first tuple on any internal page (possibly the first after
3158 * its high key) is its negative infinity tuple. Negative
3159 * infinity tuples are always truncated to zero attributes. They
3160 * are a particular kind of pivot tuple.
3161 */
3162 if (heapkeyspace)
3163 return tupnatts == 0;
3164
3165 /*
3166 * The number of attributes won't be explicitly represented if the
3167 * negative infinity tuple was generated during a page split that
3168 * occurred with a version of Postgres before v11. There must be
3169 * a problem when there is an explicit representation that is
3170 * non-zero, or when there is no explicit representation and the
3171 * tuple is evidently not a pre-pg_upgrade tuple.
3172 *
3173 * Prior to v11, downlinks always had P_HIKEY as their offset.
3174 * Accept that as an alternative indication of a valid
3175 * !heapkeyspace negative infinity tuple.
3176 */
3177 return tupnatts == 0 ||
3178 ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY;
3179 }
3180 else
3181 {
3182 /*
3183 * !heapkeyspace downlink tuple with separator key contains only
3184 * key attributes. Note that tupnatts will only have been
3185 * explicitly represented in !heapkeyspace indexes that happen to
3186 * have non-key attributes.
3187 */
3188 if (!heapkeyspace)
3189 return tupnatts == nkeyatts;
3190
3191 /* Use generic heapkeyspace pivot tuple handling */
3192 }
3193 }
3194
3195 /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
3196 Assert(heapkeyspace);
3197
3198 /*
3199 * Explicit representation of the number of attributes is mandatory with
3200 * heapkeyspace index pivot tuples, regardless of whether or not there are
3201 * non-key attributes.
3202 */
3203 if (!BTreeTupleIsPivot(itup))
3204 return false;
3205
3206 /* Pivot tuple should not use posting list representation (redundant) */
3207 if (BTreeTupleIsPosting(itup))
3208 return false;
3209
3210 /*
3211 * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
3212 * when any other key attribute is truncated
3213 */
3214 if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
3215 return false;
3216
3217 /*
3218 * Pivot tuple must have at least one untruncated key attribute (minus
3219 * infinity pivot tuples are the only exception). Pivot tuples can never
3220 * represent that there is a value present for a key attribute that
3221 * exceeds pg_index.indnkeyatts for the index.
3222 */
3223 return tupnatts > 0 && tupnatts <= nkeyatts;
3224}

References Assert, BT_PIVOT_HEAP_TID_ATTR, BTPageGetOpaque, BTreeTupleGetHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPivot(), BTreeTupleIsPosting(), FirstOffsetNumber, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, ItemPointerGetOffsetNumber(), ItemPointerGetOffsetNumberNoCheck(), P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_ISLEAF, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and IndexTupleData::t_tid.

Referenced by _bt_compare(), and bt_target_page_check().
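
The rules above for heapkeyspace pivot tuples condense to a short predicate. A minimal sketch (the helper name heapkeyspace_pivot_ok is hypothetical, not part of nbtutils.c), assuming tupnatts and nkeyatts were derived as in the listing:

    static bool
    heapkeyspace_pivot_ok(IndexTuple itup, int tupnatts, int nkeyatts)
    {
        /* explicit representation of the attribute count is mandatory */
        if (!BTreeTupleIsPivot(itup))
            return false;
        /* pivot tuples never use the posting list representation */
        if (BTreeTupleIsPosting(itup))
            return false;
        /* heap TID may only be untruncated when no key attribute is */
        if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
            return false;
        /* at least one untruncated key attribute, never more than indnkeyatts */
        return tupnatts > 0 && tupnatts <= nkeyatts;
    }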

◆ _bt_check_third_page()

void _bt_check_third_page ( Relation  rel,
Relation  heap,
bool  needheaptidspace,
Page  page,
IndexTuple  newtup 
)

Definition at line 3239 of file nbtutils.c.

3241{
3242 Size itemsz;
3243 BTPageOpaque opaque;
3244
3245 itemsz = MAXALIGN(IndexTupleSize(newtup));
3246
3247 /* Double check item size against limit */
3248 if (itemsz <= BTMaxItemSize(page))
3249 return;
3250
3251 /*
3252 * Tuple is probably too large to fit on page, but it's possible that the
3253 * index uses version 2 or version 3, or that page is an internal page, in
3254 * which case a slightly higher limit applies.
3255 */
3256 if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page))
3257 return;
3258
3259 /*
3260 * Internal page insertions cannot fail here, because that would mean that
3261 * an earlier leaf level insertion that should have failed didn't
3262 */
3263 opaque = BTPageGetOpaque(page);
3264 if (!P_ISLEAF(opaque))
3265 elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
3266 itemsz, RelationGetRelationName(rel));
3267
3268 ereport(ERROR,
3269 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
3270 errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
3271 itemsz,
3272 needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
 3273 needheaptidspace ? BTMaxItemSize(page) :
 3274 BTMaxItemSizeNoHeapTid(page),
 3275 RelationGetRelationName(rel)),
 3276 errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
 3277 ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)),
 3278 ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)),
 3279 RelationGetRelationName(heap)),
 3280 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
 3281 "Consider a function index of an MD5 hash of the value, "
 3282 "or use full text indexing."),
 3283 errtableconstraint(heap, RelationGetRelationName(rel))));
 3284}
size_t Size
Definition: c.h:562
int errdetail(const char *fmt,...)
Definition: elog.c:1203
int errhint(const char *fmt,...)
Definition: elog.c:1317
int errmsg(const char *fmt,...)
Definition: elog.c:1070
static Size IndexTupleSize(const IndexTupleData *itup)
Definition: itup.h:71
#define BTMaxItemSizeNoHeapTid(page)
Definition: nbtree.h:169
#define BTREE_VERSION
Definition: nbtree.h:150
#define BTMaxItemSize(page)
Definition: nbtree.h:164
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:152
int errtableconstraint(Relation rel, const char *conname)
Definition: relcache.c:6023

References BTMaxItemSize, BTMaxItemSizeNoHeapTid, BTPageGetOpaque, BTREE_NOVAC_VERSION, BTREE_VERSION, BTreeTupleGetHeapTID(), elog, ereport, errcode(), errdetail(), errhint(), errmsg(), ERROR, errtableconstraint(), IndexTupleSize(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), MAXALIGN, P_ISLEAF, and RelationGetRelationName.

Referenced by _bt_buildadd(), and _bt_findinsertloc().
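
The tiered limit logic reduces to two comparisons. A minimal sketch (the helper fits_on_page is hypothetical), assuming itemsz is already MAXALIGN()'d as in the listing above:

    static bool
    fits_on_page(Page page, Size itemsz, bool needheaptidspace)
    {
        /* strict limit: version 4 leaf pages reserve heap TID space */
        if (itemsz <= BTMaxItemSize(page))
            return true;
        /* looser limit: version 2/3 indexes, and internal pages */
        if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page))
            return true;
        return false;           /* caller raises the errors shown above */
    }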

◆ _bt_checkkeys()

bool _bt_checkkeys ( IndexScanDesc  scan,
BTReadPageState pstate,
bool  arrayKeys,
IndexTuple  tuple,
int  tupnatts 
)

Definition at line 1627 of file nbtutils.c.

1629{
1630 TupleDesc tupdesc = RelationGetDescr(scan->indexRelation);
1631 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1632 ScanDirection dir = so->currPos.dir;
1633 int ikey = 0;
1634 bool res;
1635
1636 Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
1637
1638 res = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc,
1639 arrayKeys, pstate->prechecked, pstate->firstmatch,
1640 &pstate->continuescan, &ikey);
1641
1642#ifdef USE_ASSERT_CHECKING
1643 if (!arrayKeys && so->numArrayKeys)
1644 {
1645 /*
1646 * This is a continuescan precheck call for a scan with array keys.
1647 *
1648 * Assert that the scan isn't in danger of becoming confused.
1649 */
1650 Assert(!so->scanBehind && !so->oppositeDirCheck);
1651 Assert(!pstate->prechecked && !pstate->firstmatch);
1652 Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc,
1653 tupnatts, false, 0, NULL));
1654 }
1655 if (pstate->prechecked || pstate->firstmatch)
1656 {
1657 bool dcontinuescan;
1658 int dikey = 0;
1659
1660 /*
1661 * Call relied on continuescan/firstmatch prechecks -- assert that we
1662 * get the same answer without those optimizations
1663 */
1664 Assert(res == _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc,
1665 false, false, false,
1666 &dcontinuescan, &dikey));
1667 Assert(pstate->continuescan == dcontinuescan);
1668 }
1669#endif
1670
1671 /*
1672 * Only one _bt_check_compare call is required in the common case where
1673 * there are no equality strategy array scan keys. Otherwise we can only
1674 * accept _bt_check_compare's answer unreservedly when it didn't set
1675 * pstate.continuescan=false.
1676 */
1677 if (!arrayKeys || pstate->continuescan)
1678 return res;
1679
1680 /*
1681 * _bt_check_compare call set continuescan=false in the presence of
1682 * equality type array keys. This could mean that the tuple is just past
1683 * the end of matches for the current array keys.
1684 *
1685 * It's also possible that the scan is still _before_ the _start_ of
1686 * tuples matching the current set of array keys. Check for that first.
1687 */
1688 if (_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, true,
1689 ikey, NULL))
1690 {
1691 /*
1692 * Tuple is still before the start of matches according to the scan's
1693 * required array keys (according to _all_ of its required equality
1694 * strategy keys, actually).
1695 *
1696 * _bt_advance_array_keys occasionally sets so->scanBehind to signal
1697 * that the scan's current position/tuples might be significantly
1698 * behind (multiple pages behind) its current array keys. When this
1699 * happens, we need to be prepared to recover by starting a new
1700 * primitive index scan here, on our own.
1701 */
 1702 Assert(!so->scanBehind ||
 1703 so->keyData[0].sk_strategy == BTEqualStrategyNumber);
 1704 if (unlikely(so->scanBehind) && pstate->finaltup &&
 1705 _bt_tuple_before_array_skeys(scan, dir, pstate->finaltup, tupdesc,
 1706 BTreeTupleGetNAtts(pstate->finaltup,
 1707 scan->indexRelation),
 1708 false, 0, NULL))
1709 {
1710 /* Cut our losses -- start a new primitive index scan now */
1711 pstate->continuescan = false;
1712 so->needPrimScan = true;
1713 }
1714 else
1715 {
1716 /* Override _bt_check_compare, continue primitive scan */
1717 pstate->continuescan = true;
1718
1719 /*
1720 * We will end up here repeatedly given a group of tuples > the
1721 * previous array keys and < the now-current keys (for a backwards
1722 * scan it's just the same, though the operators swap positions).
1723 *
1724 * We must avoid allowing this linear search process to scan very
1725 * many tuples from well before the start of tuples matching the
1726 * current array keys (or from well before the point where we'll
1727 * once again have to advance the scan's array keys).
1728 *
1729 * We keep the overhead under control by speculatively "looking
1730 * ahead" to later still-unscanned items from this same leaf page.
1731 * We'll only attempt this once the number of tuples that the
1732 * linear search process has examined starts to get out of hand.
1733 */
 1734 pstate->rechecks++;
 1735 if (pstate->rechecks >= LOOK_AHEAD_REQUIRED_RECHECKS)
 1736 {
1737 /* See if we should skip ahead within the current leaf page */
1738 _bt_checkkeys_look_ahead(scan, pstate, tupnatts, tupdesc);
1739
1740 /*
1741 * Might have set pstate.skip to a later page offset. When
1742 * that happens then _bt_readpage caller will inexpensively
1743 * skip ahead to a later tuple from the same page (the one
1744 * just after the tuple we successfully "looked ahead" to).
1745 */
1746 }
1747 }
1748
1749 /* This indextuple doesn't match the current qual, in any case */
1750 return false;
1751 }
1752
1753 /*
1754 * Caller's tuple is >= the current set of array keys and other equality
1755 * constraint scan keys (or <= if this is a backwards scan). It's now
1756 * clear that we _must_ advance any required array keys in lockstep with
1757 * the scan.
1758 */
1759 return _bt_advance_array_keys(scan, pstate, tuple, tupnatts, tupdesc,
1760 ikey, true);
1761}
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:1078
static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, bool advancenonrequired, bool prechecked, bool firstmatch, bool *continuescan, int *ikey)
Definition: nbtutils.c:1843
static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, int sktrig, bool sktrig_required)
Definition: nbtutils.c:863
#define LOOK_AHEAD_REQUIRED_RECHECKS
Definition: nbtutils.c:27
static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, TupleDesc tupdesc, int tupnatts, bool readpagetup, int sktrig, bool *scanBehind)
Definition: nbtutils.c:619
static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, int tupnatts, TupleDesc tupdesc)
Definition: nbtutils.c:2233
#define RelationGetDescr(relation)
Definition: rel.h:531
ScanDirection
Definition: sdir.h:25
bool firstmatch
Definition: nbtree.h:1103
bool continuescan
Definition: nbtree.h:1096
IndexTuple finaltup
Definition: nbtree.h:1088
bool prechecked
Definition: nbtree.h:1102
int16 rechecks
Definition: nbtree.h:1109
bool needPrimScan
Definition: nbtree.h:1045
BTScanPosData currPos
Definition: nbtree.h:1074
bool oppositeDirCheck
Definition: nbtree.h:1047
ScanKey keyData
Definition: nbtree.h:1041
ScanDirection dir
Definition: nbtree.h:967
Relation indexRelation
Definition: relscan.h:135
StrategyNumber sk_strategy
Definition: skey.h:68

References _bt_advance_array_keys(), _bt_check_compare(), _bt_checkkeys_look_ahead(), _bt_tuple_before_array_skeys(), Assert, BTEqualStrategyNumber, BTreeTupleGetNAtts, BTReadPageState::continuescan, BTScanOpaqueData::currPos, BTScanPosData::dir, BTReadPageState::finaltup, BTReadPageState::firstmatch, IndexScanDescData::indexRelation, BTScanOpaqueData::keyData, LOOK_AHEAD_REQUIRED_RECHECKS, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, BTReadPageState::prechecked, BTReadPageState::rechecks, RelationGetDescr, res, BTScanOpaqueData::scanBehind, ScanKeyData::sk_strategy, and unlikely.

Referenced by _bt_readpage().
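
A rough sketch of the caller's side of this contract, loosely modeled on _bt_readpage (offnum bounds, pstate setup, and the save_match helper are hypothetical simplifications; the real caller also honors pstate.skip for look-ahead):

    for (offnum = minoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
    {
        IndexTuple  itup = (IndexTuple) PageGetItem(page,
                                                    PageGetItemId(page, offnum));
        int         tupnatts = BTreeTupleGetNAtts(itup, scan->indexRelation);

        if (_bt_checkkeys(scan, &pstate, arrayKeys, itup, tupnatts))
            save_match(itup);       /* tuple satisfies the scan's quals */

        if (!pstate.continuescan)
            break;                  /* no further matches in this primitive scan */
    }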

◆ _bt_checkpage()

void _bt_checkpage ( Relation  rel,
Buffer  buf 
)

Definition at line 797 of file nbtpage.c.

798{
799 Page page = BufferGetPage(buf);
800
801 /*
802 * ReadBuffer verifies that every newly-read page passes
803 * PageHeaderIsValid, which means it either contains a reasonably sane
804 * page header or is all-zero. We have to defend against the all-zero
805 * case, however.
806 */
 807 if (PageIsNew(page))
 808 ereport(ERROR,
 809 (errcode(ERRCODE_INDEX_CORRUPTED),
 810 errmsg("index \"%s\" contains unexpected zero page at block %u",
 811 RelationGetRelationName(rel),
 812 BufferGetBlockNumber(buf)),
 813 errhint("Please REINDEX it.")));
814
815 /*
816 * Additionally check that the special area looks sane.
817 */
 818 if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
 819 ereport(ERROR,
 820 (errcode(ERRCODE_INDEX_CORRUPTED),
 821 errmsg("index \"%s\" contains corrupted page at block %u",
 822 RelationGetRelationName(rel),
 823 BufferGetBlockNumber(buf)),
 824 errhint("Please REINDEX it.")));
825}
static uint16 PageGetSpecialSize(const PageData *page)
Definition: bufpage.h:317

References buf, BufferGetBlockNumber(), BufferGetPage(), ereport, errcode(), errhint(), errmsg(), ERROR, MAXALIGN, PageGetSpecialSize(), PageIsNew(), and RelationGetRelationName.

Referenced by _bt_getbuf(), _bt_relandgetbuf(), _bt_search_insert(), bt_recheck_sibling_links(), btvacuumpage(), and palloc_btree_page().
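
Typical usage pairs a buffer read with this check before the page is trusted, as in _bt_getbuf(). A minimal sketch (read_and_verify is a hypothetical wrapper):

    static Buffer
    read_and_verify(Relation rel, BlockNumber blkno)
    {
        Buffer      buf = ReadBuffer(rel, blkno);

        _bt_lockbuf(rel, buf, BT_READ);
        _bt_checkpage(rel, buf);    /* ereports on zeroed or corrupt pages */
        return buf;
    }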

◆ _bt_compare()

int32 _bt_compare ( Relation  rel,
BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)

Definition at line 688 of file nbtsearch.c.

692{
693 TupleDesc itupdesc = RelationGetDescr(rel);
694 BTPageOpaque opaque = BTPageGetOpaque(page);
695 IndexTuple itup;
696 ItemPointer heapTid;
697 ScanKey scankey;
698 int ncmpkey;
699 int ntupatts;
700 int32 result;
701
 702 Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
 703 Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
 704 Assert(key->heapkeyspace || key->scantid == NULL);
705
706 /*
707 * Force result ">" if target item is first data item on an internal page
708 * --- see NOTE above.
709 */
710 if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
711 return 1;
712
713 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
714 ntupatts = BTreeTupleGetNAtts(itup, rel);
715
716 /*
717 * The scan key is set up with the attribute number associated with each
718 * term in the key. It is important that, if the index is multi-key, the
719 * scan contain the first k key attributes, and that they be in order. If
720 * you think about how multi-key ordering works, you'll understand why
721 * this is.
722 *
723 * We don't test for violation of this condition here, however. The
724 * initial setup for the index scan had better have gotten it right (see
725 * _bt_first).
726 */
727
728 ncmpkey = Min(ntupatts, key->keysz);
729 Assert(key->heapkeyspace || ncmpkey == key->keysz);
730 Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
731 scankey = key->scankeys;
732 for (int i = 1; i <= ncmpkey; i++)
733 {
734 Datum datum;
735 bool isNull;
736
737 datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
738
739 if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
740 {
741 if (isNull)
742 result = 0; /* NULL "=" NULL */
743 else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
744 result = -1; /* NULL "<" NOT_NULL */
745 else
746 result = 1; /* NULL ">" NOT_NULL */
747 }
748 else if (isNull) /* key is NOT_NULL and item is NULL */
749 {
750 if (scankey->sk_flags & SK_BT_NULLS_FIRST)
751 result = 1; /* NOT_NULL ">" NULL */
752 else
753 result = -1; /* NOT_NULL "<" NULL */
754 }
755 else
756 {
757 /*
758 * The sk_func needs to be passed the index value as left arg and
759 * the sk_argument as right arg (they might be of different
760 * types). Since it is convenient for callers to think of
761 * _bt_compare as comparing the scankey to the index item, we have
762 * to flip the sign of the comparison result. (Unless it's a DESC
763 * column, in which case we *don't* flip the sign.)
764 */
765 result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
766 scankey->sk_collation,
767 datum,
768 scankey->sk_argument));
769
770 if (!(scankey->sk_flags & SK_BT_DESC))
771 INVERT_COMPARE_RESULT(result);
772 }
773
774 /* if the keys are unequal, return the difference */
775 if (result != 0)
776 return result;
777
778 scankey++;
779 }
780
781 /*
782 * All non-truncated attributes (other than heap TID) were found to be
783 * equal. Treat truncated attributes as minus infinity when scankey has a
784 * key attribute value that would otherwise be compared directly.
785 *
786 * Note: it doesn't matter if ntupatts includes non-key attributes;
787 * scankey won't, so explicitly excluding non-key attributes isn't
788 * necessary.
789 */
790 if (key->keysz > ntupatts)
791 return 1;
792
793 /*
794 * Use the heap TID attribute and scantid to try to break the tie. The
795 * rules are the same as any other key attribute -- only the
796 * representation differs.
797 */
798 heapTid = BTreeTupleGetHeapTID(itup);
799 if (key->scantid == NULL)
800 {
801 /*
802 * Forward scans have a scankey that is considered greater than a
803 * truncated pivot tuple if and when the scankey has equal values for
804 * attributes up to and including the least significant untruncated
805 * attribute in tuple. Even attributes that were omitted from the
806 * scan key are considered greater than -inf truncated attributes.
807 * (See _bt_binsrch for an explanation of our backward scan behavior.)
808 *
809 * For example, if an index has the minimum two attributes (single
810 * user key attribute, plus heap TID attribute), and a page's high key
811 * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
812 * will not descend to the page to the left. The search will descend
813 * right instead. The truncated attribute in pivot tuple means that
814 * all non-pivot tuples on the page to the left are strictly < 'foo',
815 * so it isn't necessary to descend left. In other words, search
816 * doesn't have to descend left because it isn't interested in a match
817 * that has a heap TID value of -inf.
818 *
819 * Note: the heap TID part of the test ensures that scankey is being
820 * compared to a pivot tuple with one or more truncated -inf key
821 * attributes. The heap TID attribute is the last key attribute in
822 * every index, of course, but other than that it isn't special.
823 */
824 if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
825 key->heapkeyspace)
826 return 1;
827
828 /* All provided scankey arguments found to be equal */
829 return 0;
830 }
831
832 /*
833 * Treat truncated heap TID as minus infinity, since scankey has a key
834 * attribute value (scantid) that would otherwise be compared directly
835 */
 836 Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
 837 if (heapTid == NULL)
838 return 1;
839
840 /*
841 * Scankey must be treated as equal to a posting list tuple if its scantid
842 * value falls within the range of the posting list. In all other cases
843 * there can only be a single heap TID value, which is compared directly
844 * with scantid.
845 */
 846 Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
 847 result = ItemPointerCompare(key->scantid, heapTid);
848 if (result <= 0 || !BTreeTupleIsPosting(itup))
849 return result;
850 else
851 {
 852 result = ItemPointerCompare(key->scantid,
 853 BTreeTupleGetMaxHeapTID(itup));
 854 if (result > 0)
855 return 1;
856 }
857
858 return 0;
859}
#define Min(x, y)
Definition: c.h:961
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1063
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1149
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:51
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: itup.h:131
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:1123
#define SK_BT_DESC
Definition: nbtree.h:1122
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:664
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:3079
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:207
#define SK_ISNULL
Definition: skey.h:115
int sk_flags
Definition: skey.h:66
Datum sk_argument
Definition: skey.h:72
FmgrInfo sk_func
Definition: skey.h:71
Oid sk_collation
Definition: skey.h:70
AttrNumber sk_attno
Definition: skey.h:67

References _bt_check_natts(), Assert, BTPageGetOpaque, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPosting(), DatumGetInt32(), FunctionCall2Coll(), i, index_getattr(), IndexRelationGetNumberOfKeyAttributes, INVERT_COMPARE_RESULT, ItemPointerCompare(), sort-test::key, Min, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem(), PageGetItemId(), RelationGetDescr, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_binsrch_insert(), _bt_check_unique(), _bt_findinsertloc(), _bt_moveright(), _bt_search_insert(), bt_rootdescend(), bt_target_page_check(), invariant_g_offset(), invariant_l_nontarget_offset(), invariant_l_offset(), and invariant_leq_offset().
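
Because the result is a three-way comparison, callers can drive a standard lower-bound binary search with it. A condensed sketch in the spirit of _bt_binsrch (simplified; the real function also caches search bounds and treats leaf and internal pages differently):

    static OffsetNumber
    first_offset_geq(Relation rel, BTScanInsert key, Page page)
    {
        BTPageOpaque opaque = BTPageGetOpaque(page);
        OffsetNumber low = P_FIRSTDATAKEY(opaque);
        OffsetNumber high = OffsetNumberNext(PageGetMaxOffsetNumber(page));

        while (low < high)
        {
            OffsetNumber mid = low + ((high - low) / 2);

            if (_bt_compare(rel, key, page, mid) > 0)
                low = OffsetNumberNext(mid);    /* scankey > item: go right */
            else
                high = mid;                     /* scankey <= item: stay left */
        }
        /* first offset whose item is >= scankey, or maxoff+1 if none */
        return low;
    }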

◆ _bt_conditionallockbuf()

bool _bt_conditionallockbuf ( Relation  rel,
Buffer  buf 
)

Definition at line 1093 of file nbtpage.c.

1094{
 1095 /* ConditionalLockBuffer() asserts that pin is held by this backend */
 1096 if (!ConditionalLockBuffer(buf))
 1097 return false;
 1098
 1099 if (!RelationUsesLocalBuffers(rel))
 1100 VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
 1101
1102 return true;
1103}
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:5126

References buf, BufferGetPage(), ConditionalLockBuffer(), RelationUsesLocalBuffers, and VALGRIND_MAKE_MEM_DEFINED.

Referenced by _bt_allocbuf(), and _bt_search_insert().
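
Usage sketch, following the shape of _bt_search_insert()'s rightmost-page fastpath: take the lock opportunistically, and fall back to a full descent from the root when there is contention (condensed; validity checks elided):

    if (_bt_conditionallockbuf(rel, insertstate->buf))
    {
        /* got the exclusive lock without blocking -- try the cached page */
    }
    else
    {
        /* contention: give back what we held (just a pin), then start over */
        ReleaseBuffer(insertstate->buf);
        insertstate->buf = InvalidBuffer;
    }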

◆ _bt_dedup_finish_pending()

Size _bt_dedup_finish_pending ( Page  newpage,
BTDedupState  state 
)

Definition at line 555 of file nbtdedup.c.

556{
557 OffsetNumber tupoff;
558 Size tuplesz;
559 Size spacesaving;
560
561 Assert(state->nitems > 0);
562 Assert(state->nitems <= state->nhtids);
563 Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
564
565 tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
566 if (state->nitems == 1)
567 {
568 /* Use original, unchanged base tuple */
569 tuplesz = IndexTupleSize(state->base);
570 Assert(tuplesz == MAXALIGN(IndexTupleSize(state->base)));
571 Assert(tuplesz <= BTMaxItemSize(newpage));
572 if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
573 false, false) == InvalidOffsetNumber)
574 elog(ERROR, "deduplication failed to add tuple to page");
575
576 spacesaving = 0;
577 }
578 else
579 {
580 IndexTuple final;
581
582 /* Form a tuple with a posting list */
583 final = _bt_form_posting(state->base, state->htids, state->nhtids);
584 tuplesz = IndexTupleSize(final);
585 Assert(tuplesz <= state->maxpostingsize);
586
587 /* Save final number of items for posting list */
588 state->intervals[state->nintervals].nitems = state->nitems;
589
590 Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
591 Assert(tuplesz <= BTMaxItemSize(newpage));
592 if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
593 false) == InvalidOffsetNumber)
594 elog(ERROR, "deduplication failed to add tuple to page");
595
596 pfree(final);
597 spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
598 /* Increment nintervals, since we wrote a new posting list tuple */
599 state->nintervals++;
600 Assert(spacesaving > 0 && spacesaving < BLCKSZ);
601 }
602
603 /* Reset state for next pending posting list */
604 state->nhtids = 0;
605 state->nitems = 0;
606 state->phystupsize = 0;
607
608 return spacesaving;
609}
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:471
Pointer Item
Definition: item.h:17
IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
Definition: nbtdedup.c:864

References _bt_form_posting(), Assert, BTMaxItemSize, elog, ERROR, IndexTupleSize(), InvalidOffsetNumber, MAXALIGN, OffsetNumberNext, PageAddItem, PageGetMaxOffsetNumber(), and pfree().

Referenced by _bt_dedup_pass(), and btree_xlog_dedup().

◆ _bt_dedup_pass()

void _bt_dedup_pass ( Relation  rel,
Buffer  buf,
IndexTuple  newitem,
Size  newitemsz,
bool  bottomupdedup 
)

Definition at line 58 of file nbtdedup.c.

60{
61 OffsetNumber offnum,
62 minoff,
63 maxoff;
64 Page page = BufferGetPage(buf);
65 BTPageOpaque opaque = BTPageGetOpaque(page);
66 Page newpage;
 67 BTDedupState state;
 68 Size pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
69 bool singlevalstrat = false;
70 int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
71
72 /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
73 newitemsz += sizeof(ItemIdData);
74
75 /*
76 * Initialize deduplication state.
77 *
78 * It would be possible for maxpostingsize (limit on posting list tuple
79 * size) to be set to one third of the page. However, it seems like a
80 * good idea to limit the size of posting lists to one sixth of a page.
81 * That ought to leave us with a good split point when pages full of
82 * duplicates can be split several times.
83 */
 84 state = (BTDedupState) palloc(sizeof(BTDedupStateData));
 85 state->deduplicate = true;
86 state->nmaxitems = 0;
87 state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
88 /* Metadata about base tuple of current pending posting list */
89 state->base = NULL;
90 state->baseoff = InvalidOffsetNumber;
91 state->basetupsize = 0;
92 /* Metadata about current pending posting list TIDs */
93 state->htids = palloc(state->maxpostingsize);
94 state->nhtids = 0;
95 state->nitems = 0;
96 /* Size of all physical tuples to be replaced by pending posting list */
97 state->phystupsize = 0;
98 /* nintervals should be initialized to zero */
99 state->nintervals = 0;
100
101 minoff = P_FIRSTDATAKEY(opaque);
102 maxoff = PageGetMaxOffsetNumber(page);
103
104 /*
105 * Consider applying "single value" strategy, though only if the page
106 * seems likely to be split in the near future
107 */
108 if (!bottomupdedup)
109 singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
110
111 /*
112 * Deduplicate items from page, and write them to newpage.
113 *
114 * Copy the original page's LSN into newpage copy. This will become the
115 * updated version of the page. We need this because XLogInsert will
116 * examine the LSN and possibly dump it in a page image.
117 */
118 newpage = PageGetTempPageCopySpecial(page);
119 PageSetLSN(newpage, PageGetLSN(page));
120
121 /* Copy high key, if any */
122 if (!P_RIGHTMOST(opaque))
123 {
124 ItemId hitemid = PageGetItemId(page, P_HIKEY);
125 Size hitemsz = ItemIdGetLength(hitemid);
126 IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid);
127
128 if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
129 false, false) == InvalidOffsetNumber)
130 elog(ERROR, "deduplication failed to add highkey");
131 }
132
133 for (offnum = minoff;
134 offnum <= maxoff;
135 offnum = OffsetNumberNext(offnum))
136 {
137 ItemId itemid = PageGetItemId(page, offnum);
138 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
139
140 Assert(!ItemIdIsDead(itemid));
141
142 if (offnum == minoff)
143 {
144 /*
145 * No previous/base tuple for the data item -- use the data item
146 * as base tuple of pending posting list
147 */
148 _bt_dedup_start_pending(state, itup, offnum);
149 }
150 else if (state->deduplicate &&
 151 _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
 152 _bt_dedup_save_htid(state, itup))
 153 {
154 /*
155 * Tuple is equal to base tuple of pending posting list. Heap
156 * TID(s) for itup have been saved in state.
157 */
158 }
159 else
160 {
161 /*
162 * Tuple is not equal to pending posting list tuple, or
163 * _bt_dedup_save_htid() opted to not merge current item into
164 * pending posting list for some other reason (e.g., adding more
165 * TIDs would have caused posting list to exceed current
166 * maxpostingsize).
167 *
168 * If state contains pending posting list with more than one item,
169 * form new posting tuple and add it to our temp page (newpage).
170 * Else add pending interval's base tuple to the temp page as-is.
171 */
172 pagesaving += _bt_dedup_finish_pending(newpage, state);
173
174 if (singlevalstrat)
175 {
176 /*
177 * Single value strategy's extra steps.
178 *
179 * Lower maxpostingsize for sixth and final large posting list
180 * tuple at the point where 5 maxpostingsize-capped tuples
181 * have either been formed or observed.
182 *
183 * When a sixth maxpostingsize-capped item is formed/observed,
184 * stop merging together tuples altogether. The few tuples
185 * that remain at the end of the page won't be merged together
186 * at all (at least not until after a future page split takes
187 * place, when this page's newly allocated right sibling page
188 * gets its first deduplication pass).
189 */
190 if (state->nmaxitems == 5)
191 _bt_singleval_fillfactor(page, state, newitemsz);
192 else if (state->nmaxitems == 6)
193 {
194 state->deduplicate = false;
195 singlevalstrat = false; /* won't be back here */
196 }
197 }
198
199 /* itup starts new pending posting list */
200 _bt_dedup_start_pending(state, itup, offnum);
201 }
202 }
203
204 /* Handle the last item */
205 pagesaving += _bt_dedup_finish_pending(newpage, state);
206
207 /*
208 * If no items suitable for deduplication were found, newpage must be
209 * exactly the same as the original page, so just return from function.
210 *
211 * We could determine whether or not to proceed on the basis the space
212 * savings being sufficient to avoid an immediate page split instead. We
213 * don't do that because there is some small value in nbtsplitloc.c always
214 * operating against a page that is fully deduplicated (apart from
215 * newitem). Besides, most of the cost has already been paid.
216 */
217 if (state->nintervals == 0)
218 {
219 /* cannot leak memory here */
220 pfree(newpage);
221 pfree(state->htids);
222 pfree(state);
223 return;
224 }
225
226 /*
227 * By here, it's clear that deduplication will definitely go ahead.
228 *
229 * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace
230 * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
231 * But keep things tidy.
232 */
233 if (P_HAS_GARBAGE(opaque))
234 {
235 BTPageOpaque nopaque = BTPageGetOpaque(newpage);
236
237 nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
238 }
239
 240 START_CRIT_SECTION();
 241
242 PageRestoreTempPage(newpage, page);
 243 MarkBufferDirty(buf);
 244
245 /* XLOG stuff */
246 if (RelationNeedsWAL(rel))
247 {
248 XLogRecPtr recptr;
249 xl_btree_dedup xlrec_dedup;
250
251 xlrec_dedup.nintervals = state->nintervals;
252
 253 XLogBeginInsert();
 254 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
 255 XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);
256
257 /*
258 * The intervals array is not in the buffer, but pretend that it is.
259 * When XLogInsert stores the whole buffer, the array need not be
260 * stored too.
261 */
262 XLogRegisterBufData(0, (char *) state->intervals,
263 state->nintervals * sizeof(BTDedupInterval));
264
265 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
266
267 PageSetLSN(page, recptr);
268 }
269
 270 END_CRIT_SECTION();
 271
272 /* Local space accounting should agree with page accounting */
273 Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
274
275 /* cannot leak memory here */
276 pfree(state->htids);
277 pfree(state);
278}
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2532
void PageRestoreTempPage(Page tempPage, Page oldPage)
Definition: bufpage.c:413
Page PageGetTempPageCopySpecial(const PageData *page)
Definition: bufpage.c:391
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:391
static XLogRecPtr PageGetLSN(const PageData *page)
Definition: bufpage.h:386
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:204
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
#define INDEX_SIZE_MASK
Definition: itup.h:65
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state, OffsetNumber minoff, IndexTuple newitem)
Definition: nbtdedup.c:782
Size _bt_dedup_finish_pending(Page newpage, BTDedupState state)
Definition: nbtdedup.c:555
static void _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
Definition: nbtdedup.c:822
#define P_HAS_GARBAGE(opaque)
Definition: nbtree.h:226
#define XLOG_BTREE_DEDUP
Definition: nbtxlog.h:33
#define SizeOfBtreeDedup
Definition: nbtxlog.h:174
uint16 btpo_flags
Definition: nbtree.h:67
uint16 nintervals
Definition: nbtxlog.h:169
uint64 XLogRecPtr
Definition: xlogdefs.h:21
void XLogRegisterBufData(uint8 block_id, const char *data, uint32 len)
Definition: xloginsert.c:405
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:242
#define REGBUF_STANDARD
Definition: xloginsert.h:35

References _bt_dedup_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_do_singleval(), _bt_keep_natts_fast(), _bt_singleval_fillfactor(), Assert, BTMaxItemSize, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, buf, BufferGetPage(), elog, END_CRIT_SECTION, ERROR, INDEX_SIZE_MASK, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, ItemIdGetLength, ItemIdIsDead, MarkBufferDirty(), Min, xl_btree_dedup::nintervals, OffsetNumberNext, P_FIRSTDATAKEY, P_HAS_GARBAGE, P_HIKEY, P_RIGHTMOST, PageAddItem, PageGetExactFreeSpace(), PageGetItem(), PageGetItemId(), PageGetLSN(), PageGetMaxOffsetNumber(), PageGetTempPageCopySpecial(), PageRestoreTempPage(), PageSetLSN(), palloc(), pfree(), PG_USED_FOR_ASSERTS_ONLY, REGBUF_STANDARD, RelationNeedsWAL, SizeOfBtreeDedup, START_CRIT_SECTION, XLOG_BTREE_DEDUP, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_delete_or_dedup_one_page().

◆ _bt_dedup_save_htid()

bool _bt_dedup_save_htid ( BTDedupState  state,
IndexTuple  itup 
)

Definition at line 484 of file nbtdedup.c.

485{
486 int nhtids;
487 ItemPointer htids;
488 Size mergedtupsz;
489
 490 Assert(!BTreeTupleIsPivot(itup));
 491
492 if (!BTreeTupleIsPosting(itup))
493 {
494 nhtids = 1;
495 htids = &itup->t_tid;
496 }
497 else
498 {
499 nhtids = BTreeTupleGetNPosting(itup);
500 htids = BTreeTupleGetPosting(itup);
501 }
502
503 /*
504 * Don't append (have caller finish pending posting list as-is) if
505 * appending heap TID(s) from itup would put us over maxpostingsize limit.
506 *
507 * This calculation needs to match the code used within _bt_form_posting()
508 * for new posting list tuples.
509 */
510 mergedtupsz = MAXALIGN(state->basetupsize +
511 (state->nhtids + nhtids) * sizeof(ItemPointerData));
512
513 if (mergedtupsz > state->maxpostingsize)
514 {
515 /*
516 * Count this as an oversized item for single value strategy, though
517 * only when there are 50 TIDs in the final posting list tuple. This
518 * limit (which is fairly arbitrary) avoids confusion about how many
519 * 1/6 of a page tuples have been encountered/created by the current
520 * deduplication pass.
521 *
522 * Note: We deliberately don't consider which deduplication pass
523 * merged together tuples to create this item (could be a previous
524 * deduplication pass, or current pass). See _bt_do_singleval()
525 * comments.
526 */
527 if (state->nhtids > 50)
528 state->nmaxitems++;
529
530 return false;
531 }
532
533 /*
534 * Save heap TIDs to pending posting list tuple -- itup can be merged into
535 * pending posting list
536 */
537 state->nitems++;
538 memcpy(state->htids + state->nhtids, htids,
539 sizeof(ItemPointerData) * nhtids);
540 state->nhtids += nhtids;
541 state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
542
543 return true;
544}
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:518
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition: nbtree.h:537

References Assert, BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), IndexTupleSize(), MAXALIGN, and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_load(), and btree_xlog_dedup().

◆ _bt_dedup_start_pending()

void _bt_dedup_start_pending ( BTDedupState  state,
IndexTuple  base,
OffsetNumber  baseoff 
)

Definition at line 433 of file nbtdedup.c.

435{
436 Assert(state->nhtids == 0);
437 Assert(state->nitems == 0);
 438 Assert(!BTreeTupleIsPivot(base));
 439
440 /*
441 * Copy heap TID(s) from new base tuple for new candidate posting list
442 * into working state's array
443 */
444 if (!BTreeTupleIsPosting(base))
445 {
446 memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
447 state->nhtids = 1;
448 state->basetupsize = IndexTupleSize(base);
449 }
450 else
451 {
452 int nposting;
453
454 nposting = BTreeTupleGetNPosting(base);
455 memcpy(state->htids, BTreeTupleGetPosting(base),
456 sizeof(ItemPointerData) * nposting);
457 state->nhtids = nposting;
458 /* basetupsize should not include existing posting list */
459 state->basetupsize = BTreeTupleGetPostingOffset(base);
460 }
461
462 /*
463 * Save new base tuple itself -- it'll be needed if we actually create a
464 * new posting list from new pending posting list.
465 *
466 * Must maintain physical size of all existing tuples (including line
467 * pointer overhead) so that we can calculate space savings on page.
468 */
469 state->nitems = 1;
470 state->base = base;
471 state->baseoff = baseoff;
472 state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
473 /* Also save baseoff in pending state for interval */
474 state->intervals[state->nintervals].baseoff = state->baseoff;
475}
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:529

References Assert, BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), IndexTupleSize(), MAXALIGN, and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_load(), and btree_xlog_dedup().
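
Taken together, _bt_dedup_start_pending(), _bt_dedup_save_htid(), and _bt_dedup_finish_pending() form a small state machine over a page's items, as driven by _bt_dedup_pass() above. A condensed sketch of that loop (state, newpage, and the offset bounds initialized as shown earlier; key equality decided by _bt_keep_natts_fast()):

    for (offnum = minoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
    {
        IndexTuple  itup = (IndexTuple) PageGetItem(page,
                                                    PageGetItemId(page, offnum));

        if (offnum == minoff)
            _bt_dedup_start_pending(state, itup, offnum);   /* first base */
        else if (state->deduplicate &&
                 _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
                 _bt_dedup_save_htid(state, itup))
        {
            /* itup's heap TIDs were absorbed into the pending posting list */
        }
        else
        {
            /* flush the pending interval to newpage; itup starts a new one */
            spacesaving += _bt_dedup_finish_pending(newpage, state);
            _bt_dedup_start_pending(state, itup, offnum);
        }
    }
    spacesaving += _bt_dedup_finish_pending(newpage, state);   /* last one */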

◆ _bt_delitems_delete_check()

void _bt_delitems_delete_check ( Relation  rel,
Buffer  buf,
Relation  heapRel,
TM_IndexDeleteOp delstate 
)

Definition at line 1513 of file nbtpage.c.

1515{
1516 Page page = BufferGetPage(buf);
1517 TransactionId snapshotConflictHorizon;
1518 bool isCatalogRel;
1519 OffsetNumber postingidxoffnum = InvalidOffsetNumber;
1520 int ndeletable = 0,
1521 nupdatable = 0;
 1522 OffsetNumber deletable[MaxIndexTuplesPerPage];
 1523 BTVacuumPosting updatable[MaxIndexTuplesPerPage];
 1524
1525 /* Use tableam interface to determine which tuples to delete first */
1526 snapshotConflictHorizon = table_index_delete_tuples(heapRel, delstate);
1527 isCatalogRel = RelationIsAccessibleInLogicalDecoding(heapRel);
1528
1529 /* Should not WAL-log snapshotConflictHorizon unless it's required */
1530 if (!XLogStandbyInfoActive())
1531 snapshotConflictHorizon = InvalidTransactionId;
1532
1533 /*
1534 * Construct a leaf-page-wise description of what _bt_delitems_delete()
1535 * needs to do to physically delete index tuples from the page.
1536 *
1537 * Must sort deltids array to restore leaf-page-wise order (original order
1538 * before call to tableam). This is the order that the loop expects.
1539 *
1540 * Note that deltids array might be a lot smaller now. It might even have
1541 * no entries at all (with bottom-up deletion caller), in which case there
1542 * is nothing left to do.
1543 */
 1544 qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
 1545 _bt_delitems_cmp);
 1546 if (delstate->ndeltids == 0)
1547 {
1548 Assert(delstate->bottomup);
1549 return;
1550 }
1551
1552 /* We definitely have to delete at least one index tuple (or one TID) */
1553 for (int i = 0; i < delstate->ndeltids; i++)
1554 {
1555 TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
1556 OffsetNumber idxoffnum = dstatus->idxoffnum;
1557 ItemId itemid = PageGetItemId(page, idxoffnum);
1558 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
1559 int nestedi,
1560 nitem;
1561 BTVacuumPosting vacposting;
1562
1563 Assert(OffsetNumberIsValid(idxoffnum));
1564
1565 if (idxoffnum == postingidxoffnum)
1566 {
1567 /*
1568 * This deltid entry is a TID from a posting list tuple that has
1569 * already been completely processed
1570 */
 1571 Assert(BTreeTupleIsPosting(itup));
 1572 Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
 1573 &delstate->deltids[i].tid) < 0);
 1574 Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),
 1575 &delstate->deltids[i].tid) >= 0);
1576 continue;
1577 }
1578
1579 if (!BTreeTupleIsPosting(itup))
1580 {
1581 /* Plain non-pivot tuple */
1582 Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
1583 if (dstatus->knowndeletable)
1584 deletable[ndeletable++] = idxoffnum;
1585 continue;
1586 }
1587
1588 /*
1589 * itup is a posting list tuple whose lowest deltids entry (which may
1590 * or may not be for the first TID from itup) is considered here now.
1591 * We should process all of the deltids entries for the posting list
1592 * together now, though (not just the lowest). Remember to skip over
1593 * later itup-related entries during later iterations of outermost
1594 * loop.
1595 */
1596 postingidxoffnum = idxoffnum; /* Remember work in outermost loop */
1597 nestedi = i; /* Initialize for first itup deltids entry */
1598 vacposting = NULL; /* Describes final action for itup */
1599 nitem = BTreeTupleGetNPosting(itup);
1600 for (int p = 0; p < nitem; p++)
1601 {
1602 ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
1603 int ptidcmp = -1;
1604
1605 /*
1606 * This nested loop reuses work across ptid TIDs taken from itup.
1607 * We take advantage of the fact that both itup's TIDs and deltids
1608 * entries (within a single itup/posting list grouping) must both
1609 * be in ascending TID order.
1610 */
1611 for (; nestedi < delstate->ndeltids; nestedi++)
1612 {
1613 TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
1614 TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
1615
1616 /* Stop once we get past all itup related deltids entries */
1617 Assert(tdstatus->idxoffnum >= idxoffnum);
1618 if (tdstatus->idxoffnum != idxoffnum)
1619 break;
1620
1621 /* Skip past non-deletable itup related entries up front */
1622 if (!tdstatus->knowndeletable)
1623 continue;
1624
1625 /* Entry is first partial ptid match (or an exact match)? */
1626 ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
1627 if (ptidcmp >= 0)
1628 {
1629 /* Greater than or equal (partial or exact) match... */
1630 break;
1631 }
1632 }
1633
1634 /* ...exact ptid match to a deletable deltids entry? */
1635 if (ptidcmp != 0)
1636 continue;
1637
1638 /* Exact match for deletable deltids entry -- ptid gets deleted */
1639 if (vacposting == NULL)
1640 {
1641 vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1642 nitem * sizeof(uint16));
1643 vacposting->itup = itup;
1644 vacposting->updatedoffset = idxoffnum;
1645 vacposting->ndeletedtids = 0;
1646 }
1647 vacposting->deletetids[vacposting->ndeletedtids++] = p;
1648 }
1649
1650 /* Final decision on itup, a posting list tuple */
1651
1652 if (vacposting == NULL)
1653 {
1654 /* No TIDs to delete from itup -- do nothing */
1655 }
1656 else if (vacposting->ndeletedtids == nitem)
1657 {
1658 /* Straight delete of itup (to delete all TIDs) */
1659 deletable[ndeletable++] = idxoffnum;
1660 /* Turns out we won't need granular information */
1661 pfree(vacposting);
1662 }
1663 else
1664 {
1665 /* Delete some (but not all) TIDs from itup */
1666 Assert(vacposting->ndeletedtids > 0 &&
1667 vacposting->ndeletedtids < nitem);
1668 updatable[nupdatable++] = vacposting;
1669 }
1670 }
1671
1672 /* Physically delete tuples (or TIDs) using deletable (or updatable) */
1673 _bt_delitems_delete(rel, buf, snapshotConflictHorizon, isCatalogRel,
1674 deletable, ndeletable, updatable, nupdatable);
1675
1676 /* be tidy */
1677 for (int i = 0; i < nupdatable; i++)
1678 pfree(updatable[i]);
1679}
uint16_t uint16
Definition: c.h:487
uint32 TransactionId
Definition: c.h:609
bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2)
Definition: itemptr.c:35
#define MaxIndexTuplesPerPage
Definition: itup.h:181
static void _bt_delitems_delete(Relation rel, Buffer buf, TransactionId snapshotConflictHorizon, bool isCatalogRel, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
Definition: nbtpage.c:1284
static int _bt_delitems_cmp(const void *a, const void *b)
Definition: nbtpage.c:1464
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:544
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
#define qsort(a, b, c, d)
Definition: port.h:474
uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.h:916
uint16 ndeletedtids
Definition: nbtree.h:915
IndexTuple itup
Definition: nbtree.h:911
OffsetNumber updatedoffset
Definition: nbtree.h:912
ItemPointerData tid
Definition: tableam.h:212
bool knowndeletable
Definition: tableam.h:219
OffsetNumber idxoffnum
Definition: tableam.h:218
static TransactionId table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
Definition: tableam.h:1361
#define InvalidTransactionId
Definition: transam.h:31

References _bt_delitems_cmp(), _bt_delitems_delete(), Assert, TM_IndexDeleteOp::bottomup, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), buf, BufferGetPage(), BTVacuumPostingData::deletetids, TM_IndexDeleteOp::deltids, i, TM_IndexDelete::id, TM_IndexStatus::idxoffnum, InvalidOffsetNumber, InvalidTransactionId, ItemPointerCompare(), ItemPointerEquals(), BTVacuumPostingData::itup, TM_IndexStatus::knowndeletable, MaxIndexTuplesPerPage, BTVacuumPostingData::ndeletedtids, TM_IndexDeleteOp::ndeltids, OffsetNumberIsValid, PageGetItem(), PageGetItemId(), palloc(), pfree(), qsort, RelationIsAccessibleInLogicalDecoding, TM_IndexDeleteOp::status, IndexTupleData::t_tid, table_index_delete_tuples(), TM_IndexDelete::tid, BTVacuumPostingData::updatedoffset, and XLogStandbyInfoActive.

Referenced by _bt_bottomupdel_pass(), and _bt_simpledel_pass().
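
A sketch of the caller's setup, loosely following _bt_simpledel_pass(): the caller sizes the deltids/status arrays, fills one entry per candidate TID, and lets the tableam (through this function) decide what is actually deletable. The fill step is elided:

    TM_IndexDeleteOp delstate;

    delstate.irel = rel;
    delstate.iblknum = BufferGetBlockNumber(buf);
    delstate.bottomup = false;          /* simple deletion pass */
    delstate.bottomupfreespace = 0;
    delstate.ndeltids = 0;
    delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
    delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));

    /* ... append a TM_IndexDelete and TM_IndexStatus per candidate TID ... */

    _bt_delitems_delete_check(rel, buf, heapRel, &delstate);

    pfree(delstate.deltids);
    pfree(delstate.status);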

◆ _bt_delitems_vacuum()

void _bt_delitems_vacuum ( Relation  rel,
Buffer  buf,
OffsetNumber deletable,
int  ndeletable,
BTVacuumPosting updatable,
int  nupdatable 
)

Definition at line 1154 of file nbtpage.c.

1157{
1158 Page page = BufferGetPage(buf);
1159 BTPageOpaque opaque;
1160 bool needswal = RelationNeedsWAL(rel);
1161 char *updatedbuf = NULL;
1162 Size updatedbuflen = 0;
1163 OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1164
1165 /* Shouldn't be called unless there's something to do */
1166 Assert(ndeletable > 0 || nupdatable > 0);
1167
1168 /* Generate new version of posting lists without deleted TIDs */
1169 if (nupdatable > 0)
1170 updatedbuf = _bt_delitems_update(updatable, nupdatable,
1171 updatedoffsets, &updatedbuflen,
1172 needswal);
1173
1174 /* No ereport(ERROR) until changes are logged */
 1175 START_CRIT_SECTION();
 1176
1177 /*
1178 * Handle posting tuple updates.
1179 *
1180 * Deliberately do this before handling simple deletes. If we did it the
1181 * other way around (i.e. WAL record order -- simple deletes before
1182 * updates) then we'd have to make compensating changes to the 'updatable'
1183 * array of offset numbers.
1184 *
1185 * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
1186 * happens to already be set. It's important that we not interfere with
1187 * any future simple index tuple deletion operations.
1188 */
1189 for (int i = 0; i < nupdatable; i++)
1190 {
1191 OffsetNumber updatedoffset = updatedoffsets[i];
1192 IndexTuple itup;
1193 Size itemsz;
1194
1195 itup = updatable[i]->itup;
1196 itemsz = MAXALIGN(IndexTupleSize(itup));
1197 if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
1198 itemsz))
1199 elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1201 }
1202
1203 /* Now handle simple deletes of entire tuples */
1204 if (ndeletable > 0)
1205 PageIndexMultiDelete(page, deletable, ndeletable);
1206
1207 /*
1208 * We can clear the vacuum cycle ID since this page has certainly been
1209 * processed by the current vacuum scan.
1210 */
1211 opaque = BTPageGetOpaque(page);
1212 opaque->btpo_cycleid = 0;
1213
1214 /*
1215 * Clear the BTP_HAS_GARBAGE page flag.
1216 *
1217 * This flag indicates the presence of LP_DEAD items on the page (though
1218 * not reliably). Note that we only rely on it with pg_upgrade'd
1219 * !heapkeyspace indexes. That's why clearing it here won't usually
1220 * interfere with simple index tuple deletion.
1221 */
1222 opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1223
 1224 MarkBufferDirty(buf);
 1225
1226 /* XLOG stuff */
1227 if (needswal)
1228 {
1229 XLogRecPtr recptr;
1230 xl_btree_vacuum xlrec_vacuum;
1231
1232 xlrec_vacuum.ndeleted = ndeletable;
1233 xlrec_vacuum.nupdated = nupdatable;
1234
 1235 XLogBeginInsert();
 1236 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
 1237 XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
1238
1239 if (ndeletable > 0)
1240 XLogRegisterBufData(0, (char *) deletable,
1241 ndeletable * sizeof(OffsetNumber));
1242
1243 if (nupdatable > 0)
1244 {
1245 XLogRegisterBufData(0, (char *) updatedoffsets,
1246 nupdatable * sizeof(OffsetNumber));
1247 XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1248 }
1249
1250 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
1251
1252 PageSetLSN(page, recptr);
1253 }
1254
 1255 END_CRIT_SECTION();
 1256
1257 /* can't leak memory here */
1258 if (updatedbuf != NULL)
1259 pfree(updatedbuf);
1260 /* free tuples allocated within _bt_delitems_update() */
1261 for (int i = 0; i < nupdatable; i++)
1262 pfree(updatable[i]->itup);
1263}
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:1150
bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, Item newtup, Size newsize)
Definition: bufpage.c:1394
#define PANIC
Definition: elog.h:42
static char * _bt_delitems_update(BTVacuumPosting *updatable, int nupdatable, OffsetNumber *updatedoffsets, Size *updatedbuflen, bool needswal)
Definition: nbtpage.c:1405
#define SizeOfBtreeVacuum
Definition: nbtxlog.h:234
#define XLOG_BTREE_VACUUM
Definition: nbtxlog.h:39
BTCycleId btpo_cycleid
Definition: nbtree.h:68
uint16 ndeleted
Definition: nbtxlog.h:222
uint16 nupdated
Definition: nbtxlog.h:223

References _bt_delitems_update(), Assert, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, buf, BufferGetBlockNumber(), BufferGetPage(), elog, END_CRIT_SECTION, i, IndexTupleSize(), BTVacuumPostingData::itup, MarkBufferDirty(), MAXALIGN, MaxIndexTuplesPerPage, xl_btree_vacuum::ndeleted, xl_btree_vacuum::nupdated, PageIndexMultiDelete(), PageIndexTupleOverwrite(), PageSetLSN(), PANIC, pfree(), REGBUF_STANDARD, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeVacuum, START_CRIT_SECTION, XLOG_BTREE_VACUUM, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by btvacuumpage().
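
The lone caller, btvacuumpage(), accumulates whole-tuple deletions and posting-list updates while scanning a leaf page, then makes a single call (sketch; the accumulation loop is elided):

    OffsetNumber    deletable[MaxIndexTuplesPerPage];
    BTVacuumPosting updatable[MaxIndexTuplesPerPage];
    int             ndeletable = 0;
    int             nupdatable = 0;

    /* ... scan the page's items, filling deletable[] and updatable[] ... */

    if (ndeletable > 0 || nupdatable > 0)
        _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
                            updatable, nupdatable);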

◆ _bt_doinsert()

bool _bt_doinsert ( Relation  rel,
IndexTuple  itup,
IndexUniqueCheck  checkUnique,
bool  indexUnchanged,
Relation  heapRel 
)

Definition at line 102 of file nbtinsert.c.

105{
106 bool is_unique = false;
107 BTInsertStateData insertstate;
108 BTScanInsert itup_key;
109 BTStack stack;
110 bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
111
112 /* we need an insertion scan key to do our search, so build one */
113 itup_key = _bt_mkscankey(rel, itup);
114
115 if (checkingunique)
116 {
117 if (!itup_key->anynullkeys)
118 {
119 /* No (heapkeyspace) scantid until uniqueness established */
120 itup_key->scantid = NULL;
121 }
122 else
123 {
124 /*
125 * Scan key for new tuple contains NULL key values. Bypass
126 * checkingunique steps. They are unnecessary because core code
127 * considers NULL unequal to every value, including NULL.
128 *
129 * This optimization avoids O(N^2) behavior within the
130 * _bt_findinsertloc() heapkeyspace path when a unique index has a
131 * large number of "duplicates" with NULL key values.
132 */
133 checkingunique = false;
134 /* Tuple is unique in the sense that core code cares about */
135 Assert(checkUnique != UNIQUE_CHECK_EXISTING);
136 is_unique = true;
137 }
138 }
139
140 /*
141 * Fill in the BTInsertState working area, to track the current page and
142 * position within the page to insert on.
143 *
144 * Note that itemsz is passed down to lower level code that deals with
145 * inserting the item. It must be MAXALIGN()'d. This ensures that space
146 * accounting code consistently considers the alignment overhead that we
147 * expect PageAddItem() will add later. (Actually, index_form_tuple() is
148 * already conservative about alignment, but we don't rely on that from
149 * this distance. Besides, preserving the "true" tuple size in index
150 * tuple headers for the benefit of nbtsplitloc.c might happen someday.
151 * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
152 */
153 insertstate.itup = itup;
154 insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
155 insertstate.itup_key = itup_key;
156 insertstate.bounds_valid = false;
157 insertstate.buf = InvalidBuffer;
158 insertstate.postingoff = 0;
159
160search:
161
162 /*
163 * Find and lock the leaf page that the tuple should be added to by
164 * searching from the root page. insertstate.buf will hold a buffer that
165 * is locked in exclusive mode afterwards.
166 */
167 stack = _bt_search_insert(rel, heapRel, &insertstate);
168
169 /*
170 * checkingunique inserts are not allowed to go ahead when two tuples with
171 * equal key attribute values would be visible to new MVCC snapshots once
172 * the xact commits. Check for conflicts in the locked page/buffer (if
173 * needed) here.
174 *
175 * It might be necessary to check a page to the right in _bt_check_unique,
176 * though that should be very rare. In practice the first page the value
177 * could be on (with scantid omitted) is almost always also the only page
178 * that a matching tuple might be found on. This is due to the behavior
179 * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
180 * only be allowed to cross a page boundary when there is no candidate
181 * leaf page split point that avoids it. Also, _bt_check_unique can use
182 * the leaf page high key to determine that there will be no duplicates on
183 * the right sibling without actually visiting it (it uses the high key in
184 * cases where the new item happens to belong at the far right of the leaf
185 * page).
186 *
187 * NOTE: obviously, _bt_check_unique can only detect keys that are already
188 * in the index; so it cannot defend against concurrent insertions of the
189 * same key. We protect against that by means of holding a write lock on
190 * the first page the value could be on, with omitted/-inf value for the
191 * implicit heap TID tiebreaker attribute. Any other would-be inserter of
192 * the same key must acquire a write lock on the same page, so only one
193 * would-be inserter can be making the check at one time. Furthermore,
194 * once we are past the check we hold write locks continuously until we
195 * have performed our insertion, so no later inserter can fail to see our
196 * insertion. (This requires some care in _bt_findinsertloc.)
197 *
198 * If we must wait for another xact, we release the lock while waiting,
199 * and then must perform a new search.
200 *
201 * For a partial uniqueness check, we don't wait for the other xact. Just
202 * let the tuple in and return false for possibly non-unique, or true for
203 * definitely unique.
204 */
205 if (checkingunique)
206 {
207 TransactionId xwait;
208 uint32 speculativeToken;
209
210 xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
211 &is_unique, &speculativeToken);
212
213 if (unlikely(TransactionIdIsValid(xwait)))
214 {
215 /* Have to wait for the other guy ... */
216 _bt_relbuf(rel, insertstate.buf);
217 insertstate.buf = InvalidBuffer;
218
219 /*
220 * If it's a speculative insertion, wait for it to finish (ie. to
221 * go ahead with the insertion, or kill the tuple). Otherwise
222 * wait for the transaction to finish as usual.
223 */
224 if (speculativeToken)
225 SpeculativeInsertionWait(xwait, speculativeToken);
226 else
227 XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
228
229 /* start over... */
230 if (stack)
231 _bt_freestack(stack);
232 goto search;
233 }
234
235 /* Uniqueness is established -- restore heap tid as scantid */
236 if (itup_key->heapkeyspace)
237 itup_key->scantid = &itup->t_tid;
238 }
239
240 if (checkUnique != UNIQUE_CHECK_EXISTING)
241 {
242 OffsetNumber newitemoff;
243
244 /*
245 * The only conflict predicate locking cares about for indexes is when
246 * an index tuple insert conflicts with an existing lock. We don't
247 * know the actual page we're going to insert on for sure just yet in
248 * checkingunique and !heapkeyspace cases, but it's okay to use the
249 * first page the value could be on (with scantid omitted) instead.
250 */
 251 CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
 252
253 /*
254 * Do the insertion. Note that insertstate contains cached binary
255 * search bounds established within _bt_check_unique when insertion is
256 * checkingunique.
257 */
258 newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
259 indexUnchanged, stack, heapRel);
260 _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
261 stack, itup, insertstate.itemsz, newitemoff,
262 insertstate.postingoff, false);
263 }
264 else
265 {
266 /* just release the buffer */
267 _bt_relbuf(rel, insertstate.buf);
268 }
269
270 /* be tidy */
271 if (stack)
272 _bt_freestack(stack);
273 pfree(itup_key);
274
275 return is_unique;
276}
uint32_t uint32
Definition: c.h:488
@ UNIQUE_CHECK_NO
Definition: genam.h:119
@ UNIQUE_CHECK_EXISTING
Definition: genam.h:122
void XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, XLTW_Oper oper)
Definition: lmgr.c:656
void SpeculativeInsertionWait(TransactionId xid, uint32 token)
Definition: lmgr.c:813
@ XLTW_InsertIndex
Definition: lmgr.h:31
static BTStack _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
Definition: nbtinsert.c:317
static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, bool indexUnchanged, BTStack stack, Relation heapRel)
Definition: nbtinsert.c:815
static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, Size itemsz, OffsetNumber newitemoff, int postingoff, bool split_only_page)
Definition: nbtinsert.c:1105
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition: nbtinsert.c:408
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:172
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:80
void CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno)
Definition: predicate.c:4326
IndexTuple itup
Definition: nbtree.h:816
ItemPointer scantid
Definition: nbtree.h:796
bool heapkeyspace
Definition: nbtree.h:791
bool anynullkeys
Definition: nbtree.h:793
#define TransactionIdIsValid(xid)
Definition: transam.h:41

References _bt_check_unique(), _bt_findinsertloc(), _bt_freestack(), _bt_insertonpg(), _bt_mkscankey(), _bt_relbuf(), _bt_search_insert(), BTScanInsertData::anynullkeys, Assert, BTInsertStateData::bounds_valid, BTInsertStateData::buf, BufferGetBlockNumber(), CheckForSerializableConflictIn(), BTScanInsertData::heapkeyspace, IndexTupleSize(), InvalidBuffer, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, MAXALIGN, pfree(), BTInsertStateData::postingoff, BTScanInsertData::scantid, SpeculativeInsertionWait(), IndexTupleData::t_tid, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_NO, unlikely, XactLockTableWait(), and XLTW_InsertIndex.

Referenced by btinsert().
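
For orientation, btinsert() (listed under Referenced by) reduces to forming an index tuple, stamping its heap TID, and handing it to _bt_doinsert(). A minimal caller sketch follows; the wrapper name insert_index_entry is hypothetical, while the _bt_doinsert() call itself matches the signature documented above.

    #include "postgres.h"
    #include "access/genam.h"
    #include "access/itup.h"
    #include "access/nbtree.h"
    #include "utils/rel.h"

    /* Hypothetical wrapper, modeled loosely on btinsert() */
    static bool
    insert_index_entry(Relation rel, Datum *values, bool *isnull,
                       ItemPointer ht_ctid, Relation heapRel,
                       IndexUniqueCheck checkUnique)
    {
        IndexTuple itup;
        bool       result;

        /* Build the index tuple, then stamp it with the heap TID it references */
        itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
        itup->t_tid = *ht_ctid;

        /* Returns whether the key was found unique (when uniqueness is checked) */
        result = _bt_doinsert(rel, itup, checkUnique, false, heapRel);

        pfree(itup);
        return result;
    }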

◆ _bt_end_vacuum()

void _bt_end_vacuum ( Relation  rel)

Definition at line 2641 of file nbtutils.c.

2642{
2643 int i;
2644
2645 LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
2646
2647 /* Find the array entry */
2648 for (i = 0; i < btvacinfo->num_vacuums; i++)
2649 {
2650 BTOneVacInfo *vac = &btvacinfo->vacuums[i];
2651
2652 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
2653 vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
2654 {
2655 /* Remove it by shifting down the last entry */
2656 *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
2657 btvacinfo->num_vacuums--;
2658 break;
2659 }
2660 }
2661
2662 LWLockRelease(BtreeVacuumLock);
2663}
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1168
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1781
@ LW_EXCLUSIVE
Definition: lwlock.h:114
static BTVacInfo * btvacinfo
Definition: nbtutils.c:2537
LockRelId relid
Definition: nbtutils.c:2525
int num_vacuums
Definition: nbtutils.c:2532
BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtutils.c:2534
LockRelId lockRelId
Definition: rel.h:46
Oid relId
Definition: rel.h:40
Oid dbId
Definition: rel.h:41
LockInfoData rd_lockInfo
Definition: rel.h:114

References btvacinfo, LockRelId::dbId, i, LockInfoData::lockRelId, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTVacInfo::num_vacuums, RelationData::rd_lockInfo, BTOneVacInfo::relid, LockRelId::relId, and BTVacInfo::vacuums.

Referenced by _bt_end_vacuum_callback(), and btbulkdelete().
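
_bt_end_vacuum() must run even when a vacuum errors out partway, which is why btbulkdelete() registers _bt_end_vacuum_callback() (next entry) with the ensure-cleanup machinery. A sketch of that discipline, assuming the usual _bt_start_vacuum() signature (BTCycleId _bt_start_vacuum(Relation)); the wrapper name vacuum_one_btree is hypothetical.

    #include "postgres.h"
    #include "access/nbtree.h"
    #include "storage/ipc.h"        /* PG_ENSURE_ERROR_CLEANUP */

    static void
    vacuum_one_btree(Relation rel)
    {
        BTCycleId cycleid;

        /* Guarantee the vacuum-in-progress entry is removed on ERROR too */
        PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
        {
            cycleid = _bt_start_vacuum(rel);
            (void) cycleid;         /* ... scan the index, deleting entries ... */
        }
        PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
        _bt_end_vacuum(rel);
    }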

◆ _bt_end_vacuum_callback()

void _bt_end_vacuum_callback ( int  code,
Datum  arg 
)

Definition at line 2669 of file nbtutils.c.

2670{
2671 _bt_end_vacuum((Relation) DatumGetPointer(arg));
2672 }
void _bt_end_vacuum(Relation rel)
Definition: nbtutils.c:2641
void * arg
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:317

References _bt_end_vacuum(), arg, and DatumGetPointer().

Referenced by btbulkdelete().

◆ _bt_findsplitloc()

OffsetNumber _bt_findsplitloc ( Relation  rel,
Page  origpage,
OffsetNumber  newitemoff,
Size  newitemsz,
IndexTuple  newitem,
bool *  newitemonleft 
)

Definition at line 129 of file nbtsplitloc.c.

135{
136 BTPageOpaque opaque;
137 int leftspace,
138 rightspace,
139 olddataitemstotal,
140 olddataitemstoleft,
141 perfectpenalty,
142 leaffillfactor;
143 FindSplitData state;
144 FindSplitStrat strategy;
145 ItemId itemid;
146 OffsetNumber offnum,
147 maxoff,
148 firstrightoff;
149 double fillfactormult;
150 bool usemult;
151 SplitPoint leftpage,
152 rightpage;
153
154 opaque = BTPageGetOpaque(origpage);
155 maxoff = PageGetMaxOffsetNumber(origpage);
156
157 /* Total free space available on a btree page, after fixed overhead */
158 leftspace = rightspace =
159 PageGetPageSize(origpage) - SizeOfPageHeaderData -
160 MAXALIGN(sizeof(BTPageOpaqueData));
161
162 /* The right page will have the same high key as the old page */
163 if (!P_RIGHTMOST(opaque))
164 {
165 itemid = PageGetItemId(origpage, P_HIKEY);
166 rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
167 sizeof(ItemIdData));
168 }
169
170 /* Count up total space in data items before actually scanning 'em */
171 olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(origpage);
172 leaffillfactor = BTGetFillFactor(rel);
173
174 /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
175 newitemsz += sizeof(ItemIdData);
176 state.rel = rel;
177 state.origpage = origpage;
178 state.newitem = newitem;
179 state.newitemsz = newitemsz;
180 state.is_leaf = P_ISLEAF(opaque);
181 state.is_rightmost = P_RIGHTMOST(opaque);
182 state.leftspace = leftspace;
183 state.rightspace = rightspace;
184 state.olddataitemstotal = olddataitemstotal;
185 state.minfirstrightsz = SIZE_MAX;
186 state.newitemoff = newitemoff;
187
188 /* newitem cannot be a posting list item */
189 Assert(!BTreeTupleIsPosting(newitem));
190
191 /*
192 * nsplits should never exceed maxoff because there will be at most as
193 * many candidate split points as there are points _between_ tuples, once
194 * you imagine that the new item is already on the original page (the
195 * final number of splits may be slightly lower because not all points
196 * between tuples will be legal).
197 */
198 state.maxsplits = maxoff;
199 state.splits = palloc(sizeof(SplitPoint) * state.maxsplits);
200 state.nsplits = 0;
201
202 /*
203 * Scan through the data items and calculate space usage for a split at
204 * each possible position
205 */
206 olddataitemstoleft = 0;
207
208 for (offnum = P_FIRSTDATAKEY(opaque);
209 offnum <= maxoff;
210 offnum = OffsetNumberNext(offnum))
211 {
212 Size itemsz;
213
214 itemid = PageGetItemId(origpage, offnum);
215 itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
216
217 /*
218 * When item offset number is not newitemoff, neither side of the
219 * split can be newitem. Record a split after the previous data item
220 * from original page, but before the current data item from original
221 * page. (_bt_recsplitloc() will reject the split when there are no
222 * previous items, which we rely on.)
223 */
224 if (offnum < newitemoff)
225 _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
226 else if (offnum > newitemoff)
227 _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
228 else
229 {
230 /*
231 * Record a split after all "offnum < newitemoff" original page
232 * data items, but before newitem
233 */
234 _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
235
236 /*
237 * Record a split after newitem, but before data item from
238 * original page at offset newitemoff/current offset
239 */
240 _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
241 }
242
243 olddataitemstoleft += itemsz;
244 }
245
246 /*
247 * Record a split after all original page data items, but before newitem.
248 * (Though only when it's possible that newitem will end up alone on new
249 * right page.)
250 */
251 Assert(olddataitemstoleft == olddataitemstotal);
252 if (newitemoff > maxoff)
253 _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0);
254
255 /*
256 * I believe it is not possible to fail to find a feasible split, but just
257 * in case ...
258 */
259 if (state.nsplits == 0)
260 elog(ERROR, "could not find a feasible split point for index \"%s\"",
261 RelationGetRelationName(rel));
262
263 /*
264 * Start search for a split point among list of legal split points. Give
265 * primary consideration to equalizing available free space in each half
266 * of the split initially (start with default strategy), while applying
267 * rightmost and split-after-new-item optimizations where appropriate.
268 * Either of the two other fallback strategies may be required for cases
269 * with a large number of duplicates around the original/space-optimal
270 * split point.
271 *
272 * Default strategy gives some weight to suffix truncation in deciding a
273 * split point on leaf pages. It attempts to select a split point where a
274 * distinguishing attribute appears earlier in the new high key for the
275 * left side of the split, in order to maximize the number of trailing
276 * attributes that can be truncated away. Only candidate split points
277 * that imply an acceptable balance of free space on each side are
278 * considered. See _bt_defaultinterval().
279 */
280 if (!state.is_leaf)
281 {
282 /* fillfactormult only used on rightmost page */
283 usemult = state.is_rightmost;
284 fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0;
285 }
286 else if (state.is_rightmost)
287 {
288 /* Rightmost leaf page -- fillfactormult always used */
289 usemult = true;
290 fillfactormult = leaffillfactor / 100.0;
291 }
292 else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
293 {
294 /*
295 * New item inserted at rightmost point among a localized grouping on
296 * a leaf page -- apply "split after new item" optimization, either by
297 * applying leaf fillfactor multiplier, or by choosing the exact split
298 * point that leaves newitem as lastleft. (usemult is set for us.)
299 */
300 if (usemult)
301 {
302 /* fillfactormult should be set based on leaf fillfactor */
303 fillfactormult = leaffillfactor / 100.0;
304 }
305 else
306 {
307 /* find precise split point after newitemoff */
308 for (int i = 0; i < state.nsplits; i++)
309 {
310 SplitPoint *split = state.splits + i;
311
312 if (split->newitemonleft &&
313 newitemoff == split->firstrightoff)
314 {
315 pfree(state.splits);
316 *newitemonleft = true;
317 return newitemoff;
318 }
319 }
320
321 /*
322 * Cannot legally split after newitemoff; proceed with split
323 * without using fillfactor multiplier. This is defensive, and
324 * should never be needed in practice.
325 */
326 fillfactormult = 0.50;
327 }
328 }
329 else
330 {
331 /* Other leaf page. 50:50 page split. */
332 usemult = false;
333 /* fillfactormult not used, but be tidy */
334 fillfactormult = 0.50;
335 }
336
337 /*
338 * Save leftmost and rightmost splits for page before original ordinal
339 * sort order is lost by delta/fillfactormult sort
340 */
341 leftpage = state.splits[0];
342 rightpage = state.splits[state.nsplits - 1];
343
344 /* Give split points a fillfactormult-wise delta, and sort on deltas */
345 _bt_deltasortsplits(&state, fillfactormult, usemult);
346
347 /* Determine split interval for default strategy */
348 state.interval = _bt_defaultinterval(&state);
349
350 /*
351 * Determine if default strategy/split interval will produce a
352 * sufficiently distinguishing split, or if we should change strategies.
353 * Alternative strategies change the range of split points that are
354 * considered acceptable (split interval), and possibly change
355 * fillfactormult, in order to deal with pages with a large number of
356 * duplicates gracefully.
357 *
358 * Pass low and high splits for the entire page (actually, they're for an
359 * imaginary version of the page that includes newitem). These are used
360 * when the initial split interval encloses split points that are full of
361 * duplicates, and we need to consider if it's even possible to avoid
362 * appending a heap TID.
363 */
364 perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);
365
366 if (strategy == SPLIT_DEFAULT)
367 {
368 /*
369 * Default strategy worked out (always works out with internal page).
370 * Original split interval still stands.
371 */
372 }
373
374 /*
375 * Many duplicates strategy is used when a heap TID would otherwise be
376 * appended, but the page isn't completely full of logical duplicates.
377 *
378 * The split interval is widened to include all legal candidate split
379 * points. There might be a few as two distinct values in the whole-page
380 * split interval, though it's also possible that most of the values on
381 * the page are unique. The final split point will either be to the
382 * immediate left or to the immediate right of the group of duplicate
383 * tuples that enclose the first/delta-optimal split point (perfect
384 * penalty was set so that the lowest delta split point that avoids
385 * appending a heap TID will be chosen). Maximizing the number of
386 * attributes that can be truncated away is not a goal of the many
387 * duplicates strategy.
388 *
389 * Single value strategy is used when it is impossible to avoid appending
390 * a heap TID. It arranges to leave the left page very full. This
391 * maximizes space utilization in cases where tuples with the same
392 * attribute values span many pages. Newly inserted duplicates will tend
393 * to have higher heap TID values, so we'll end up splitting to the right
394 * consistently. (Single value strategy is harmless though not
395 * particularly useful with !heapkeyspace indexes.)
396 */
397 else if (strategy == SPLIT_MANY_DUPLICATES)
398 {
399 Assert(state.is_leaf);
400 /* Shouldn't try to truncate away extra user attributes */
401 Assert(perfectpenalty ==
402 IndexRelationGetNumberOfKeyAttributes(rel));
403 /* No need to resort splits -- no change in fillfactormult/deltas */
404 state.interval = state.nsplits;
405 }
406 else if (strategy == SPLIT_SINGLE_VALUE)
407 {
408 Assert(state.is_leaf);
409 /* Split near the end of the page */
410 usemult = true;
411 fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
412 /* Resort split points with new delta */
413 _bt_deltasortsplits(&state, fillfactormult, usemult);
414 /* Appending a heap TID is unavoidable, so interval of 1 is fine */
415 state.interval = 1;
416 }
417
418 /*
419 * Search among acceptable split points (using final split interval) for
420 * the entry that has the lowest penalty, and is therefore expected to
421 * maximize fan-out. Sets *newitemonleft for us.
422 */
423 firstrightoff = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
424 strategy);
425 pfree(state.splits);
426
427 return firstrightoff;
428}
#define BTREE_SINGLEVAL_FILLFACTOR
Definition: nbtree.h:202
#define BTGetFillFactor(relation)
Definition: nbtree.h:1133
#define BTREE_NONLEAF_FILLFACTOR
Definition: nbtree.h:201
static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult, bool usemult)
Definition: nbtsplitloc.c:566
static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage, SplitPoint *rightpage, FindSplitStrat *strategy)
Definition: nbtsplitloc.c:934
static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, int leaffillfactor, bool *usemult)
Definition: nbtsplitloc.c:630
static void _bt_recsplitloc(FindSplitData *state, OffsetNumber firstrightoff, bool newitemonleft, int olddataitemstoleft, Size firstrightofforigpagetuplesz)
Definition: nbtsplitloc.c:449
FindSplitStrat
Definition: nbtsplitloc.c:21
@ SPLIT_DEFAULT
Definition: nbtsplitloc.c:23
@ SPLIT_MANY_DUPLICATES
Definition: nbtsplitloc.c:24
@ SPLIT_SINGLE_VALUE
Definition: nbtsplitloc.c:25
static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft, FindSplitStrat strategy)
Definition: nbtsplitloc.c:788
static int _bt_defaultinterval(FindSplitData *state)
Definition: nbtsplitloc.c:876
bool newitemonleft
Definition: nbtsplitloc.c:37
OffsetNumber firstrightoff
Definition: nbtsplitloc.c:36

References _bt_afternewitemoff(), _bt_bestsplitloc(), _bt_defaultinterval(), _bt_deltasortsplits(), _bt_recsplitloc(), _bt_strategy(), Assert, BTGetFillFactor, BTPageGetOpaque, BTREE_NONLEAF_FILLFACTOR, BTREE_SINGLEVAL_FILLFACTOR, BTreeTupleIsPosting(), elog, ERROR, SplitPoint::firstrightoff, i, IndexRelationGetNumberOfKeyAttributes, ItemIdGetLength, MAXALIGN, SplitPoint::newitemonleft, OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_RIGHTMOST, PageGetExactFreeSpace(), PageGetItemId(), PageGetMaxOffsetNumber(), PageGetPageSize(), palloc(), pfree(), RelationGetRelationName, SizeOfPageHeaderData, SPLIT_DEFAULT, SPLIT_MANY_DUPLICATES, and SPLIT_SINGLE_VALUE.

Referenced by _bt_split().
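
The contract is easiest to see from the caller's side: _bt_split() asks where to break the page and learns which half receives the incoming tuple. The wrapper below is hypothetical; only the _bt_findsplitloc() call reflects the documented interface.

    /* Hypothetical wrapper showing the calling convention used by _bt_split() */
    static OffsetNumber
    choose_split(Relation rel, Page origpage, OffsetNumber newitemoff,
                 Size newitemsz, IndexTuple newitem, bool *newitemonleft)
    {
        OffsetNumber firstrightoff;

        firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff,
                                         newitemsz, newitem, newitemonleft);

        /*
         * Tuples at offsets < firstrightoff stay on the left page; the rest
         * move right.  *newitemonleft reports which half gets the new tuple.
         */
        return firstrightoff;
    }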

◆ _bt_finish_split()

void _bt_finish_split ( Relation  rel,
Relation  heaprel,
Buffer  lbuf,
BTStack  stack 
)

Definition at line 2241 of file nbtinsert.c.

2242{
2243 Page lpage = BufferGetPage(lbuf);
2244 BTPageOpaque lpageop = BTPageGetOpaque(lpage);
2245 Buffer rbuf;
2246 Page rpage;
2247 BTPageOpaque rpageop;
2248 bool wasroot;
2249 bool wasonly;
2250
2251 Assert(P_INCOMPLETE_SPLIT(lpageop));
2252 Assert(heaprel != NULL);
2253
2254 /* Lock right sibling, the one missing the downlink */
2255 rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2256 rpage = BufferGetPage(rbuf);
2257 rpageop = BTPageGetOpaque(rpage);
2258
2259 /* Could this be a root split? */
2260 if (!stack)
2261 {
2262 Buffer metabuf;
2263 Page metapg;
2264 BTMetaPageData *metad;
2265
2266 /* acquire lock on the metapage */
2267 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2268 metapg = BufferGetPage(metabuf);
2269 metad = BTPageGetMeta(metapg);
2270
2271 wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
2272
2273 _bt_relbuf(rel, metabuf);
2274 }
2275 else
2276 wasroot = false;
2277
2278 /* Was this the only page on the level before split? */
2279 wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2280
2281 elog(DEBUG1, "finishing incomplete split of %u/%u",
2282 BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2283
2284 _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
2285}
static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf, Buffer rbuf, BTStack stack, bool isroot, bool isonly)
Definition: nbtinsert.c:2099
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:845
#define BTPageGetMeta(p)
Definition: nbtree.h:121
#define P_LEFTMOST(opaque)
Definition: nbtree.h:218
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:227
#define BTREE_METAPAGE
Definition: nbtree.h:148
#define BT_WRITE
Definition: nbtree.h:725
BlockNumber btm_root
Definition: nbtree.h:107
BlockNumber btpo_next
Definition: nbtree.h:65

References _bt_getbuf(), _bt_insert_parent(), _bt_relbuf(), Assert, BT_WRITE, BTMetaPageData::btm_root, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTREE_METAPAGE, BufferGetBlockNumber(), BufferGetPage(), DEBUG1, elog, P_INCOMPLETE_SPLIT, P_LEFTMOST, and P_RIGHTMOST.

Referenced by _bt_getstackbuf(), _bt_moveright(), and _bt_stepright().
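
Callers listed above share one defensive pattern: on finding an incomplete split flagged on a write-locked page, finish it before doing anything else. A sketch, assuming rel, heaprel, buf, and stack are already in scope and buf is write-locked; note that _bt_finish_split() finishes with the buffer released (via _bt_insert_parent()).

    Page         page = BufferGetPage(buf);
    BTPageOpaque opaque = BTPageGetOpaque(page);

    if (P_INCOMPLETE_SPLIT(opaque))
        _bt_finish_split(rel, heaprel, buf, stack);   /* releases buf */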

◆ _bt_first()

bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 882 of file nbtsearch.c.

883{
884 Relation rel = scan->indexRelation;
885 BTScanOpaque so = (BTScanOpaque) scan->opaque;
886 BTStack stack;
887 OffsetNumber offnum;
888 BTScanInsertData inskey;
889 ScanKey startKeys[INDEX_MAX_KEYS];
890 ScanKeyData notnullkeys[INDEX_MAX_KEYS];
891 int keysz = 0;
892 StrategyNumber strat_total;
893 BlockNumber blkno = InvalidBlockNumber,
894 lastcurrblkno;
895
896 Assert(!BTScanPosIsValid(so->currPos));
897
898 /*
899 * Examine the scan keys and eliminate any redundant keys; also mark the
900 * keys that must be matched to continue the scan.
901 */
902 _bt_preprocess_keys(scan);
903
904 /*
905 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
906 * never be satisfied (eg, x == 1 AND x > 2).
907 */
908 if (!so->qual_ok)
909 {
910 Assert(!so->needPrimScan);
911 _bt_parallel_done(scan);
912 return false;
913 }
914
915 /*
916 * If this is a parallel scan, we must seize the scan. _bt_readfirstpage
917 * will likely release the parallel scan later on.
918 */
919 if (scan->parallel_scan != NULL &&
920 !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
921 return false;
922
923 /*
924 * Initialize the scan's arrays (if any) for the current scan direction
925 * (except when they were already set to later values as part of
926 * scheduling the primitive index scan that is now underway)
927 */
928 if (so->numArrayKeys && !so->needPrimScan)
929 _bt_start_array_keys(scan, dir);
930
931 if (blkno != InvalidBlockNumber)
932 {
933 /*
934 * We anticipated calling _bt_search, but another worker beat us to it.
935 * _bt_readnextpage releases the scan for us (not _bt_readfirstpage).
936 */
937 Assert(scan->parallel_scan != NULL);
938 Assert(!so->needPrimScan);
939 Assert(blkno != P_NONE);
940
941 if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
942 return false;
943
944 _bt_returnitem(scan, so);
945 return true;
946 }
947
948 /*
949 * Count an indexscan for stats, now that we know that we'll call
950 * _bt_search/_bt_endpoint below
951 */
952 pgstat_count_index_scan(rel);
953
954 /*----------
955 * Examine the scan keys to discover where we need to start the scan.
956 *
957 * We want to identify the keys that can be used as starting boundaries;
958 * these are =, >, or >= keys for a forward scan or =, <, <= keys for
959 * a backwards scan. We can use keys for multiple attributes so long as
960 * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
961 * a > or < boundary or find an attribute with no boundary (which can be
962 * thought of as the same as "> -infinity"), we can't use keys for any
963 * attributes to its right, because it would break our simplistic notion
964 * of what initial positioning strategy to use.
965 *
966 * When the scan keys include cross-type operators, _bt_preprocess_keys
967 * may not be able to eliminate redundant keys; in such cases we will
968 * arbitrarily pick a usable one for each attribute. This is correct
969 * but possibly not optimal behavior. (For example, with keys like
970 * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
971 * x=5 would be more efficient.) Since the situation only arises given
972 * a poorly-worded query plus an incomplete opfamily, live with it.
973 *
974 * When both equality and inequality keys appear for a single attribute
975 * (again, only possible when cross-type operators appear), we *must*
976 * select one of the equality keys for the starting point, because
977 * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
978 * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
979 * start at x=4, we will fail and stop before reaching x=10. If multiple
980 * equality quals survive preprocessing, however, it doesn't matter which
981 * one we use --- by definition, they are either redundant or
982 * contradictory.
983 *
984 * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
985 * If the index stores nulls at the end of the index we'll be starting
986 * from, and we have no boundary key for the column (which means the key
987 * we deduced NOT NULL from is an inequality key that constrains the other
988 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
989 * use as a boundary key. If we didn't do this, we might find ourselves
990 * traversing a lot of null entries at the start of the scan.
991 *
992 * In this loop, row-comparison keys are treated the same as keys on their
993 * first (leftmost) columns. We'll add on lower-order columns of the row
994 * comparison below, if possible.
995 *
996 * The selected scan keys (at most one per index column) are remembered by
997 * storing their addresses into the local startKeys[] array.
998 *
999 * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
1000 * the next primitive index scan (for scans with array keys) based in part
1001 * on an understanding of how it'll enable us to reposition the scan.
1002 * They're directly aware of how we'll sometimes cons up an explicit
1003 * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a
1004 * symmetric "deduce NOT NULL" rule of their own. This allows top-level
1005 * scans to skip large groups of NULLs through repeated deductions about
1006 * key strictness (for a required inequality key) and whether NULLs in the
1007 * key's index column are stored last or first (relative to non-NULLs).
1008 * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
1009 * need to be kept in sync.
1010 *----------
1011 */
1012 strat_total = BTEqualStrategyNumber;
1013 if (so->numberOfKeys > 0)
1014 {
1015 AttrNumber curattr;
1016 ScanKey chosen;
1017 ScanKey impliesNN;
1018 ScanKey cur;
1019
1020 /*
1021 * chosen is the so-far-chosen key for the current attribute, if any.
1022 * We don't cast the decision in stone until we reach keys for the
1023 * next attribute.
1024 */
1025 cur = so->keyData;
1026 curattr = 1;
1027 chosen = NULL;
1028 /* Also remember any scankey that implies a NOT NULL constraint */
1029 impliesNN = NULL;
1030
1031 /*
1032 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
1033 * pass to handle after-last-key processing. Actual exit from the
1034 * loop is at one of the "break" statements below.
1035 */
1036 for (int i = 0;; cur++, i++)
1037 {
1038 if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1039 {
1040 /*
1041 * Done looking at keys for curattr. If we didn't find a
1042 * usable boundary key, see if we can deduce a NOT NULL key.
1043 */
1044 if (chosen == NULL && impliesNN != NULL &&
1045 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1046 ScanDirectionIsBackward(dir) :
1047 ScanDirectionIsForward(dir)))
1048 {
1049 /* Yes, so build the key in notnullkeys[keysz] */
1050 chosen = &notnullkeys[keysz];
1051 ScanKeyEntryInitialize(chosen,
1052 (SK_SEARCHNOTNULL | SK_ISNULL |
1053 (impliesNN->sk_flags &
1054 (SK_BT_DESC | SK_BT_NULLS_FIRST))),
1055 curattr,
1056 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1057 BTGreaterStrategyNumber :
1058 BTLessStrategyNumber),
1059 InvalidOid,
1060 InvalidOid,
1061 InvalidOid,
1062 (Datum) 0);
1063 }
1064
1065 /*
1066 * If we still didn't find a usable boundary key, quit; else
1067 * save the boundary key pointer in startKeys.
1068 */
1069 if (chosen == NULL)
1070 break;
1071 startKeys[keysz++] = chosen;
1072
1073 /*
1074 * We can only consider adding more boundary keys when the one
1075 * that we just chose to add uses either the = or >= strategy
1076 * (during backwards scans we can only do so when the key that
1077 * we just added to startKeys[] uses the = or <= strategy)
1078 */
1079 strat_total = chosen->sk_strategy;
1080 if (strat_total == BTGreaterStrategyNumber ||
1081 strat_total == BTLessStrategyNumber)
1082 break;
1083
1084 /*
1085 * Done if that was the last attribute, or if next key is not
1086 * in sequence (implying no boundary key is available for the
1087 * next attribute).
1088 */
1089 if (i >= so->numberOfKeys ||
1090 cur->sk_attno != curattr + 1)
1091 break;
1092
1093 /*
1094 * Reset for next attr.
1095 */
1096 curattr = cur->sk_attno;
1097 chosen = NULL;
1098 impliesNN = NULL;
1099 }
1100
1101 /*
1102 * Can we use this key as a starting boundary for this attr?
1103 *
1104 * If not, does it imply a NOT NULL constraint? (Because
1105 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1106 * *any* inequality key works for that; we need not test.)
1107 */
1108 switch (cur->sk_strategy)
1109 {
1110 case BTLessStrategyNumber:
1111 case BTLessEqualStrategyNumber:
1112 if (chosen == NULL)
1113 {
1114 if (ScanDirectionIsBackward(dir))
1115 chosen = cur;
1116 else
1117 impliesNN = cur;
1118 }
1119 break;
1120 case BTEqualStrategyNumber:
1121 /* override any non-equality choice */
1122 chosen = cur;
1123 break;
1124 case BTGreaterEqualStrategyNumber:
1125 case BTGreaterStrategyNumber:
1126 if (chosen == NULL)
1127 {
1128 if (ScanDirectionIsForward(dir))
1129 chosen = cur;
1130 else
1131 impliesNN = cur;
1132 }
1133 break;
1134 }
1135 }
1136 }
1137
1138 /*
1139 * If we found no usable boundary keys, we have to start from one end of
1140 * the tree. Walk down that edge to the first or last key, and scan from
1141 * there.
1142 *
1143 * Note: calls _bt_readfirstpage for us, which releases the parallel scan.
1144 */
1145 if (keysz == 0)
1146 return _bt_endpoint(scan, dir);
1147
1148 /*
1149 * We want to start the scan somewhere within the index. Set up an
1150 * insertion scankey we can use to search for the boundary point we
1151 * identified above. The insertion scankey is built using the keys
1152 * identified by startKeys[]. (Remaining insertion scankey fields are
1153 * initialized after initial-positioning scan keys are finalized.)
1154 */
1155 Assert(keysz <= INDEX_MAX_KEYS);
1156 for (int i = 0; i < keysz; i++)
1157 {
1158 ScanKey cur = startKeys[i];
1159
1160 Assert(cur->sk_attno == i + 1);
1161
1162 if (cur->sk_flags & SK_ROW_HEADER)
1163 {
1164 /*
1165 * Row comparison header: look to the first row member instead
1166 */
1167 ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1168
1169 /*
1170 * Cannot be a NULL in the first row member: _bt_preprocess_keys
1171 * would've marked the qual as unsatisfiable, preventing us from
1172 * ever getting this far
1173 */
1174 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1175 Assert(subkey->sk_attno == cur->sk_attno);
1176 Assert(!(subkey->sk_flags & SK_ISNULL));
1177
1178 /*
1179 * The member scankeys are already in insertion format (ie, they
1180 * have sk_func = 3-way-comparison function)
1181 */
1182 memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1183
1184 /*
1185 * If the row comparison is the last positioning key we accepted,
1186 * try to add additional keys from the lower-order row members.
1187 * (If we accepted independent conditions on additional index
1188 * columns, we use those instead --- doesn't seem worth trying to
1189 * determine which is more restrictive.) Note that this is OK
1190 * even if the row comparison is of ">" or "<" type, because the
1191 * condition applied to all but the last row member is effectively
1192 * ">=" or "<=", and so the extra keys don't break the positioning
1193 * scheme. But, by the same token, if we aren't able to use all
1194 * the row members, then the part of the row comparison that we
1195 * did use has to be treated as just a ">=" or "<=" condition, and
1196 * so we'd better adjust strat_total accordingly.
1197 */
1198 if (i == keysz - 1)
1199 {
1200 bool used_all_subkeys = false;
1201
1202 Assert(!(subkey->sk_flags & SK_ROW_END));
1203 for (;;)
1204 {
1205 subkey++;
1206 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1207 if (subkey->sk_attno != keysz + 1)
1208 break; /* out-of-sequence, can't use it */
1209 if (subkey->sk_strategy != cur->sk_strategy)
1210 break; /* wrong direction, can't use it */
1211 if (subkey->sk_flags & SK_ISNULL)
1212 break; /* can't use null keys */
1213 Assert(keysz < INDEX_MAX_KEYS);
1214 memcpy(inskey.scankeys + keysz, subkey,
1215 sizeof(ScanKeyData));
1216 keysz++;
1217 if (subkey->sk_flags & SK_ROW_END)
1218 {
1219 used_all_subkeys = true;
1220 break;
1221 }
1222 }
1223 if (!used_all_subkeys)
1224 {
1225 switch (strat_total)
1226 {
1227 case BTLessStrategyNumber:
1228 strat_total = BTLessEqualStrategyNumber;
1229 break;
1230 case BTGreaterStrategyNumber:
1231 strat_total = BTGreaterEqualStrategyNumber;
1232 break;
1233 }
1234 }
1235 break; /* done with outer loop */
1236 }
1237 }
1238 else
1239 {
1240 /*
1241 * Ordinary comparison key. Transform the search-style scan key
1242 * to an insertion scan key by replacing the sk_func with the
1243 * appropriate btree comparison function.
1244 *
1245 * If scankey operator is not a cross-type comparison, we can use
1246 * the cached comparison function; otherwise gotta look it up in
1247 * the catalogs. (That can't lead to infinite recursion, since no
1248 * indexscan initiated by syscache lookup will use cross-data-type
1249 * operators.)
1250 *
1251 * We support the convention that sk_subtype == InvalidOid means
1252 * the opclass input type; this is a hack to simplify life for
1253 * ScanKeyInit().
1254 */
1255 if (cur->sk_subtype == rel->rd_opcintype[i] ||
1256 cur->sk_subtype == InvalidOid)
1257 {
1258 FmgrInfo *procinfo;
1259
1260 procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1261 ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1262 cur->sk_flags,
1263 cur->sk_attno,
1264 InvalidStrategy,
1265 cur->sk_subtype,
1266 cur->sk_collation,
1267 procinfo,
1268 cur->sk_argument);
1269 }
1270 else
1271 {
1272 RegProcedure cmp_proc;
1273
1274 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1275 rel->rd_opcintype[i],
1276 cur->sk_subtype,
1277 BTORDER_PROC);
1278 if (!RegProcedureIsValid(cmp_proc))
1279 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1280 BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1281 cur->sk_attno, RelationGetRelationName(rel));
1282 ScanKeyEntryInitialize(inskey.scankeys + i,
1283 cur->sk_flags,
1284 cur->sk_attno,
1285 InvalidStrategy,
1286 cur->sk_subtype,
1287 cur->sk_collation,
1288 cmp_proc,
1289 cur->sk_argument);
1290 }
1291 }
1292 }
1293
1294 /*----------
1295 * Examine the selected initial-positioning strategy to determine exactly
1296 * where we need to start the scan, and set flag variables to control the
1297 * initial descent by _bt_search (and our _bt_binsrch call for the leaf
1298 * page _bt_search returns).
1299 *----------
1300 */
1301 _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1302 inskey.anynullkeys = false; /* unused */
1303 inskey.scantid = NULL;
1304 inskey.keysz = keysz;
1305 switch (strat_total)
1306 {
1307 case BTLessStrategyNumber:
1308
1309 inskey.nextkey = false;
1310 inskey.backward = true;
1311 break;
1312
1313 case BTLessEqualStrategyNumber:
1314
1315 inskey.nextkey = true;
1316 inskey.backward = true;
1317 break;
1318
1319 case BTEqualStrategyNumber:
1320
1321 /*
1322 * If a backward scan was specified, need to start with last equal
1323 * item not first one.
1324 */
1325 if (ScanDirectionIsBackward(dir))
1326 {
1327 /*
1328 * This is the same as the <= strategy
1329 */
1330 inskey.nextkey = true;
1331 inskey.backward = true;
1332 }
1333 else
1334 {
1335 /*
1336 * This is the same as the >= strategy
1337 */
1338 inskey.nextkey = false;
1339 inskey.backward = false;
1340 }
1341 break;
1342
1343 case BTGreaterEqualStrategyNumber:
1344
1345 /*
1346 * Find first item >= scankey
1347 */
1348 inskey.nextkey = false;
1349 inskey.backward = false;
1350 break;
1351
1352 case BTGreaterStrategyNumber:
1353
1354 /*
1355 * Find first item > scankey
1356 */
1357 inskey.nextkey = true;
1358 inskey.backward = false;
1359 break;
1360
1361 default:
1362 /* can't get here, but keep compiler quiet */
1363 elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1364 return false;
1365 }
1366
1367 /*
1368 * Use the manufactured insertion scan key to descend the tree and
1369 * position ourselves on the target leaf page.
1370 */
1371 Assert(ScanDirectionIsBackward(dir) == inskey.backward);
1372 stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1373
1374 /* don't need to keep the stack around... */
1375 _bt_freestack(stack);
1376
1377 if (!BufferIsValid(so->currPos.buf))
1378 {
1379 /*
1380 * We only get here if the index is completely empty. Lock relation
1381 * because nothing finer to lock exists. Without a buffer lock, it's
1382 * possible for another transaction to insert data between
1383 * _bt_search() and PredicateLockRelation(). We have to try again
1384 * after taking the relation-level predicate lock, to close a narrow
1385 * window where we wouldn't scan concurrently inserted tuples, but the
1386 * writer wouldn't see our predicate lock.
1387 */
1388 if (IsolationIsSerializable())
1389 {
1390 PredicateLockRelation(rel, scan->xs_snapshot);
1391 stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1392 _bt_freestack(stack);
1393 }
1394
1395 if (!BufferIsValid(so->currPos.buf))
1396 {
1397 Assert(!so->needPrimScan);
1398 _bt_parallel_done(scan);
1399 return false;
1400 }
1401 }
1402
1403 /* position to the precise item on the page */
1404 offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);
1405
1406 /*
1407 * Now load data from the first page of the scan (usually the page
1408 * currently in so->currPos.buf).
1409 *
1410 * If inskey.nextkey = false and inskey.backward = false, offnum is
1411 * positioned at the first non-pivot tuple >= inskey.scankeys.
1412 *
1413 * If inskey.nextkey = false and inskey.backward = true, offnum is
1414 * positioned at the last non-pivot tuple < inskey.scankeys.
1415 *
1416 * If inskey.nextkey = true and inskey.backward = false, offnum is
1417 * positioned at the first non-pivot tuple > inskey.scankeys.
1418 *
1419 * If inskey.nextkey = true and inskey.backward = true, offnum is
1420 * positioned at the last non-pivot tuple <= inskey.scankeys.
1421 *
1422 * It's possible that _bt_binsrch returned an offnum that is out of bounds
1423 * for the page. For example, when inskey is both < the leaf page's high
1424 * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
1425 */
1426 if (!_bt_readfirstpage(scan, offnum, dir))
1427 return false;
1428
1429 _bt_returnitem(scan, so);
1430 return true;
1431}
int16 AttrNumber
Definition: attnum.h:21
#define RegProcedureIsValid(p)
Definition: c.h:734
regproc RegProcedure
Definition: c.h:607
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:862
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition: nbtpage.c:739
void _bt_preprocess_keys(IndexScanDesc scan)
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
Definition: nbtree.c:605
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:774
#define BTORDER_PROC
Definition: nbtree.h:712
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:1015
#define P_NONE
Definition: nbtree.h:212
#define BT_READ
Definition: nbtree.h:724
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
Definition: nbtsearch.c:2200
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:343
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2534
static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
Definition: nbtsearch.c:2114
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition: nbtsearch.c:102
static void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
Definition: nbtsearch.c:1985
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
Definition: nbtutils.c:413
#define INDEX_MAX_KEYS
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:683
#define InvalidOid
Definition: postgres_ext.h:37
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2566
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:50
#define SK_ROW_HEADER
Definition: skey.h:117
#define SK_ROW_MEMBER
Definition: skey.h:118
#define SK_SEARCHNOTNULL
Definition: skey.h:122
#define SK_ROW_END
Definition: skey.h:119
ScanKeyData * ScanKey
Definition: skey.h:75
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
#define InvalidStrategy
Definition: stratnum.h:24
#define BTLessStrategyNumber
Definition: stratnum.h:29
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
Buffer buf
Definition: nbtree.h:958
Definition: fmgr.h:57
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:183
struct SnapshotData * xs_snapshot
Definition: relscan.h:136
#define IsolationIsSerializable()
Definition: xact.h:52

References _bt_binsrch(), _bt_endpoint(), _bt_freestack(), _bt_metaversion(), _bt_parallel_done(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readfirstpage(), _bt_readnextpage(), _bt_returnitem(), _bt_search(), _bt_start_array_keys(), Assert, BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosIsValid, BTScanPosData::buf, BufferIsValid(), cur, BTScanOpaqueData::currPos, DatumGetPointer(), elog, ERROR, get_opfamily_proc(), i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, InvalidBlockNumber, InvalidOid, InvalidStrategy, IsolationIsSerializable, BTScanOpaqueData::keyData, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numberOfKeys, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, and IndexScanDescData::xs_snapshot.

Referenced by btgetbitmap(), and btgettuple().
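
In the callers above, _bt_first() does the initial positioning and _bt_next() advances the scan; btgettuple() is essentially this loop taken one step at a time. The counting wrapper below is hypothetical.

    #include "postgres.h"
    #include "access/nbtree.h"
    #include "access/relscan.h"

    /* Hypothetical: count everything a forward scan would return */
    static int64
    count_scan_matches(IndexScanDesc scan)
    {
        int64 ntids = 0;
        bool  res = _bt_first(scan, ForwardScanDirection);

        while (res)
        {
            ntids++;            /* current match's TID is in scan->xs_heaptid */
            res = _bt_next(scan, ForwardScanDirection);
        }
        return ntids;
    }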

◆ _bt_form_posting()

IndexTuple _bt_form_posting ( IndexTuple  base,
ItemPointer  htids,
int  nhtids 
)

Definition at line 864 of file nbtdedup.c.

865{
866 uint32 keysize,
867 newsize;
868 IndexTuple itup;
869
870 if (BTreeTupleIsPosting(base))
871 keysize = BTreeTupleGetPostingOffset(base);
872 else
873 keysize = IndexTupleSize(base);
874
875 Assert(!BTreeTupleIsPivot(base));
876 Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
877 Assert(keysize == MAXALIGN(keysize));
878
879 /* Determine final size of new tuple */
880 if (nhtids > 1)
881 newsize = MAXALIGN(keysize +
882 nhtids * sizeof(ItemPointerData));
883 else
884 newsize = keysize;
885
886 Assert(newsize <= INDEX_SIZE_MASK);
887 Assert(newsize == MAXALIGN(newsize));
888
889 /* Allocate memory using palloc0() (matches index_form_tuple()) */
890 itup = palloc0(newsize);
891 memcpy(itup, base, keysize);
892 itup->t_info &= ~INDEX_SIZE_MASK;
893 itup->t_info |= newsize;
894 if (nhtids > 1)
895 {
896 /* Form posting list tuple */
897 BTreeTupleSetPosting(itup, nhtids, keysize);
898 memcpy(BTreeTupleGetPosting(itup), htids,
899 sizeof(ItemPointerData) * nhtids);
900 Assert(_bt_posting_valid(itup));
901 }
902 else
903 {
904 /* Form standard non-pivot tuple */
905 itup->t_info &= ~INDEX_ALT_TID_MASK;
906 ItemPointerCopy(htids, &itup->t_tid);
907 Assert(ItemPointerIsValid(&itup->t_tid));
908 }
909
910 return itup;
911}
#define PG_UINT16_MAX
Definition: c.h:544
static void ItemPointerCopy(const ItemPointerData *fromPointer, ItemPointerData *toPointer)
Definition: itemptr.h:172
static bool ItemPointerIsValid(const ItemPointerData *pointer)
Definition: itemptr.h:83
void * palloc0(Size size)
Definition: mcxt.c:1347
static void BTreeTupleSetPosting(IndexTuple itup, uint16 nhtids, int postingoffset)
Definition: nbtree.h:504
unsigned short t_info
Definition: itup.h:49

References Assert, BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTreeTupleSetPosting(), INDEX_SIZE_MASK, IndexTupleSize(), ItemPointerCopy(), ItemPointerIsValid(), MAXALIGN, palloc0(), PG_UINT16_MAX, IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_dedup_finish_pending(), _bt_sort_dedup_finish_pending(), and bt_posting_plain_tuple().
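
Per the size arithmetic above, a posting list occupies MAXALIGN(keysize + nhtids * sizeof(ItemPointerData)), and nhtids == 1 yields a plain non-pivot tuple instead. A minimal usage sketch with a hypothetical wrapper name; callers must supply heap TIDs already sorted in ascending order, as deduplication does.

    /* Hypothetical: collapse nhtids duplicates of 'base' into one tuple */
    static IndexTuple
    make_posting(IndexTuple base, ItemPointerData *htids, int nhtids)
    {
        /* base supplies the key columns; htids must be sorted ascending */
        return _bt_form_posting(base, htids, nhtids);
    }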

◆ _bt_freestack()

void _bt_freestack ( BTStack  stack)

Definition at line 172 of file nbtutils.c.

173{
174 BTStack ostack;
175
176 while (stack != NULL)
177 {
178 ostack = stack;
179 stack = stack->bts_parent;
180 pfree(ostack);
181 }
182}
struct BTStackData * bts_parent
Definition: nbtree.h:741

References BTStackData::bts_parent, and pfree().

Referenced by _bt_doinsert(), _bt_first(), and bt_rootdescend().

◆ _bt_get_endpoint()

Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost 
)

Definition at line 2451 of file nbtsearch.c.

2452{
2453 Buffer buf;
2454 Page page;
2455 BTPageOpaque opaque;
2456 OffsetNumber offnum;
2457 BlockNumber blkno;
2458 IndexTuple itup;
2459
2460 /*
2461 * If we are looking for a leaf page, okay to descend from fast root;
2462 * otherwise better descend from true root. (There is no point in being
2463 * smarter about intermediate levels.)
2464 */
2465 if (level == 0)
2466 buf = _bt_getroot(rel, NULL, BT_READ);
2467 else
2468 buf = _bt_gettrueroot(rel);
2469
2470 if (!BufferIsValid(buf))
2471 return InvalidBuffer;
2472
2473 page = BufferGetPage(buf);
2474 opaque = BTPageGetOpaque(page);
2475
2476 for (;;)
2477 {
2478 /*
2479 * If we landed on a deleted page, step right to find a live page
2480 * (there must be one). Also, if we want the rightmost page, step
2481 * right if needed to get to it (this could happen if the page split
2482 * since we obtained a pointer to it).
2483 */
2484 while (P_IGNORE(opaque) ||
2485 (rightmost && !P_RIGHTMOST(opaque)))
2486 {
2487 blkno = opaque->btpo_next;
2488 if (blkno == P_NONE)
2489 elog(ERROR, "fell off the end of index \"%s\"",
2490 RelationGetRelationName(rel));
2491 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2492 page = BufferGetPage(buf);
2493 opaque = BTPageGetOpaque(page);
2494 }
2495
2496 /* Done? */
2497 if (opaque->btpo_level == level)
2498 break;
2499 if (opaque->btpo_level < level)
2500 ereport(ERROR,
2501 (errcode(ERRCODE_INDEX_CORRUPTED),
2502 errmsg_internal("btree level %u not found in index \"%s\"",
2503 level, RelationGetRelationName(rel))));
2504
2505 /* Descend to leftmost or rightmost child page */
2506 if (rightmost)
2507 offnum = PageGetMaxOffsetNumber(page);
2508 else
2509 offnum = P_FIRSTDATAKEY(opaque);
2510
2511 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2512 blkno = BTreeTupleGetDownLink(itup);
2513
2514 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2515 page = BufferGetPage(buf);
2516 opaque = BTPageGetOpaque(page);
2517 }
2518
2519 return buf;
2520}
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1003
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:580
Buffer _bt_getroot(Relation rel, Relation heaprel, int access)
Definition: nbtpage.c:344
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:556
uint32 btpo_level
Definition: nbtree.h:66

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), buf, BufferGetPage(), BufferIsValid(), elog, ereport, errcode(), errmsg_internal(), ERROR, InvalidBuffer, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and RelationGetRelationName.

Referenced by _bt_endpoint(), and _bt_insert_parent().
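
A sketch of the level-0 (leaf) case, which is how _bt_endpoint() begins a full index scan; the helper name is hypothetical.

    /* Hypothetical: block number of the leftmost leaf page, if any */
    static BlockNumber
    leftmost_leaf_blkno(Relation rel)
    {
        Buffer      buf = _bt_get_endpoint(rel, 0, false);
        BlockNumber blkno;

        if (!BufferIsValid(buf))
            return InvalidBlockNumber;  /* index is completely empty */

        blkno = BufferGetBlockNumber(buf);
        _bt_relbuf(rel, buf);           /* drop the read lock and pin */
        return blkno;
    }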

◆ _bt_getbuf()

Buffer _bt_getbuf ( Relation  rel,
BlockNumber  blkno,
int  access 
)

Definition at line 845 of file nbtpage.c.

846{
847 Buffer buf;
848
849 Assert(BlockNumberIsValid(blkno));
850
851 /* Read an existing block of the relation */
852 buf = ReadBuffer(rel, blkno);
853 _bt_lockbuf(rel, buf, access);
854 _bt_checkpage(rel, buf);
855
856 return buf;
857}
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:797
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1039
short access
Definition: preproc-type.c:36

References _bt_checkpage(), _bt_lockbuf(), Assert, BlockNumberIsValid(), buf, and ReadBuffer().

Referenced by _bt_finish_split(), _bt_getroot(), _bt_getrootheight(), _bt_getstackbuf(), _bt_gettrueroot(), _bt_insertonpg(), _bt_killitems(), _bt_leftsib_splitflag(), _bt_lock_and_validate_left(), _bt_metaversion(), _bt_moveright(), _bt_newlevel(), _bt_pagedel(), _bt_readnextpage(), _bt_rightsib_halfdeadflag(), _bt_set_cleanup_info(), _bt_split(), _bt_unlink_halfdead_page(), and _bt_vacuum_needs_cleanup().
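
Every caller in the list above follows the same pin, lock, use, release cycle, with _bt_relbuf() as the matching release. A sketch with a hypothetical helper name:

    /* Hypothetical: test whether a known-existing block is a leaf page */
    static bool
    block_is_leaf(Relation rel, BlockNumber blkno)
    {
        Buffer       buf;
        BTPageOpaque opaque;
        bool         isleaf;

        buf = _bt_getbuf(rel, blkno, BT_READ);  /* pin, lock, verify page */
        opaque = BTPageGetOpaque(BufferGetPage(buf));
        isleaf = P_ISLEAF(opaque);
        _bt_relbuf(rel, buf);                   /* unlock and unpin */
        return isleaf;
    }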

◆ _bt_getroot()

Buffer _bt_getroot ( Relation  rel,
Relation  heaprel,
int  access 
)

Definition at line 344 of file nbtpage.c.

345{
346 Buffer metabuf;
347 Buffer rootbuf;
348 Page rootpage;
349 BTPageOpaque rootopaque;
350 BlockNumber rootblkno;
351 uint32 rootlevel;
352 BTMetaPageData *metad;
353
354 Assert(access == BT_READ || heaprel != NULL);
355
356 /*
357 * Try to use previously-cached metapage data to find the root. This
358 * normally saves one buffer access per index search, which is a very
359 * helpful savings in bufmgr traffic and hence contention.
360 */
361 if (rel->rd_amcache != NULL)
362 {
363 metad = (BTMetaPageData *) rel->rd_amcache;
364 /* We shouldn't have cached it if any of these fail */
365 Assert(metad->btm_magic == BTREE_MAGIC);
366 Assert(metad->btm_version >= BTREE_MIN_VERSION);
367 Assert(metad->btm_version <= BTREE_VERSION);
368 Assert(!metad->btm_allequalimage ||
369 metad->btm_version > BTREE_NOVAC_VERSION);
370 Assert(metad->btm_root != P_NONE);
371
372 rootblkno = metad->btm_fastroot;
373 Assert(rootblkno != P_NONE);
374 rootlevel = metad->btm_fastlevel;
375
376 rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
377 rootpage = BufferGetPage(rootbuf);
378 rootopaque = BTPageGetOpaque(rootpage);
379
380 /*
381 * Since the cache might be stale, we check the page more carefully
382 * here than normal. We *must* check that it's not deleted. If it's
383 * not alone on its level, then we reject too --- this may be overly
384 * paranoid but better safe than sorry. Note we don't check P_ISROOT,
385 * because that's not set in a "fast root".
386 */
387 if (!P_IGNORE(rootopaque) &&
388 rootopaque->btpo_level == rootlevel &&
389 P_LEFTMOST(rootopaque) &&
390 P_RIGHTMOST(rootopaque))
391 {
392 /* OK, accept cached page as the root */
393 return rootbuf;
394 }
395 _bt_relbuf(rel, rootbuf);
396 /* Cache is stale, throw it away */
397 if (rel->rd_amcache)
398 pfree(rel->rd_amcache);
399 rel->rd_amcache = NULL;
400 }
401
402 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
403 metad = _bt_getmeta(rel, metabuf);
404
405 /* if no root page initialized yet, do it */
406 if (metad->btm_root == P_NONE)
407 {
408 Page metapg;
409
410 /* If access = BT_READ, caller doesn't want us to create root yet */
411 if (access == BT_READ)
412 {
413 _bt_relbuf(rel, metabuf);
414 return InvalidBuffer;
415 }
416
417 /* trade in our read lock for a write lock */
418 _bt_unlockbuf(rel, metabuf);
419 _bt_lockbuf(rel, metabuf, BT_WRITE);
420
421 /*
422 * Race condition: if someone else initialized the metadata between
423 * the time we released the read lock and acquired the write lock, we
424 * must avoid doing it again.
425 */
426 if (metad->btm_root != P_NONE)
427 {
428 /*
429 * Metadata initialized by someone else. In order to guarantee no
430 * deadlocks, we have to release the metadata page and start all
431 * over again. (Is that really true? But it's hardly worth trying
432 * to optimize this case.)
433 */
434 _bt_relbuf(rel, metabuf);
435 return _bt_getroot(rel, heaprel, access);
436 }
437
438 /*
439 * Get, initialize, write, and leave a lock of the appropriate type on
440 * the new root page. Since this is the first page in the tree, it's
441 * a leaf as well as the root.
442 */
443 rootbuf = _bt_allocbuf(rel, heaprel);
444 rootblkno = BufferGetBlockNumber(rootbuf);
445 rootpage = BufferGetPage(rootbuf);
446 rootopaque = BTPageGetOpaque(rootpage);
447 rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
448 rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
449 rootopaque->btpo_level = 0;
450 rootopaque->btpo_cycleid = 0;
451 /* Get raw page pointer for metapage */
452 metapg = BufferGetPage(metabuf);
453
454 /* NO ELOG(ERROR) till meta is updated */
455 START_CRIT_SECTION();
456
457 /* upgrade metapage if needed */
458 if (metad->btm_version < BTREE_NOVAC_VERSION)
459 _bt_upgrademetapage(metapg);
460
461 metad->btm_root = rootblkno;
462 metad->btm_level = 0;
463 metad->btm_fastroot = rootblkno;
464 metad->btm_fastlevel = 0;
465 metad->btm_last_cleanup_num_delpages = 0;
466 metad->btm_last_cleanup_num_heap_tuples = -1.0;
467
468 MarkBufferDirty(rootbuf);
469 MarkBufferDirty(metabuf);
470
471 /* XLOG stuff */
472 if (RelationNeedsWAL(rel))
473 {
474 xl_btree_newroot xlrec;
475 XLogRecPtr recptr;
476 xl_btree_metadata md;
477
478 XLogBeginInsert();
479 XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
480 XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
481
482 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
483 md.version = metad->btm_version;
484 md.root = rootblkno;
485 md.level = 0;
486 md.fastroot = rootblkno;
487 md.fastlevel = 0;
488 md.last_cleanup_num_delpages = 0;
489 md.allequalimage = metad->btm_allequalimage;
490
491 XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
492
493 xlrec.rootblk = rootblkno;
494 xlrec.level = 0;
495
496 XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
497
498 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
499
500 PageSetLSN(rootpage, recptr);
501 PageSetLSN(metapg, recptr);
502 }
503
504 END_CRIT_SECTION();
505
506 /*
507 * swap root write lock for read lock. There is no danger of anyone
508 * else accessing the new root page while it's unlocked, since no one
509 * else knows where it is yet.
510 */
511 _bt_unlockbuf(rel, rootbuf);
512 _bt_lockbuf(rel, rootbuf, BT_READ);
513
514 /* okay, metadata is correct, release lock on it without caching */
515 _bt_relbuf(rel, metabuf);
516 }
517 else
518 {
519 rootblkno = metad->btm_fastroot;
520 Assert(rootblkno != P_NONE);
521 rootlevel = metad->btm_fastlevel;
522
523 /*
524 * Cache the metapage data for next time
525 */
526 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
527 sizeof(BTMetaPageData));
528 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
529
530 /*
531 * We are done with the metapage; arrange to release it via first
532 * _bt_relandgetbuf call
533 */
534 rootbuf = metabuf;
535
536 for (;;)
537 {
538 rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
539 rootpage = BufferGetPage(rootbuf);
540 rootopaque = BTPageGetOpaque(rootpage);
541
542 if (!P_IGNORE(rootopaque))
543 break;
544
545 /* it's dead, Jim. step right one page */
546 if (P_RIGHTMOST(rootopaque))
547 elog(ERROR, "no live root page found in index \"%s\"",
549 rootblkno = rootopaque->btpo_next;
550 }
551
552 if (rootopaque->btpo_level != rootlevel)
553 elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
554 rootblkno, RelationGetRelationName(rel),
555 rootopaque->btpo_level, rootlevel);
556 }
557
558 /*
559 * By here, we have a pin and read lock on the root page, and no lock set
560 * on the metadata page. Return the root page's buffer.
561 */
562 return rootbuf;
563}
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1181
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:107
Buffer _bt_allocbuf(Relation rel, Relation heaprel)
Definition: nbtpage.c:869
static BTMetaPageData * _bt_getmeta(Relation rel, Buffer metabuf)
Definition: nbtpage.c:142
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1070
#define BTREE_MIN_VERSION
Definition: nbtree.h:151
#define BTP_LEAF
Definition: nbtree.h:76
#define BTREE_MAGIC
Definition: nbtree.h:149
#define BTP_ROOT
Definition: nbtree.h:77
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:347
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:37
uint32 btm_last_cleanup_num_delpages
Definition: nbtree.h:114
uint32 btm_level
Definition: nbtree.h:108
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:116
BlockNumber btm_fastroot
Definition: nbtree.h:109
uint32 btm_version
Definition: nbtree.h:106
uint32 btm_magic
Definition: nbtree.h:105
bool btm_allequalimage
Definition: nbtree.h:118
uint32 btm_fastlevel
Definition: nbtree.h:110
BlockNumber btpo_prev
Definition: nbtree.h:64
void * rd_amcache
Definition: rel.h:229
MemoryContext rd_indexcxt
Definition: rel.h:204
uint32 level
Definition: nbtxlog.h:50
uint32 version
Definition: nbtxlog.h:48
bool allequalimage
Definition: nbtxlog.h:54
BlockNumber fastroot
Definition: nbtxlog.h:51
uint32 fastlevel
Definition: nbtxlog.h:52
BlockNumber root
Definition: nbtxlog.h:49
uint32 last_cleanup_num_delpages
Definition: nbtxlog.h:53
uint32 level
Definition: nbtxlog.h:344
BlockNumber rootblk
Definition: nbtxlog.h:343
#define REGBUF_WILL_INIT
Definition: xloginsert.h:34

References _bt_allocbuf(), _bt_getbuf(), _bt_getmeta(), _bt_getroot(), _bt_lockbuf(), _bt_relandgetbuf(), _bt_relbuf(), _bt_unlockbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert, BT_READ, BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_LEAF, BTP_ROOT, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, BufferGetBlockNumber(), BufferGetPage(), elog, END_CRIT_SECTION, ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, InvalidBuffer, xl_btree_metadata::last_cleanup_num_delpages, xl_btree_metadata::level, xl_btree_newroot::level, MarkBufferDirty(), MemoryContextAlloc(), P_IGNORE, P_LEFTMOST, P_NONE, P_RIGHTMOST, PageSetLSN(), pfree(), RelationData::rd_amcache, RelationData::rd_indexcxt, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, xl_btree_metadata::root, xl_btree_newroot::rootblk, SizeOfBtreeNewroot, START_CRIT_SECTION, xl_btree_metadata::version, XLOG_BTREE_NEWROOT, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_get_endpoint(), _bt_getroot(), and _bt_search().
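
The access-mode asymmetry is the main point for callers: with BT_READ an empty index yields InvalidBuffer rather than a freshly created root. A hypothetical sketch relying on that contract:

    /* Hypothetical: detect an empty index via the read-side contract */
    static bool
    index_is_empty(Relation rel)
    {
        Buffer rootbuf = _bt_getroot(rel, NULL, BT_READ);

        if (!BufferIsValid(rootbuf))
            return true;        /* no root yet: BT_READ never creates one */

        _bt_relbuf(rel, rootbuf);
        return false;
    }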

◆ _bt_getrootheight()

int _bt_getrootheight ( Relation  rel)

Definition at line 675 of file nbtpage.c.

676{
677 BTMetaPageData *metad;
678
679 if (rel->rd_amcache == NULL)
680 {
681 Buffer metabuf;
682
683 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
684 metad = _bt_getmeta(rel, metabuf);
685
686 /*
687 * If there's no root page yet, _bt_getroot() doesn't expect a cache
688 * to be made, so just stop here and report the index height is zero.
689 * (XXX perhaps _bt_getroot() should be changed to allow this case.)
690 */
691 if (metad->btm_root == P_NONE)
692 {
693 _bt_relbuf(rel, metabuf);
694 return 0;
695 }
696
697 /*
698 * Cache the metapage data for next time
699 */
700 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
701 sizeof(BTMetaPageData));
702 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
703 _bt_relbuf(rel, metabuf);
704 }
705
706 /* Get cached page */
707 metad = (BTMetaPageData *) rel->rd_amcache;
708 /* We shouldn't have cached it if any of these fail */
709 Assert(metad->btm_magic == BTREE_MAGIC);
710 Assert(metad->btm_version >= BTREE_MIN_VERSION);
711 Assert(metad->btm_version <= BTREE_VERSION);
712 Assert(!metad->btm_allequalimage ||
713 metad->btm_version > BTREE_NOVAC_VERSION);
714 Assert(metad->btm_fastroot != P_NONE);
715
716 return metad->btm_fastlevel;
717}

References _bt_getbuf(), _bt_getmeta(), _bt_relbuf(), Assert, BT_READ, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, MemoryContextAlloc(), P_NONE, RelationData::rd_amcache, and RelationData::rd_indexcxt.

Referenced by _bt_insertonpg(), and btgettreeheight().
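
The return value is btm_fastlevel, the depth of a descent from the fast root (which is what a search actually pays), and after the first call it comes from rd_amcache with no buffer access. A trivial wrapper sketch, name hypothetical (cf. btgettreeheight()):

    /* Hypothetical AM-level wrapper (cf. btgettreeheight) */
    static int
    my_tree_height(Relation rel)
    {
        return _bt_getrootheight(rel);  /* 0 when the fast root is a leaf */
    }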

◆ _bt_getstackbuf()

Buffer _bt_getstackbuf ( Relation  rel,
Relation  heaprel,
BTStack  stack,
BlockNumber  child 
)

Definition at line 2319 of file nbtinsert.c.

2320{
2321 BlockNumber blkno;
2322 OffsetNumber start;
2323
2324 blkno = stack->bts_blkno;
2325 start = stack->bts_offset;
2326
2327 for (;;)
2328 {
2329 Buffer buf;
2330 Page page;
2331 BTPageOpaque opaque;
2332
2333 buf = _bt_getbuf(rel, blkno, BT_WRITE);
2334 page = BufferGetPage(buf);
2335 opaque = BTPageGetOpaque(page);
2336
2337 Assert(heaprel != NULL);
2338 if (P_INCOMPLETE_SPLIT(opaque))
2339 {
2340 _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
2341 continue;
2342 }
2343
2344 if (!P_IGNORE(opaque))
2345 {
2346 OffsetNumber offnum,
2347 minoff,
2348 maxoff;
2349 ItemId itemid;
2350 IndexTuple item;
2351
2352 minoff = P_FIRSTDATAKEY(opaque);
2353 maxoff = PageGetMaxOffsetNumber(page);
2354
2355 /*
2356 * start = InvalidOffsetNumber means "search the whole page". We
2357 * need this test anyway due to possibility that page has a high
2358 * key now when it didn't before.
2359 */
2360 if (start < minoff)
2361 start = minoff;
2362
2363 /*
2364 * Need this check too, to guard against possibility that page
2365 * split since we visited it originally.
2366 */
2367 if (start > maxoff)
2368 start = OffsetNumberNext(maxoff);
2369
2370 /*
2371 * These loops will check every item on the page --- but in an
2372 * order that's attuned to the probability of where it actually
2373 * is. Scan to the right first, then to the left.
2374 */
2375 for (offnum = start;
2376 offnum <= maxoff;
2377 offnum = OffsetNumberNext(offnum))
2378 {
2379 itemid = PageGetItemId(page, offnum);
2380 item = (IndexTuple) PageGetItem(page, itemid);
2381
2382 if (BTreeTupleGetDownLink(item) == child)
2383 {
2384 /* Return accurate pointer to where link is now */
2385 stack->bts_blkno = blkno;
2386 stack->bts_offset = offnum;
2387 return buf;
2388 }
2389 }
2390
2391 for (offnum = OffsetNumberPrev(start);
2392 offnum >= minoff;
2393 offnum = OffsetNumberPrev(offnum))
2394 {
2395 itemid = PageGetItemId(page, offnum);
2396 item = (IndexTuple) PageGetItem(page, itemid);
2397
2398 if (BTreeTupleGetDownLink(item) == child)
2399 {
2400 /* Return accurate pointer to where link is now */
2401 stack->bts_blkno = blkno;
2402 stack->bts_offset = offnum;
2403 return buf;
2404 }
2405 }
2406 }
2407
2408 /*
2409 * The item we're looking for moved right at least one page.
2410 *
2411 * Lehman and Yao couple/chain locks when moving right here, which we
2412 * can avoid. See nbtree/README.
2413 */
2414 if (P_RIGHTMOST(opaque))
2415 {
2416 _bt_relbuf(rel, buf);
2417 return InvalidBuffer;
2418 }
2419 blkno = opaque->btpo_next;
2420 start = InvalidOffsetNumber;
2421 _bt_relbuf(rel, buf);
2422 }
2423}
void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:2241
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
BlockNumber bts_blkno
Definition: nbtree.h:739
OffsetNumber bts_offset
Definition: nbtree.h:740

References _bt_finish_split(), _bt_getbuf(), _bt_relbuf(), Assert, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, buf, BufferGetPage(), InvalidBuffer, InvalidOffsetNumber, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and start.

Referenced by _bt_insert_parent(), and _bt_lock_subtree_parent().
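
A sketch of the usual calling pattern (cf. _bt_insert_parent(); variable names are illustrative). The caller passes the parent-level stack entry plus the child block whose downlink must be re-found, and treats an InvalidBuffer result as an error, since the downlink should always be re-locatable by moving right:

    Buffer      pbuf;

    /* Re-locate and write-lock the parent page that holds child's downlink */
    pbuf = _bt_getstackbuf(rel, heaprel, stack, child);
    if (pbuf == InvalidBuffer)
        elog(ERROR, "failed to re-find parent key in index \"%s\"",
             RelationGetRelationName(rel));
    /* ... insert the new downlink on pbuf, then _bt_relbuf(rel, pbuf) ... */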

◆ _bt_gettrueroot()

Buffer _bt_gettrueroot ( Relation  rel)

Definition at line 580 of file nbtpage.c.

581{
582 Buffer metabuf;
583 Page metapg;
584 BTPageOpaque metaopaque;
585 Buffer rootbuf;
586 Page rootpage;
587 BTPageOpaque rootopaque;
588 BlockNumber rootblkno;
589 uint32 rootlevel;
590 BTMetaPageData *metad;
591
592 /*
593 * We don't try to use cached metapage data here, since (a) this path is
594 * not performance-critical, and (b) if we are here it suggests our cache
595 * is out-of-date anyway. In light of point (b), it's probably safest to
596 * actively flush any cached metapage info.
597 */
598 if (rel->rd_amcache)
599 pfree(rel->rd_amcache);
600 rel->rd_amcache = NULL;
601
602 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
603 metapg = BufferGetPage(metabuf);
604 metaopaque = BTPageGetOpaque(metapg);
605 metad = BTPageGetMeta(metapg);
606
607 if (!P_ISMETA(metaopaque) ||
608 metad->btm_magic != BTREE_MAGIC)
609 ereport(ERROR,
610 (errcode(ERRCODE_INDEX_CORRUPTED),
611 errmsg("index \"%s\" is not a btree",
612 RelationGetRelationName(rel))));
613
614 if (metad->btm_version < BTREE_MIN_VERSION ||
615 metad->btm_version > BTREE_VERSION)
616 ereport(ERROR,
617 (errcode(ERRCODE_INDEX_CORRUPTED),
618 errmsg("version mismatch in index \"%s\": file version %d, "
619 "current version %d, minimal supported version %d",
620 RelationGetRelationName(rel),
621 metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
622
623 /* if no root page initialized yet, fail */
624 if (metad->btm_root == P_NONE)
625 {
626 _bt_relbuf(rel, metabuf);
627 return InvalidBuffer;
628 }
629
630 rootblkno = metad->btm_root;
631 rootlevel = metad->btm_level;
632
633 /*
634 * We are done with the metapage; arrange to release it via first
635 * _bt_relandgetbuf call
636 */
637 rootbuf = metabuf;
638
639 for (;;)
640 {
641 rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
642 rootpage = BufferGetPage(rootbuf);
643 rootopaque = BTPageGetOpaque(rootpage);
644
645 if (!P_IGNORE(rootopaque))
646 break;
647
648 /* it's dead, Jim. step right one page */
649 if (P_RIGHTMOST(rootopaque))
650 elog(ERROR, "no live root page found in index \"%s\"",
651 RelationGetRelationName(rel));
652 rootblkno = rootopaque->btpo_next;
653 }
654
655 if (rootopaque->btpo_level != rootlevel)
656 elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
657 rootblkno, RelationGetRelationName(rel),
658 rootopaque->btpo_level, rootlevel);
659
660 return rootbuf;
661}
#define P_ISMETA(opaque)
Definition: nbtree.h:223

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_VERSION, BufferGetPage(), elog, ereport, errcode(), errmsg(), ERROR, InvalidBuffer, P_IGNORE, P_ISMETA, P_NONE, P_RIGHTMOST, pfree(), RelationData::rd_amcache, and RelationGetRelationName.

Referenced by _bt_get_endpoint().
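
Unlike _bt_getroot(), this ignores the cached fast root, which can sit below the true root after upper levels have collapsed. A sketch of the choice _bt_get_endpoint() makes when descending the tree:

    Buffer      buf;

    if (level == 0)
        buf = _bt_getroot(rel, NULL, BT_READ);  /* fast root suffices for a leaf */
    else
        buf = _bt_gettrueroot(rel);             /* need the true root's btpo_level */

    if (!BufferIsValid(buf))
        return InvalidBuffer;                   /* no root page: empty index */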

◆ _bt_initmetapage()

void _bt_initmetapage ( Page  page,
BlockNumber  rootbknum,
uint32  level,
bool  allequalimage 
)

Definition at line 67 of file nbtpage.c.

69{
70 BTMetaPageData *metad;
71 BTPageOpaque metaopaque;
72
73 _bt_pageinit(page, BLCKSZ);
74
75 metad = BTPageGetMeta(page);
76 metad->btm_magic = BTREE_MAGIC;
77 metad->btm_version = BTREE_VERSION;
78 metad->btm_root = rootbknum;
79 metad->btm_level = level;
80 metad->btm_fastroot = rootbknum;
81 metad->btm_fastlevel = level;
82 metad->btm_last_cleanup_num_delpages = 0;
83 metad->btm_last_cleanup_num_heap_tuples = -1.0;
84 metad->btm_allequalimage = allequalimage;
85
86 metaopaque = BTPageGetOpaque(page);
87 metaopaque->btpo_flags = BTP_META;
88
89 /*
90 * Set pd_lower just past the end of the metadata. This is essential,
91 * because without doing so, metadata will be lost if xlog.c compresses
92 * the page.
93 */
94 ((PageHeader) page)->pd_lower =
95 ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
96}
PageHeaderData * PageHeader
Definition: bufpage.h:174
#define BTP_META
Definition: nbtree.h:79

References _bt_pageinit(), BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_META, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, BTREE_MAGIC, and BTREE_VERSION.

Referenced by _bt_uppershutdown(), and btbuildempty().
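
A minimal sketch of building the metapage for an empty index (cf. btbuildempty(); the real caller pushes the page through the bulk-write machinery and WAL, omitted here). Passing P_NONE with level 0 records that no root page exists yet:

    Page    metapage = (Page) palloc(BLCKSZ);

    _bt_initmetapage(metapage, P_NONE, 0, allequalimage);
    /* ... write the initialized page out as block BTREE_METAPAGE ... */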

◆ _bt_keep_natts_fast()

int _bt_keep_natts_fast ( Relation  rel,
IndexTuple  lastleft,
IndexTuple  firstright 
)

Definition at line 3032 of file nbtutils.c.

3033{
3034 TupleDesc itupdesc = RelationGetDescr(rel);
3035 int keysz = IndexRelationGetNumberOfKeyAttributes(rel);
3036 int keepnatts;
3037
3038 keepnatts = 1;
3039 for (int attnum = 1; attnum <= keysz; attnum++)
3040 {
3041 Datum datum1,
3042 datum2;
3043 bool isNull1,
3044 isNull2;
3045 CompactAttribute *att;
3046
3047 datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
3048 datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
3049 att = TupleDescCompactAttr(itupdesc, attnum - 1);
3050
3051 if (isNull1 != isNull2)
3052 break;
3053
3054 if (!isNull1 &&
3055 !datum_image_eq(datum1, datum2, att->attbyval, att->attlen))
3056 break;
3057
3058 keepnatts++;
3059 }
3060
3061 return keepnatts;
3062}
bool datum_image_eq(Datum value1, Datum value2, bool typByVal, int typLen)
Definition: datum.c:266
int16 attnum
Definition: pg_attribute.h:74
int16 attlen
Definition: tupdesc.h:70
static CompactAttribute * TupleDescCompactAttr(TupleDesc tupdesc, int i)
Definition: tupdesc.h:168

References CompactAttribute::attbyval, CompactAttribute::attlen, attnum, datum_image_eq(), index_getattr(), IndexRelationGetNumberOfKeyAttributes, RelationGetDescr, and TupleDescCompactAttr().

Referenced by _bt_afternewitemoff(), _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_do_singleval(), _bt_keep_natts(), _bt_load(), _bt_split_penalty(), and _bt_strategy().
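
Because it compares binary images via datum_image_eq() instead of calling opclass comparators, the answer can differ from _bt_keep_natts() for opclasses where opclass-equal datums are not bitwise equal; callers only use it as a cheap heuristic, so that is acceptable. A sketch of the split-penalty use (cf. _bt_split_penalty(); lastleft and firstright stand for the tuples on either side of a candidate split point):

    /*
     * Number of attributes the new high key would have to keep if the
     * page were split here.  A lower penalty means more effective
     * suffix truncation; a penalty of 1 means even the first key
     * column differs across the split point.
     */
    int     penalty = _bt_keep_natts_fast(rel, lastleft, firstright);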

◆ _bt_killitems()

void _bt_killitems ( IndexScanDesc  scan)

Definition at line 2333 of file nbtutils.c.

2334{
2335 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2336 Page page;
2337 BTPageOpaque opaque;
2338 OffsetNumber minoff;
2339 OffsetNumber maxoff;
2340 int i;
2341 int numKilled = so->numKilled;
2342 bool killedsomething = false;
2343 bool droppedpin PG_USED_FOR_ASSERTS_ONLY;
2344
2345 Assert(BTScanPosIsValid(so->currPos));
2346
2347 /*
2348 * Always reset the scan state, so we don't look for same items on other
2349 * pages.
2350 */
2351 so->numKilled = 0;
2352
2353 if (BTScanPosIsPinned(so->currPos))
2354 {
2355 /*
2356 * We have held the pin on this page since we read the index tuples,
2357 * so all we need to do is lock it. The pin will have prevented
2358 * re-use of any TID on the page, so there is no need to check the
2359 * LSN.
2360 */
2361 droppedpin = false;
2362 _bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
2363
2364 page = BufferGetPage(so->currPos.buf);
2365 }
2366 else
2367 {
2368 Buffer buf;
2369
2370 droppedpin = true;
2371 /* Attempt to re-read the buffer, getting pin and lock. */
2372 buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ);
2373
2374 page = BufferGetPage(buf);
2375 if (BufferGetLSNAtomic(buf) == so->currPos.lsn)
2376 so->currPos.buf = buf;
2377 else
2378 {
2379 /* Modified while not pinned means hinting is not safe. */
2380 _bt_relbuf(scan->indexRelation, buf);
2381 return;
2382 }
2383 }
2384
2385 opaque = BTPageGetOpaque(page);
2386 minoff = P_FIRSTDATAKEY(opaque);
2387 maxoff = PageGetMaxOffsetNumber(page);
2388
2389 for (i = 0; i < numKilled; i++)
2390 {
2391 int itemIndex = so->killedItems[i];
2392 BTScanPosItem *kitem = &so->currPos.items[itemIndex];
2393 OffsetNumber offnum = kitem->indexOffset;
2394
2395 Assert(itemIndex >= so->currPos.firstItem &&
2396 itemIndex <= so->currPos.lastItem);
2397 if (offnum < minoff)
2398 continue; /* pure paranoia */
2399 while (offnum <= maxoff)
2400 {
2401 ItemId iid = PageGetItemId(page, offnum);
2402 IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
2403 bool killtuple = false;
2404
2405 if (BTreeTupleIsPosting(ituple))
2406 {
2407 int pi = i + 1;
2408 int nposting = BTreeTupleGetNPosting(ituple);
2409 int j;
2410
2411 /*
2412 * We rely on the convention that heap TIDs in the scanpos
2413 * items array are stored in ascending heap TID order for a
2414 * group of TIDs that originally came from a posting list
2415 * tuple. This convention even applies during backwards
2416 * scans, where returning the TIDs in descending order might
2417 * seem more natural. This is about effectiveness, not
2418 * correctness.
2419 *
2420 * Note that the page may have been modified in almost any way
2421 * since we first read it (in the !droppedpin case), so it's
2422 * possible that this posting list tuple wasn't a posting list
2423 * tuple when we first encountered its heap TIDs.
2424 */
2425 for (j = 0; j < nposting; j++)
2426 {
2427 ItemPointer item = BTreeTupleGetPostingN(ituple, j);
2428
2429 if (!ItemPointerEquals(item, &kitem->heapTid))
2430 break; /* out of posting list loop */
2431
2432 /*
2433 * kitem must have matching offnum when heap TIDs match,
2434 * though only in the common case where the page can't
2435 * have been concurrently modified
2436 */
2437 Assert(kitem->indexOffset == offnum || !droppedpin);
2438
2439 /*
2440 * Read-ahead to later kitems here.
2441 *
2442 * We rely on the assumption that not advancing kitem here
2443 * will prevent us from considering the posting list tuple
2444 * fully dead by not matching its next heap TID in next
2445 * loop iteration.
2446 *
2447 * If, on the other hand, this is the final heap TID in
2448 * the posting list tuple, then tuple gets killed
2449 * regardless (i.e. we handle the case where the last
2450 * kitem is also the last heap TID in the last index tuple
2451 * correctly -- posting tuple still gets killed).
2452 */
2453 if (pi < numKilled)
2454 kitem = &so->currPos.items[so->killedItems[pi++]];
2455 }
2456
2457 /*
2458 * Don't bother advancing the outermost loop's int iterator to
2459 * avoid processing killed items that relate to the same
2460 * offnum/posting list tuple. This micro-optimization hardly
2461 * seems worth it. (Further iterations of the outermost loop
2462 * will fail to match on this same posting list's first heap
2463 * TID instead, so we'll advance to the next offnum/index
2464 * tuple pretty quickly.)
2465 */
2466 if (j == nposting)
2467 killtuple = true;
2468 }
2469 else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
2470 killtuple = true;
2471
2472 /*
2473 * Mark index item as dead, if it isn't already. Since this
2474 * happens while holding a buffer lock possibly in shared mode,
2475 * it's possible that multiple processes attempt to do this
2476 * simultaneously, leading to multiple full-page images being sent
2477 * to WAL (if wal_log_hints or data checksums are enabled), which
2478 * is undesirable.
2479 */
2480 if (killtuple && !ItemIdIsDead(iid))
2481 {
2482 /* found the item/all posting list items */
2483 ItemIdMarkDead(iid);
2484 killedsomething = true;
2485 break; /* out of inner search loop */
2486 }
2487 offnum = OffsetNumberNext(offnum);
2488 }
2489 }
2490
2491 /*
2492 * Since this can be redone later if needed, mark as dirty hint.
2493 *
2494 * Whenever we mark anything LP_DEAD, we also set the page's
2495 * BTP_HAS_GARBAGE flag, which is likewise just a hint. (Note that we
2496 * only rely on the page-level flag in !heapkeyspace indexes.)
2497 */
2498 if (killedsomething)
2499 {
2500 opaque->btpo_flags |= BTP_HAS_GARBAGE;
2501 MarkBufferDirtyHint(so->currPos.buf, true);
2502 }
2503
2504 _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
2505}
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:3985
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:4930
int j
Definition: isn.c:73
#define ItemIdMarkDead(itemId)
Definition: itemid.h:179
#define BTP_HAS_GARBAGE
Definition: nbtree.h:82
int * killedItems
Definition: nbtree.h:1053
BlockNumber currPage
Definition: nbtree.h:961
int firstItem
Definition: nbtree.h:989
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:993
XLogRecPtr lsn
Definition: nbtree.h:964
ItemPointerData heapTid
Definition: nbtree.h:951
OffsetNumber indexOffset
Definition: nbtree.h:952

References _bt_getbuf(), _bt_lockbuf(), _bt_relbuf(), _bt_unlockbuf(), Assert, BT_READ, BTP_HAS_GARBAGE, BTPageGetOpaque, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), BTScanPosIsPinned, BTScanPosIsValid, buf, BTScanPosData::buf, BufferGetLSNAtomic(), BufferGetPage(), BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanPosData::firstItem, BTScanPosItem::heapTid, i, BTScanPosItem::indexOffset, IndexScanDescData::indexRelation, ItemIdIsDead, ItemIdMarkDead, ItemPointerEquals(), BTScanPosData::items, j, BTScanOpaqueData::killedItems, BTScanPosData::lsn, MarkBufferDirtyHint(), BTScanOpaqueData::numKilled, OffsetNumberNext, IndexScanDescData::opaque, P_FIRSTDATAKEY, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), PG_USED_FOR_ASSERTS_ONLY, and IndexTupleData::t_tid.

Referenced by _bt_steppage(), btendscan(), btrescan(), and btrestrpos().
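
The killedItems[] entries consumed here are produced earlier in the scan. A sketch of that producer side (cf. btgettuple()): when the executor reports that the previously returned tuple was dead, the scan only records the item's index; _bt_killitems() then applies all the LP_DEAD hints in one pass over the page:

    if (scan->kill_prior_tuple)
    {
        /* Allocate the killedItems[] array lazily, on first use */
        if (so->killedItems == NULL)
            so->killedItems = (int *)
                palloc(MaxTIDsPerBTreePage * sizeof(int));
        if (so->numKilled < MaxTIDsPerBTreePage)
            so->killedItems[so->numKilled++] = so->currPos.itemIndex;
    }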

◆ _bt_lockbuf()

void _bt_lockbuf ( Relation  rel,
Buffer  buf,
int  access 
)

Definition at line 1039 of file nbtpage.c.

1040{
1041 /* LockBuffer() asserts that pin is held by this backend */
1042 LockBuffer(buf, access);
1043
1044 /*
1045 * It doesn't matter that _bt_unlockbuf() won't get called in the event of
1046 * an nbtree error (e.g. a unique violation error). That won't cause
1047 * Valgrind false positives.
1048 *
1049 * The nbtree client requests are superimposed on top of the bufmgr.c
1050 * buffer pin client requests. In the event of an nbtree error the buffer
1051 * will certainly get marked as defined when the backend once again
1052 * acquires its first pin on the buffer. (Of course, if the backend never
1053 * touches the buffer again then it doesn't matter that it remains
1054 * non-accessible to Valgrind.)
1055 *
1056 * Note: When an IndexTuple C pointer gets computed using an ItemId read
1057 * from a page while a lock was held, the C pointer becomes unsafe to
1058 * dereference forever as soon as the lock is released. Valgrind can only
1059 * detect cases where the pointer gets dereferenced with no _current_
1060 * lock/pin held, though.
1061 */
1062 if (!RelationUsesLocalBuffers(rel))
1063 VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1064}
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5100

References buf