PostgreSQL Source Code  git master
nbtree.h File Reference
#include "access/amapi.h"
#include "access/itup.h"
#include "access/sdir.h"
#include "access/xlogreader.h"
#include "catalog/pg_am_d.h"
#include "catalog/pg_index.h"
#include "lib/stringinfo.h"
#include "storage/bufmgr.h"
#include "storage/shm_toc.h"


Data Structures

struct  BTPageOpaqueData
 
struct  BTMetaPageData
 
struct  BTStackData
 
struct  BTScanInsertData
 
struct  BTInsertStateData
 
struct  BTDedupInterval
 
struct  BTDedupStateData
 
struct  BTVacuumPostingData
 
struct  BTScanPosItem
 
struct  BTScanPosData
 
struct  BTArrayKeyInfo
 
struct  BTScanOpaqueData
 
struct  BTOptions
 

Macros

#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */
 
#define BTP_ROOT   (1 << 1) /* root page (has no parent) */
 
#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */
 
#define BTP_META   (1 << 3) /* meta-page */
 
#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */
 
#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */
 
#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples */
 
#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */
 
#define MAX_BT_CYCLE_ID   0xFF7F
 
#define BTPageGetMeta(p)   ((BTMetaPageData *) PageGetContents(p))
 
#define BTREE_METAPAGE   0 /* first page is meta */
 
#define BTREE_MAGIC   0x053162 /* magic number in metapage */
 
#define BTREE_VERSION   4 /* current version number */
 
#define BTREE_MIN_VERSION   2 /* minimum supported version */
 
#define BTREE_NOVAC_VERSION   3 /* version with all meta fields set */
 
#define BTMaxItemSize(page)
 
#define BTMaxItemSizeNoHeapTid(page)
 
#define MaxTIDsPerBTreePage
 
#define BTREE_MIN_FILLFACTOR   10
 
#define BTREE_DEFAULT_FILLFACTOR   90
 
#define BTREE_NONLEAF_FILLFACTOR   70
 
#define BTREE_SINGLEVAL_FILLFACTOR   96
 
#define P_NONE   0
 
#define P_LEFTMOST(opaque)   ((opaque)->btpo_prev == P_NONE)
 
#define P_RIGHTMOST(opaque)   ((opaque)->btpo_next == P_NONE)
 
#define P_ISLEAF(opaque)   (((opaque)->btpo_flags & BTP_LEAF) != 0)
 
#define P_ISROOT(opaque)   (((opaque)->btpo_flags & BTP_ROOT) != 0)
 
#define P_ISDELETED(opaque)   (((opaque)->btpo_flags & BTP_DELETED) != 0)
 
#define P_ISMETA(opaque)   (((opaque)->btpo_flags & BTP_META) != 0)
 
#define P_ISHALFDEAD(opaque)   (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)
 
#define P_IGNORE(opaque)   (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
 
#define P_HAS_GARBAGE(opaque)   (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
 
#define P_INCOMPLETE_SPLIT(opaque)   (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
 
#define P_HIKEY   ((OffsetNumber) 1)
 
#define P_FIRSTKEY   ((OffsetNumber) 2)
 
#define P_FIRSTDATAKEY(opaque)   (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
 
#define INDEX_ALT_TID_MASK   INDEX_AM_RESERVED_BIT
 
#define BT_RESERVED_OFFSET_MASK   0xF000
 
#define BT_OFFSET_MASK   0x0FFF
 
#define BT_PIVOT_HEAP_TID_ATTR   0x1000
 
#define BT_IS_POSTING   0x2000
 
#define BTreeTupleGetNAtts(itup, rel)
 
#define BTCommuteStrategyNumber(strat)   (BTMaxStrategyNumber + 1 - (strat))
 
#define BTORDER_PROC   1
 
#define BTSORTSUPPORT_PROC   2
 
#define BTINRANGE_PROC   3
 
#define BTEQUALIMAGE_PROC   4
 
#define BTOPTIONS_PROC   5
 
#define BTNProcs   5
 
#define BT_READ   BUFFER_LOCK_SHARE
 
#define BT_WRITE   BUFFER_LOCK_EXCLUSIVE
 
#define BTScanPosIsPinned(scanpos)
 
#define BTScanPosUnpin(scanpos)
 
#define BTScanPosUnpinIfPinned(scanpos)
 
#define BTScanPosIsValid(scanpos)
 
#define BTScanPosInvalidate(scanpos)
 
#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */
 
#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */
 
#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */
 
#define SK_BT_DESC   (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
 
#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
 
#define BTGetFillFactor(relation)
 
#define BTGetTargetPageFreeSpace(relation)   (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)
 
#define BTGetDeduplicateItems(relation)
 
#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4
 
#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5
 

Typedefs

typedef uint16 BTCycleId
 
typedef struct BTPageOpaqueData BTPageOpaqueData
 
typedef BTPageOpaqueData * BTPageOpaque
 
typedef struct BTMetaPageData BTMetaPageData
 
typedef struct BTStackData BTStackData
 
typedef BTStackData * BTStack
 
typedef struct BTScanInsertData BTScanInsertData
 
typedef BTScanInsertData * BTScanInsert
 
typedef struct BTInsertStateData BTInsertStateData
 
typedef BTInsertStateData * BTInsertState
 
typedef struct BTDedupInterval BTDedupInterval
 
typedef struct BTDedupStateData BTDedupStateData
 
typedef BTDedupStateData * BTDedupState
 
typedef struct BTVacuumPostingData BTVacuumPostingData
 
typedef BTVacuumPostingData * BTVacuumPosting
 
typedef struct BTScanPosItem BTScanPosItem
 
typedef struct BTScanPosData BTScanPosData
 
typedef BTScanPosData * BTScanPos
 
typedef struct BTArrayKeyInfo BTArrayKeyInfo
 
typedef struct BTScanOpaqueData BTScanOpaqueData
 
typedef BTScanOpaqueData * BTScanOpaque
 
typedef struct BTOptions BTOptions
 

Functions

static bool BTreeTupleIsPivot (IndexTuple itup)
 
static bool BTreeTupleIsPosting (IndexTuple itup)
 
static void BTreeTupleSetPosting (IndexTuple itup, int nhtids, int postingoffset)
 
static uint16 BTreeTupleGetNPosting (IndexTuple posting)
 
static uint32 BTreeTupleGetPostingOffset (IndexTuple posting)
 
static ItemPointer BTreeTupleGetPosting (IndexTuple posting)
 
static ItemPointer BTreeTupleGetPostingN (IndexTuple posting, int n)
 
static BlockNumber BTreeTupleGetDownLink (IndexTuple pivot)
 
static void BTreeTupleSetDownLink (IndexTuple pivot, BlockNumber blkno)
 
static void BTreeTupleSetNAtts (IndexTuple itup, uint16 nkeyatts, bool heaptid)
 
static BlockNumber BTreeTupleGetTopParent (IndexTuple leafhikey)
 
static void BTreeTupleSetTopParent (IndexTuple leafhikey, BlockNumber blkno)
 
static ItemPointer BTreeTupleGetHeapTID (IndexTuple itup)
 
static ItemPointer BTreeTupleGetMaxHeapTID (IndexTuple itup)
 
void btbuildempty (Relation index)
 
bool btinsert (Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, struct IndexInfo *indexInfo)
 
IndexScanDesc btbeginscan (Relation rel, int nkeys, int norderbys)
 
Size btestimateparallelscan (void)
 
void btinitparallelscan (void *target)
 
bool btgettuple (IndexScanDesc scan, ScanDirection dir)
 
int64 btgetbitmap (IndexScanDesc scan, TIDBitmap *tbm)
 
void btrescan (IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
 
void btparallelrescan (IndexScanDesc scan)
 
void btendscan (IndexScanDesc scan)
 
void btmarkpos (IndexScanDesc scan)
 
void btrestrpos (IndexScanDesc scan)
 
IndexBulkDeleteResult * btbulkdelete (IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
 
IndexBulkDeleteResult * btvacuumcleanup (IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
bool btcanreturn (Relation index, int attno)
 
bool _bt_parallel_seize (IndexScanDesc scan, BlockNumber *pageno)
 
void _bt_parallel_release (IndexScanDesc scan, BlockNumber scan_page)
 
void _bt_parallel_done (IndexScanDesc scan)
 
void _bt_parallel_advance_array_keys (IndexScanDesc scan)
 
void _bt_dedup_one_page (Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem, Size newitemsz, bool checkingunique)
 
void _bt_dedup_start_pending (BTDedupState state, IndexTuple base, OffsetNumber baseoff)
 
bool _bt_dedup_save_htid (BTDedupState state, IndexTuple itup)
 
Size _bt_dedup_finish_pending (Page newpage, BTDedupState state)
 
IndexTuple _bt_form_posting (IndexTuple base, ItemPointer htids, int nhtids)
 
void _bt_update_posting (BTVacuumPosting vacposting)
 
IndexTuple _bt_swap_posting (IndexTuple newitem, IndexTuple oposting, int postingoff)
 
bool _bt_doinsert (Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, Relation heapRel)
 
void _bt_finish_split (Relation rel, Buffer lbuf, BTStack stack)
 
Buffer _bt_getstackbuf (Relation rel, BTStack stack, BlockNumber child)
 
OffsetNumber _bt_findsplitloc (Relation rel, Page page, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
 
void _bt_initmetapage (Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
 
void _bt_update_meta_cleanup_info (Relation rel, TransactionId oldestBtpoXact, float8 numHeapTuples)
 
void _bt_upgrademetapage (Page page)
 
Buffer _bt_getroot (Relation rel, int access)
 
Buffer _bt_gettrueroot (Relation rel)
 
int _bt_getrootheight (Relation rel)
 
void _bt_metaversion (Relation rel, bool *heapkeyspace, bool *allequalimage)
 
void _bt_checkpage (Relation rel, Buffer buf)
 
Buffer _bt_getbuf (Relation rel, BlockNumber blkno, int access)
 
Buffer _bt_relandgetbuf (Relation rel, Buffer obuf, BlockNumber blkno, int access)
 
void _bt_relbuf (Relation rel, Buffer buf)
 
void _bt_pageinit (Page page, Size size)
 
bool _bt_page_recyclable (Page page)
 
void _bt_delitems_vacuum (Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
 
void _bt_delitems_delete (Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, Relation heapRel)
 
int _bt_pagedel (Relation rel, Buffer buf)
 
BTStack _bt_search (Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
 
Buffer _bt_moveright (Relation rel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access, Snapshot snapshot)
 
OffsetNumber _bt_binsrch_insert (Relation rel, BTInsertState insertstate)
 
int32 _bt_compare (Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost, Snapshot snapshot)
 
BTScanInsert _bt_mkscankey (Relation rel, IndexTuple itup)
 
void _bt_freestack (BTStack stack)
 
void _bt_preprocess_array_keys (IndexScanDesc scan)
 
void _bt_start_array_keys (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_advance_array_keys (IndexScanDesc scan, ScanDirection dir)
 
void _bt_mark_array_keys (IndexScanDesc scan)
 
void _bt_restore_array_keys (IndexScanDesc scan)
 
void _bt_preprocess_keys (IndexScanDesc scan)
 
bool _bt_checkkeys (IndexScanDesc scan, IndexTuple tuple, int tupnatts, ScanDirection dir, bool *continuescan)
 
void _bt_killitems (IndexScanDesc scan)
 
BTCycleId _bt_vacuum_cycleid (Relation rel)
 
BTCycleId _bt_start_vacuum (Relation rel)
 
void _bt_end_vacuum (Relation rel)
 
void _bt_end_vacuum_callback (int code, Datum arg)
 
Size BTreeShmemSize (void)
 
void BTreeShmemInit (void)
 
bytea * btoptions (Datum reloptions, bool validate)
 
bool btproperty (Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull)
 
char * btbuildphasename (int64 phasenum)
 
IndexTuple _bt_truncate (Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
 
int _bt_keep_natts_fast (Relation rel, IndexTuple lastleft, IndexTuple firstright)
 
bool _bt_check_natts (Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
 
void _bt_check_third_page (Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
 
bool _bt_allequalimage (Relation rel, bool debugmessage)
 
bool btvalidate (Oid opclassoid)
 
IndexBuildResult * btbuild (Relation heap, Relation index, struct IndexInfo *indexInfo)
 
void _bt_parallel_build_main (dsm_segment *seg, shm_toc *toc)
 

Macro Definition Documentation

◆ BT_IS_POSTING

#define BT_IS_POSTING   0x2000

Definition at line 350 of file nbtree.h.

Referenced by BTreeTupleIsPivot(), BTreeTupleIsPosting(), and BTreeTupleSetPosting().
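
As an illustration only (not PostgreSQL code), the following standalone program shows how these masks pack status bits plus a 12-bit payload into the 16-bit offset field of an index tuple's t_tid. It is a simplified sketch: real tuples additionally set INDEX_ALT_TID_MASK in t_info before any of these bits are meaningful.

/*
 * Sketch of the nbtree t_tid offset-field bit layout.  The mask values
 * mirror nbtree.h; the encode/decode logic is simplified.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BT_RESERVED_OFFSET_MASK 0xF000
#define BT_OFFSET_MASK          0x0FFF
#define BT_PIVOT_HEAP_TID_ATTR  0x1000
#define BT_IS_POSTING           0x2000

int
main(void)
{
    /* posting list tuple with 3 heap TIDs */
    uint16_t offset = (uint16_t) (3 | BT_IS_POSTING);

    assert((offset & BT_RESERVED_OFFSET_MASK) == BT_IS_POSTING);
    printf("posting: %d, nposting: %u\n",
           (offset & BT_IS_POSTING) != 0, offset & BT_OFFSET_MASK);

    /* pivot tuple with 2 untruncated key attributes plus a heap TID */
    offset = (uint16_t) (2 | BT_PIVOT_HEAP_TID_ATTR);
    printf("natts: %u, heap TID: %d\n",
           offset & BT_OFFSET_MASK, (offset & BT_PIVOT_HEAP_TID_ATTR) != 0);
    return 0;
}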

◆ BT_OFFSET_MASK

#define BT_OFFSET_MASK   0x0FFF

Definition at line 348 of file nbtree.h.

Referenced by _bt_check_natts(), BTreeTupleGetNPosting(), and BTreeTupleSetPosting().

◆ BT_PIVOT_HEAP_TID_ATTR

#define BT_PIVOT_HEAP_TID_ATTR   0x1000

Definition at line 349 of file nbtree.h.

Referenced by _bt_check_natts(), BTreeTupleGetHeapTID(), and BTreeTupleSetNAtts().

◆ BT_READ

◆ BT_RESERVED_OFFSET_MASK

#define BT_RESERVED_OFFSET_MASK   0xF000

Definition at line 347 of file nbtree.h.

Referenced by BTreeTupleSetNAtts().

◆ BT_WRITE

◆ BTCommuteStrategyNumber

#define BTCommuteStrategyNumber (   strat)    (BTMaxStrategyNumber + 1 - (strat))

Definition at line 560 of file nbtree.h.

Referenced by _bt_compare_scankey_args(), and _bt_fix_scankey_strategy().

◆ BTEQUALIMAGE_PROC

#define BTEQUALIMAGE_PROC   4

Definition at line 585 of file nbtree.h.

Referenced by _bt_allequalimage(), assignProcTypes(), and btvalidate().

◆ BTGetDeduplicateItems

#define BTGetDeduplicateItems (   relation)
Value:
(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
relation->rd_rel->relam == BTREE_AM_OID), \
((relation)->rd_options ? \
((BTOptions *) (relation)->rd_options)->deduplicate_items : true))

Definition at line 982 of file nbtree.h.

Referenced by _bt_findinsertloc(), and _bt_load().

◆ BTGetFillFactor

#define BTGetFillFactor (   relation)
Value:
(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
relation->rd_rel->relam == BTREE_AM_OID), \
(relation)->rd_options ? \
((BTOptions *) (relation)->rd_options)->fillfactor : \
BTREE_DEFAULT_FILLFACTOR)

Definition at line 974 of file nbtree.h.

Referenced by _bt_findsplitloc().

◆ BTGetTargetPageFreeSpace

#define BTGetTargetPageFreeSpace (   relation)    (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)

Definition at line 980 of file nbtree.h.

Referenced by _bt_pagestate().
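
For the default leaf fillfactor of 90 and the standard 8 kB block size, this works out to 819 bytes of reserved free space per leaf page, as the following sketch (assuming BLCKSZ = 8192) confirms:

#include <stdio.h>

#define BLCKSZ 8192

int
main(void)
{
    int fillfactor = 90;        /* BTREE_DEFAULT_FILLFACTOR */

    /* same arithmetic as BTGetTargetPageFreeSpace() */
    printf("%d\n", BLCKSZ * (100 - fillfactor) / 100);  /* prints 819 */
    return 0;
}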

◆ BTINRANGE_PROC

#define BTINRANGE_PROC   3

Definition at line 584 of file nbtree.h.

Referenced by assignProcTypes(), btvalidate(), and transformFrameOffset().

◆ BTMaxItemSize

#define BTMaxItemSize (   page)
Value:
MAXALIGN_DOWN((PageGetPageSize(page) - \
MAXALIGN(SizeOfPageHeaderData + \
3*sizeof(ItemIdData) + \
3*sizeof(ItemPointerData)) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3)

Definition at line 157 of file nbtree.h.

Referenced by _bt_buildadd(), _bt_check_third_page(), _bt_dedup_one_page(), _bt_findinsertloc(), _bt_load(), bt_target_page_check(), and btree_xlog_dedup().
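
Evaluated by hand for the common 8 kB page size, the formula yields the familiar "1/3 of a page" limit. The standalone sketch below assumes a 24-byte page header, 4-byte ItemIdData, 6-byte ItemPointerData, a 16-byte BTPageOpaqueData, and 8-byte MAXALIGN; under those assumptions it reproduces the 2704-byte BTMaxItemSize limit and the 2712-byte BTMaxItemSizeNoHeapTid limit that applies to version 2 and 3 indexes.

#include <stddef.h>
#include <stdio.h>

#define BLCKSZ 8192
#define SIZEOF_PAGEHEADER 24        /* SizeOfPageHeaderData */
#define SIZEOF_ITEMID 4             /* sizeof(ItemIdData) */
#define SIZEOF_ITEMPOINTER 6        /* sizeof(ItemPointerData) */
#define SIZEOF_BTPAGEOPAQUE 16      /* sizeof(BTPageOpaqueData) */
#define MAXALIGN(n)      (((n) + 7) & ~(size_t) 7)
#define MAXALIGN_DOWN(n) ((n) & ~(size_t) 7)

int
main(void)
{
    size_t withtid =
        MAXALIGN_DOWN((BLCKSZ -
                       MAXALIGN(SIZEOF_PAGEHEADER +
                                3 * SIZEOF_ITEMID +
                                3 * SIZEOF_ITEMPOINTER) -
                       MAXALIGN(SIZEOF_BTPAGEOPAQUE)) / 3);
    size_t notid =
        MAXALIGN_DOWN((BLCKSZ -
                       MAXALIGN(SIZEOF_PAGEHEADER + 3 * SIZEOF_ITEMID) -
                       MAXALIGN(SIZEOF_BTPAGEOPAQUE)) / 3);

    printf("BTMaxItemSize:          %zu\n", withtid);   /* 2704 */
    printf("BTMaxItemSizeNoHeapTid: %zu\n", notid);     /* 2712 */
    return 0;
}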

◆ BTMaxItemSizeNoHeapTid

#define BTMaxItemSizeNoHeapTid (   page)
Value:
MAXALIGN_DOWN((PageGetPageSize(page) - \
MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3)

Definition at line 163 of file nbtree.h.

Referenced by _bt_check_third_page(), and bt_target_page_check().

◆ BTNProcs

#define BTNProcs   5

Definition at line 587 of file nbtree.h.

Referenced by bthandler().

◆ BTOPTIONS_PROC

#define BTOPTIONS_PROC   5

Definition at line 586 of file nbtree.h.

Referenced by bthandler(), and btvalidate().

◆ BTORDER_PROC

◆ BTP_DELETED

#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */

Definition at line 74 of file nbtree.h.

Referenced by _bt_unlink_halfdead_page(), and btree_xlog_unlink_page().

◆ BTP_HALF_DEAD

#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */

◆ BTP_HAS_GARBAGE

#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples */

◆ BTP_INCOMPLETE_SPLIT

#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */

◆ BTP_LEAF

#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */

◆ BTP_META

#define BTP_META   (1 << 3) /* meta-page */

Definition at line 75 of file nbtree.h.

Referenced by _bt_initmetapage(), _bt_restore_meta(), and _bt_upgrademetapage().

◆ BTP_ROOT

#define BTP_ROOT   (1 << 1) /* root page (has no parent) */

Definition at line 73 of file nbtree.h.

Referenced by _bt_getroot(), _bt_newroot(), _bt_split(), _bt_uppershutdown(), and btree_xlog_newroot().

◆ BTP_SPLIT_END

#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */

Definition at line 77 of file nbtree.h.

Referenced by _bt_split(), btree_mask(), and btvacuumpage().

◆ BTPageGetMeta

◆ BTREE_DEFAULT_FILLFACTOR

#define BTREE_DEFAULT_FILLFACTOR   90

Definition at line 194 of file nbtree.h.

◆ BTREE_MAGIC

#define BTREE_MAGIC   0x053162 /* magic number in metapage */

◆ BTREE_METAPAGE

◆ BTREE_MIN_FILLFACTOR

#define BTREE_MIN_FILLFACTOR   10

Definition at line 193 of file nbtree.h.

◆ BTREE_MIN_VERSION

#define BTREE_MIN_VERSION   2 /* minimum supported version */

◆ BTREE_NONLEAF_FILLFACTOR

#define BTREE_NONLEAF_FILLFACTOR   70

Definition at line 195 of file nbtree.h.

Referenced by _bt_findsplitloc(), and _bt_pagestate().

◆ BTREE_NOVAC_VERSION

◆ BTREE_SINGLEVAL_FILLFACTOR

#define BTREE_SINGLEVAL_FILLFACTOR   96

Definition at line 196 of file nbtree.h.

Referenced by _bt_findsplitloc(), and _bt_singleval_fillfactor().

◆ BTREE_VERSION

#define BTREE_VERSION   4 /* current version number */

◆ BTreeTupleGetNAtts

#define BTreeTupleGetNAtts (   itup,
  rel 
)
Value:
( \
(BTreeTupleIsPivot(itup)) ? \
( \
ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_OFFSET_MASK \
) \
: \
IndexRelationGetNumberOfAttributes(rel) \
)

Definition at line 452 of file nbtree.h.

Referenced by _bt_buildadd(), _bt_check_natts(), _bt_checkkeys(), _bt_compare(), _bt_insertonpg(), _bt_mkscankey(), _bt_newroot(), _bt_readpage(), _bt_split(), _bt_uppershutdown(), and bt_target_page_check().

◆ BTScanPosInvalidate

#define BTScanPosInvalidate (   scanpos)
Value:
do { \
(scanpos).currPage = InvalidBlockNumber; \
(scanpos).nextPage = InvalidBlockNumber; \
(scanpos).buf = InvalidBuffer; \
(scanpos).lsn = InvalidXLogRecPtr; \
(scanpos).nextTupleOffset = 0; \
} while (0);

Definition at line 891 of file nbtree.h.

Referenced by _bt_endpoint(), _bt_first(), _bt_readnextpage(), _bt_steppage(), btbeginscan(), btmarkpos(), btrescan(), and btrestrpos().

◆ BTScanPosIsPinned

#define BTScanPosIsPinned (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BufferIsValid((scanpos).buf) \
)

Definition at line 868 of file nbtree.h.

Referenced by _bt_killitems(), _bt_readnextpage(), _bt_readpage(), _bt_steppage(), and btrestrpos().

◆ BTScanPosIsValid

#define BTScanPosIsValid (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BlockNumberIsValid((scanpos).currPage) \
)

Definition at line 885 of file nbtree.h.

Referenced by _bt_first(), _bt_killitems(), _bt_steppage(), btendscan(), btgettuple(), btmarkpos(), btrescan(), and btrestrpos().
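
The two predicates draw a deliberate distinction: a scan position stays "valid" (currPage set) even after its buffer pin has been dropped, while "pinned" additionally requires a live pin; both assert that holding a pin implies a valid position. Below is a standalone sketch of that invariant, with plain ints standing in for BlockNumber and Buffer (names here are illustrative, not bufmgr.h definitions):

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define INVALID_BLOCK 0xFFFFFFFFu   /* stand-in for InvalidBlockNumber */
#define INVALID_BUFFER 0            /* stand-in for InvalidBuffer */

typedef struct ScanPos
{
    unsigned currPage;              /* INVALID_BLOCK if position unset */
    int      buf;                   /* INVALID_BUFFER if no pin held */
} ScanPos;

static bool
pos_is_valid(const ScanPos *pos)
{
    /* a pin without a valid position would be a bug */
    assert(pos->currPage != INVALID_BLOCK || pos->buf == INVALID_BUFFER);
    return pos->currPage != INVALID_BLOCK;
}

static bool
pos_is_pinned(const ScanPos *pos)
{
    assert(pos->currPage != INVALID_BLOCK || pos->buf == INVALID_BUFFER);
    return pos->buf != INVALID_BUFFER;
}

int
main(void)
{
    ScanPos pos = {42, INVALID_BUFFER};     /* valid but unpinned */

    printf("valid=%d pinned=%d\n", pos_is_valid(&pos), pos_is_pinned(&pos));
    return 0;
}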

◆ BTScanPosUnpin

#define BTScanPosUnpin (   scanpos)
Value:
do { \
ReleaseBuffer((scanpos).buf); \
(scanpos).buf = InvalidBuffer; \
} while (0)

Definition at line 874 of file nbtree.h.

◆ BTScanPosUnpinIfPinned

#define BTScanPosUnpinIfPinned (   scanpos)
Value:
do { \
if (BTScanPosIsPinned(scanpos)) \
BTScanPosUnpin(scanpos); \
} while (0)

Definition at line 879 of file nbtree.h.

Referenced by _bt_readnextpage(), _bt_steppage(), btendscan(), btmarkpos(), btrescan(), and btrestrpos().

◆ BTSORTSUPPORT_PROC

#define BTSORTSUPPORT_PROC   2

Definition at line 583 of file nbtree.h.

Referenced by assignProcTypes(), btvalidate(), FinishSortSupportFunction(), and MJExamineQuals().

◆ INDEX_ALT_TID_MASK

◆ MAX_BT_CYCLE_ID

#define MAX_BT_CYCLE_ID   0xFF7F

Definition at line 88 of file nbtree.h.

Referenced by _bt_start_vacuum().

◆ MaxTIDsPerBTreePage

#define MaxTIDsPerBTreePage
Value:
(int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \
sizeof(ItemPointerData))

Definition at line 179 of file nbtree.h.

Referenced by _bt_readpage(), bt_check_every_level(), and btgettuple().
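
With 8 kB pages this comes to 1358 TIDs per page, assuming the usual 24-byte page header, 16-byte BTPageOpaqueData, and 6-byte ItemPointerData:

#include <stdio.h>

int
main(void)
{
    /* (BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData))
     * / sizeof(ItemPointerData) */
    printf("%d\n", (8192 - 24 - 16) / 6);   /* prints 1358 */
    return 0;
}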

◆ P_FIRSTDATAKEY

◆ P_FIRSTKEY

#define P_FIRSTKEY   ((OffsetNumber) 2)

◆ P_HAS_GARBAGE

#define P_HAS_GARBAGE (   opaque)    (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)

◆ P_HIKEY

◆ P_IGNORE

◆ P_INCOMPLETE_SPLIT

◆ P_ISDELETED

◆ P_ISHALFDEAD

#define P_ISHALFDEAD (   opaque)    (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)

◆ P_ISLEAF

◆ P_ISMETA

#define P_ISMETA (   opaque)    (((opaque)->btpo_flags & BTP_META) != 0)

Definition at line 217 of file nbtree.h.

Referenced by _bt_getmeta(), _bt_gettrueroot(), bt_page_items_bytea(), and palloc_btree_page().

◆ P_ISROOT

#define P_ISROOT (   opaque)    (((opaque)->btpo_flags & BTP_ROOT) != 0)

◆ P_LEFTMOST

#define P_LEFTMOST (   opaque)    ((opaque)->btpo_prev == P_NONE)

◆ P_NONE

◆ P_RIGHTMOST

◆ PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN

#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2

Definition at line 993 of file nbtree.h.

Referenced by _bt_spools_heapscan(), and btbuildphasename().

◆ PROGRESS_BTREE_PHASE_LEAF_LOAD

#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5

Definition at line 996 of file nbtree.h.

Referenced by _bt_leafbuild(), and btbuildphasename().

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_1

#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3

Definition at line 994 of file nbtree.h.

Referenced by _bt_leafbuild(), and btbuildphasename().

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_2

#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4

Definition at line 995 of file nbtree.h.

Referenced by _bt_leafbuild(), and btbuildphasename().

◆ SK_BT_DESC

◆ SK_BT_INDOPTION_SHIFT

#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */

Definition at line 961 of file nbtree.h.

Referenced by _bt_fix_scankey_strategy(), and _bt_mkscankey().

◆ SK_BT_NULLS_FIRST

◆ SK_BT_REQBKWD

#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */

Definition at line 960 of file nbtree.h.

Referenced by _bt_check_rowcompare(), _bt_checkkeys(), and _bt_mark_scankey_required().

◆ SK_BT_REQFWD

#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */

Definition at line 959 of file nbtree.h.

Referenced by _bt_check_rowcompare(), _bt_checkkeys(), and _bt_mark_scankey_required().

Typedef Documentation

◆ BTArrayKeyInfo

◆ BTCycleId

typedef uint16 BTCycleId

Definition at line 28 of file nbtree.h.

◆ BTDedupInterval

◆ BTDedupState

typedef BTDedupStateData * BTDedupState

Definition at line 772 of file nbtree.h.

◆ BTDedupStateData

◆ BTInsertState

typedef BTInsertStateData * BTInsertState

Definition at line 715 of file nbtree.h.

◆ BTInsertStateData

◆ BTMetaPageData

◆ BTOptions

typedef struct BTOptions BTOptions

◆ BTPageOpaque

typedef BTPageOpaqueData * BTPageOpaque

Definition at line 69 of file nbtree.h.

◆ BTPageOpaqueData

◆ BTScanInsert

typedef BTScanInsertData * BTScanInsert

Definition at line 676 of file nbtree.h.

◆ BTScanInsertData

◆ BTScanOpaque

typedef BTScanOpaqueData * BTScanOpaque

Definition at line 952 of file nbtree.h.

◆ BTScanOpaqueData

◆ BTScanPos

typedef BTScanPosData * BTScanPos

Definition at line 866 of file nbtree.h.

◆ BTScanPosData

typedef struct BTScanPosData BTScanPosData

◆ BTScanPosItem

typedef struct BTScanPosItem BTScanPosItem

◆ BTStack

typedef BTStackData* BTStack

Definition at line 614 of file nbtree.h.

◆ BTStackData

typedef struct BTStackData BTStackData

◆ BTVacuumPosting

typedef BTVacuumPostingData * BTVacuumPosting

Definition at line 792 of file nbtree.h.

◆ BTVacuumPostingData

Function Documentation

◆ _bt_advance_array_keys()

bool _bt_advance_array_keys ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 544 of file nbtutils.c.

References _bt_parallel_advance_array_keys(), BTScanOpaqueData::arrayKeyData, BTScanOpaqueData::arrayKeys, BTArrayKeyInfo::cur_elem, BTArrayKeyInfo::elem_values, i, BTArrayKeyInfo::num_elems, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, BTArrayKeyInfo::scan_key, ScanDirectionIsBackward, and ScanKeyData::sk_argument.

Referenced by btgetbitmap(), and btgettuple().

545 {
546  BTScanOpaque so = (BTScanOpaque) scan->opaque;
547  bool found = false;
548  int i;
549 
550  /*
551  * We must advance the last array key most quickly, since it will
552  * correspond to the lowest-order index column among the available
553  * qualifications. This is necessary to ensure correct ordering of output
554  * when there are multiple array keys.
555  */
556  for (i = so->numArrayKeys - 1; i >= 0; i--)
557  {
558  BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
559  ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key];
560  int cur_elem = curArrayKey->cur_elem;
561  int num_elems = curArrayKey->num_elems;
562 
563  if (ScanDirectionIsBackward(dir))
564  {
565  if (--cur_elem < 0)
566  {
567  cur_elem = num_elems - 1;
568  found = false; /* need to advance next array key */
569  }
570  else
571  found = true;
572  }
573  else
574  {
575  if (++cur_elem >= num_elems)
576  {
577  cur_elem = 0;
578  found = false; /* need to advance next array key */
579  }
580  else
581  found = true;
582  }
583 
584  curArrayKey->cur_elem = cur_elem;
585  skey->sk_argument = curArrayKey->elem_values[cur_elem];
586  if (found)
587  break;
588  }
589 
590  /* advance parallel scan */
591  if (scan->parallel_scan != NULL)
592  _bt_parallel_advance_array_keys(scan);
593 
594  return found;
595 }
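
The loop above advances the array keys like an odometer: the last array key (lowest-order index column) advances fastest, rolling over and carrying into the next one until every combination has been produced. Below is a standalone sketch of the same forward-direction logic, with a plain struct standing in for BTArrayKeyInfo (names are illustrative):

#include <stdbool.h>
#include <stdio.h>

typedef struct ArrayKey
{
    int cur_elem;
    int num_elems;
} ArrayKey;

static bool
advance_array_keys(ArrayKey *keys, int nkeys)
{
    bool found = false;

    for (int i = nkeys - 1; i >= 0; i--)
    {
        if (++keys[i].cur_elem >= keys[i].num_elems)
            keys[i].cur_elem = 0;   /* roll over; carry into next key */
        else
        {
            found = true;           /* stop here; no carry needed */
            break;
        }
    }
    return found;                   /* false => all combinations done */
}

int
main(void)
{
    /* e.g. WHERE a IN (x,y) AND b IN (p,q,r): 2 * 3 = 6 combinations */
    ArrayKey keys[] = {{0, 2}, {0, 3}};

    do
    {
        printf("a-elem=%d b-elem=%d\n", keys[0].cur_elem, keys[1].cur_elem);
    } while (advance_array_keys(keys, 2));
    return 0;
}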

◆ _bt_allequalimage()

bool _bt_allequalimage ( Relation  rel,
bool  debugmessage 
)

Definition at line 2692 of file nbtutils.c.

References BTEQUALIMAGE_PROC, BTSortArrayContext::collation, DatumGetBool, DEBUG1, elog, get_opfamily_proc(), i, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, IsSystemRelation(), ObjectIdGetDatum, OidFunctionCall1Coll(), OidIsValid, RelationData::rd_indcollation, RelationData::rd_opcintype, RelationData::rd_opfamily, and RelationGetRelationName.

Referenced by _bt_leafbuild(), and btbuildempty().

2693 {
2694  bool allequalimage = true;
2695 
2696  /* INCLUDE indexes don't support deduplication */
2697  if (IndexRelationGetNumberOfAttributes(rel) !=
2698  IndexRelationGetNumberOfKeyAttributes(rel))
2699  return false;
2700 
2701  /*
2702  * There is no special reason why deduplication cannot work with system
2703  * relations (i.e. with system catalog indexes and TOAST indexes). We
2704  * deem deduplication unsafe for these indexes all the same, since the
2705  * alternative is to force users to always use deduplication, without
2706  * being able to opt out. (ALTER INDEX is not supported with system
2707  * indexes, so users would have no way to set the deduplicate_items
2708  * storage parameter to 'off'.)
2709  */
2710  if (IsSystemRelation(rel))
2711  return false;
2712 
2713  for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++)
2714  {
2715  Oid opfamily = rel->rd_opfamily[i];
2716  Oid opcintype = rel->rd_opcintype[i];
2717  Oid collation = rel->rd_indcollation[i];
2718  Oid equalimageproc;
2719 
2720  equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype,
2721  BTEQUALIMAGE_PROC);
2722 
2723  /*
2724  * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to
2725  * be unsafe. Otherwise, actually call proc and see what it says.
2726  */
2727  if (!OidIsValid(equalimageproc) ||
2728  !DatumGetBool(OidFunctionCall1Coll(equalimageproc, collation,
2729  ObjectIdGetDatum(opcintype))))
2730  {
2731  allequalimage = false;
2732  break;
2733  }
2734  }
2735 
2736  /*
2737  * Don't elog() until here to avoid reporting on a system relation index
2738  * or an INCLUDE index
2739  */
2740  if (debugmessage)
2741  {
2742  if (allequalimage)
2743  elog(DEBUG1, "index \"%s\" can safely use deduplication",
2744  RelationGetRelationName(rel));
2745  else
2746  elog(DEBUG1, "index \"%s\" cannot use deduplication",
2747  RelationGetRelationName(rel));
2748  }
2749 
2750  return allequalimage;
2751 }
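
The shape of the check is: deduplication is safe only if every key column's operator family supplies a BTEQUALIMAGE_PROC and that proc reports the column's type as image-equal. Here is a sketch of that structure with hypothetical stand-ins for the catalog lookups (the function and array names below are illustrative, not PostgreSQL APIs):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

typedef bool (*equalimage_fn)(void);

/* text: equal values share one representation => image-equal */
static bool text_equalimage(void)    { return true; }
/* numeric: 1.0 and 1.00 are equal but displayed differently => unsafe */
static bool numeric_equalimage(void) { return false; }

static bool
all_equalimage(equalimage_fn *procs, int nkeyatts)
{
    for (int i = 0; i < nkeyatts; i++)
    {
        /* no proc registered, or proc says unsafe: no deduplication */
        if (procs[i] == NULL || !procs[i]())
            return false;
    }
    return true;
}

int
main(void)
{
    equalimage_fn idx_text[]  = {text_equalimage};
    equalimage_fn idx_mixed[] = {text_equalimage, numeric_equalimage};

    printf("text index dedup-safe:  %d\n", all_equalimage(idx_text, 1));
    printf("mixed index dedup-safe: %d\n", all_equalimage(idx_mixed, 2));
    return 0;
}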

◆ _bt_binsrch_insert()

OffsetNumber _bt_binsrch_insert ( Relation  rel,
BTInsertState  insertstate 
)

Definition at line 452 of file nbtsearch.c.

References _bt_binsrch_posting(), _bt_compare(), Assert, BTInsertStateData::bounds_valid, BTInsertStateData::buf, BufferGetPage, InvalidOffsetNumber, BTInsertStateData::itup_key, sort-test::key, BTInsertStateData::low, BTScanInsertData::nextkey, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber, PageGetSpecialPointer, BTInsertStateData::postingoff, BTScanInsertData::scantid, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_check_unique(), _bt_findinsertloc(), and bt_rootdescend().

453 {
454  BTScanInsert key = insertstate->itup_key;
455  Page page;
456  BTPageOpaque opaque;
457  OffsetNumber low,
458  high,
459  stricthigh;
460  int32 result,
461  cmpval;
462 
463  page = BufferGetPage(insertstate->buf);
464  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
465 
466  Assert(P_ISLEAF(opaque));
467  Assert(!key->nextkey);
468  Assert(insertstate->postingoff == 0);
469 
470  if (!insertstate->bounds_valid)
471  {
472  /* Start new binary search */
473  low = P_FIRSTDATAKEY(opaque);
474  high = PageGetMaxOffsetNumber(page);
475  }
476  else
477  {
478  /* Restore result of previous binary search against same page */
479  low = insertstate->low;
480  high = insertstate->stricthigh;
481  }
482 
483  /* If there are no keys on the page, return the first available slot */
484  if (unlikely(high < low))
485  {
486  /* Caller can't reuse bounds */
487  insertstate->low = InvalidOffsetNumber;
488  insertstate->stricthigh = InvalidOffsetNumber;
489  insertstate->bounds_valid = false;
490  return low;
491  }
492 
493  /*
494  * Binary search to find the first key on the page >= scan key. (nextkey
495  * is always false when inserting).
496  *
497  * The loop invariant is: all slots before 'low' are < scan key, all slots
498  * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
499  * maintained to save additional search effort for caller.
500  *
501  * We can fall out when high == low.
502  */
503  if (!insertstate->bounds_valid)
504  high++; /* establish the loop invariant for high */
505  stricthigh = high; /* high initially strictly higher */
506 
507  cmpval = 1; /* !nextkey comparison value */
508 
509  while (high > low)
510  {
511  OffsetNumber mid = low + ((high - low) / 2);
512 
513  /* We have low <= mid < high, so mid points at a real slot */
514 
515  result = _bt_compare(rel, key, page, mid);
516 
517  if (result >= cmpval)
518  low = mid + 1;
519  else
520  {
521  high = mid;
522  if (result != 0)
523  stricthigh = high;
524  }
525 
526  /*
527  * If tuple at offset located by binary search is a posting list whose
528  * TID range overlaps with caller's scantid, perform posting list
529  * binary search to set postingoff for caller. Caller must split the
530  * posting list when postingoff is set. This should happen
531  * infrequently.
532  */
533  if (unlikely(result == 0 && key->scantid != NULL))
534  insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
535  }
536 
537  /*
538  * On a leaf page, a binary search always returns the first key >= scan
539  * key (at least in !nextkey case), which could be the last slot + 1. This
540  * is also the lower bound of cached search.
541  *
542  * stricthigh may also be the last slot + 1, which prevents caller from
543  * using bounds directly, but is still useful to us if we're called a
544  * second time with cached bounds (cached low will be < stricthigh when
545  * that happens).
546  */
547  insertstate->low = low;
548  insertstate->stricthigh = stricthigh;
549  insertstate->bounds_valid = true;
550 
551  return low;
552 }
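
The invariants are the interesting part: 'low' converges on the first slot >= the scan key (the insertion offset), while 'stricthigh' tracks the lowest slot proven strictly greater, so a second insertion on the same page can reuse the cached bounds. A standalone sketch of the same search over a plain int array:

#include <stdio.h>

static int
binsrch_insert(const int *items, int nitems, int key, int *stricthigh)
{
    int low = 0;
    int high = nitems;              /* slots at or after high are >= key */

    *stricthigh = high;
    while (high > low)
    {
        int mid = low + (high - low) / 2;

        if (items[mid] < key)
            low = mid + 1;          /* slots before low are < key */
        else
        {
            high = mid;
            if (items[mid] != key)
                *stricthigh = high; /* proven strictly greater than key */
        }
    }
    return low;                     /* first slot >= key */
}

int
main(void)
{
    int items[] = {10, 20, 20, 30};
    int stricthigh;
    int off = binsrch_insert(items, 4, 20, &stricthigh);

    /* low=1; stricthigh stays 4 because no strictly-greater slot was probed */
    printf("low=%d stricthigh=%d\n", off, stricthigh);
    return 0;
}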

◆ _bt_check_natts()

bool _bt_check_natts ( Relation  rel,
bool  heapkeyspace,
Page  page,
OffsetNumber  offnum 
)

Definition at line 2466 of file nbtutils.c.

References Assert, BT_OFFSET_MASK, BT_PIVOT_HEAP_TID_ATTR, BTreeTupleGetHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPivot(), BTreeTupleIsPosting(), FirstOffsetNumber, INDEX_MAX_KEYS, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, ItemPointerGetOffsetNumber, ItemPointerGetOffsetNumberNoCheck, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_ISLEAF, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, StaticAssertStmt, and IndexTupleData::t_tid.

Referenced by _bt_compare(), and bt_target_page_check().

2467 {
2468  int16 natts = IndexRelationGetNumberOfAttributes(rel);
2469  int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
2470  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2471  IndexTuple itup;
2472  int tupnatts;
2473 
2474  /*
2475  * We cannot reliably test a deleted or half-dead page, since they have
2476  * dummy high keys
2477  */
2478  if (P_IGNORE(opaque))
2479  return true;
2480 
2481  Assert(offnum >= FirstOffsetNumber &&
2482  offnum <= PageGetMaxOffsetNumber(page));
2483 
2484  /*
2485  * Mask allocated for number of keys in index tuple must be able to fit
2486  * maximum possible number of index attributes
2487  */
2488  StaticAssertStmt(BT_OFFSET_MASK >= INDEX_MAX_KEYS,
2489  "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS");
2490 
2491  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2492  tupnatts = BTreeTupleGetNAtts(itup, rel);
2493 
2494  /* !heapkeyspace indexes do not support deduplication */
2495  if (!heapkeyspace && BTreeTupleIsPosting(itup))
2496  return false;
2497 
2498  /* Posting list tuples should never have "pivot heap TID" bit set */
2499  if (BTreeTupleIsPosting(itup) &&
2500  (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
2501  BT_PIVOT_HEAP_TID_ATTR) != 0)
2502  return false;
2503 
2504  /* INCLUDE indexes do not support deduplication */
2505  if (natts != nkeyatts && BTreeTupleIsPosting(itup))
2506  return false;
2507 
2508  if (P_ISLEAF(opaque))
2509  {
2510  if (offnum >= P_FIRSTDATAKEY(opaque))
2511  {
2512  /*
2513  * Non-pivot tuple should never be explicitly marked as a pivot
2514  * tuple
2515  */
2516  if (BTreeTupleIsPivot(itup))
2517  return false;
2518 
2519  /*
2520  * Leaf tuples that are not the page high key (non-pivot tuples)
2521  * should never be truncated. (Note that tupnatts must have been
2522  * inferred, even with a posting list tuple, because only pivot
2523  * tuples store tupnatts directly.)
2524  */
2525  return tupnatts == natts;
2526  }
2527  else
2528  {
2529  /*
2530  * Rightmost page doesn't contain a page high key, so tuple was
2531  * checked above as ordinary leaf tuple
2532  */
2533  Assert(!P_RIGHTMOST(opaque));
2534 
2535  /*
2536  * !heapkeyspace high key tuple contains only key attributes. Note
2537  * that tupnatts will only have been explicitly represented in
2538  * !heapkeyspace indexes that happen to have non-key attributes.
2539  */
2540  if (!heapkeyspace)
2541  return tupnatts == nkeyatts;
2542 
2543  /* Use generic heapkeyspace pivot tuple handling */
2544  }
2545  }
2546  else /* !P_ISLEAF(opaque) */
2547  {
2548  if (offnum == P_FIRSTDATAKEY(opaque))
2549  {
2550  /*
2551  * The first tuple on any internal page (possibly the first after
2552  * its high key) is its negative infinity tuple. Negative
2553  * infinity tuples are always truncated to zero attributes. They
2554  * are a particular kind of pivot tuple.
2555  */
2556  if (heapkeyspace)
2557  return tupnatts == 0;
2558 
2559  /*
2560  * The number of attributes won't be explicitly represented if the
2561  * negative infinity tuple was generated during a page split that
2562  * occurred with a version of Postgres before v11. There must be
2563  * a problem when there is an explicit representation that is
2564  * non-zero, or when there is no explicit representation and the
2565  * tuple is evidently not a pre-pg_upgrade tuple.
2566  *
2567  * Prior to v11, downlinks always had P_HIKEY as their offset.
2568  * Accept that as an alternative indication of a valid
2569  * !heapkeyspace negative infinity tuple.
2570  */
2571  return tupnatts == 0 ||
2572  ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY;
2573  }
2574  else
2575  {
2576  /*
2577  * !heapkeyspace downlink tuple with separator key contains only
2578  * key attributes. Note that tupnatts will only have been
2579  * explicitly represented in !heapkeyspace indexes that happen to
2580  * have non-key attributes.
2581  */
2582  if (!heapkeyspace)
2583  return tupnatts == nkeyatts;
2584 
2585  /* Use generic heapkeyspace pivot tuple handling */
2586  }
2587 
2588  }
2589 
2590  /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
2591  Assert(heapkeyspace);
2592 
2593  /*
2594  * Explicit representation of the number of attributes is mandatory with
2595  * heapkeyspace index pivot tuples, regardless of whether or not there are
2596  * non-key attributes.
2597  */
2598  if (!BTreeTupleIsPivot(itup))
2599  return false;
2600 
2601  /* Pivot tuple should not use posting list representation (redundant) */
2602  if (BTreeTupleIsPosting(itup))
2603  return false;
2604 
2605  /*
2606  * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
2607  * when any other key attribute is truncated
2608  */
2609  if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
2610  return false;
2611 
2612  /*
2613  * Pivot tuple must have at least one untruncated key attribute (minus
2614  * infinity pivot tuples are the only exception). Pivot tuples can never
2615  * represent that there is a value present for a key attribute that
2616  * exceeds pg_index.indnkeyatts for the index.
2617  */
2618  return tupnatts > 0 && tupnatts <= nkeyatts;
2619 }

◆ _bt_check_third_page()

void _bt_check_third_page ( Relation  rel,
Relation  heap,
bool  needheaptidspace,
Page  page,
IndexTuple  newtup 
)

Definition at line 2634 of file nbtutils.c.

References BTMaxItemSize, BTMaxItemSizeNoHeapTid, BTREE_NOVAC_VERSION, BTREE_VERSION, BTreeTupleGetHeapTID(), elog, ereport, errcode(), errdetail(), errhint(), errmsg(), ERROR, errtableconstraint(), IndexTupleSize, ItemPointerGetBlockNumber, ItemPointerGetOffsetNumber, MAXALIGN, P_ISLEAF, PageGetSpecialPointer, and RelationGetRelationName.

Referenced by _bt_buildadd(), and _bt_findinsertloc().

2636 {
2637  Size itemsz;
2638  BTPageOpaque opaque;
2639 
2640  itemsz = MAXALIGN(IndexTupleSize(newtup));
2641 
2642  /* Double check item size against limit */
2643  if (itemsz <= BTMaxItemSize(page))
2644  return;
2645 
2646  /*
2647  * Tuple is probably too large to fit on page, but it's possible that the
2648  * index uses version 2 or version 3, or that page is an internal page, in
2649  * which case a slightly higher limit applies.
2650  */
2651  if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page))
2652  return;
2653 
2654  /*
2655  * Internal page insertions cannot fail here, because that would mean that
2656  * an earlier leaf level insertion that should have failed didn't
2657  */
2658  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2659  if (!P_ISLEAF(opaque))
2660  elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
2661  itemsz, RelationGetRelationName(rel));
2662 
2663  ereport(ERROR,
2664  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2665  errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
2666  itemsz,
2667  needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
2668  needheaptidspace ? BTMaxItemSize(page) :
2669  BTMaxItemSizeNoHeapTid(page),
2670  RelationGetRelationName(rel)),
2671  errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
2672  ItemPointerGetBlockNumber(&newtup->t_tid),
2673  ItemPointerGetOffsetNumber(&newtup->t_tid),
2674  RelationGetRelationName(heap)),
2675  errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
2676  "Consider a function index of an MD5 hash of the value, "
2677  "or use full text indexing."),
2678  errtableconstraint(heap, RelationGetRelationName(rel))));
2679 }

◆ _bt_checkkeys()

bool _bt_checkkeys ( IndexScanDesc  scan,
IndexTuple  tuple,
int  tupnatts,
ScanDirection  dir,
bool continuescan 
)

Definition at line 1355 of file nbtutils.c.

References _bt_check_rowcompare(), Assert, BTreeTupleGetNAtts, BTreeTupleIsPivot(), DatumGetBool, FunctionCall2Coll(), index_getattr, IndexScanDescData::indexRelation, sort-test::key, BTScanOpaqueData::keyData, BTScanOpaqueData::numberOfKeys, IndexScanDescData::opaque, RelationGetDescr, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_NULLS_FIRST, SK_BT_REQBKWD, SK_BT_REQFWD, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, SK_ISNULL, SK_ROW_HEADER, SK_SEARCHNOTNULL, SK_SEARCHNULL, and test().

Referenced by _bt_readpage().

1357 {
1358  TupleDesc tupdesc;
1359  BTScanOpaque so;
1360  int keysz;
1361  int ikey;
1362  ScanKey key;
1363 
1364  Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
1365 
1366  *continuescan = true; /* default assumption */
1367 
1368  tupdesc = RelationGetDescr(scan->indexRelation);
1369  so = (BTScanOpaque) scan->opaque;
1370  keysz = so->numberOfKeys;
1371 
1372  for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
1373  {
1374  Datum datum;
1375  bool isNull;
1376  Datum test;
1377 
1378  if (key->sk_attno > tupnatts)
1379  {
1380  /*
1381  * This attribute is truncated (must be high key). The value for
1382  * this attribute in the first non-pivot tuple on the page to the
1383  * right could be any possible value. Assume that truncated
1384  * attribute passes the qual.
1385  */
1386  Assert(ScanDirectionIsForward(dir));
1387  Assert(BTreeTupleIsPivot(tuple));
1388  continue;
1389  }
1390 
1391  /* row-comparison keys need special processing */
1392  if (key->sk_flags & SK_ROW_HEADER)
1393  {
1394  if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir,
1395  continuescan))
1396  continue;
1397  return false;
1398  }
1399 
1400  datum = index_getattr(tuple,
1401  key->sk_attno,
1402  tupdesc,
1403  &isNull);
1404 
1405  if (key->sk_flags & SK_ISNULL)
1406  {
1407  /* Handle IS NULL/NOT NULL tests */
1408  if (key->sk_flags & SK_SEARCHNULL)
1409  {
1410  if (isNull)
1411  continue; /* tuple satisfies this qual */
1412  }
1413  else
1414  {
1415  Assert(key->sk_flags & SK_SEARCHNOTNULL);
1416  if (!isNull)
1417  continue; /* tuple satisfies this qual */
1418  }
1419 
1420  /*
1421  * Tuple fails this qual. If it's a required qual for the current
1422  * scan direction, then we can conclude no further tuples will
1423  * pass, either.
1424  */
1425  if ((key->sk_flags & SK_BT_REQFWD) &&
1426  ScanDirectionIsForward(dir))
1427  *continuescan = false;
1428  else if ((key->sk_flags & SK_BT_REQBKWD) &&
1429  ScanDirectionIsBackward(dir))
1430  *continuescan = false;
1431 
1432  /*
1433  * In any case, this indextuple doesn't match the qual.
1434  */
1435  return false;
1436  }
1437 
1438  if (isNull)
1439  {
1440  if (key->sk_flags & SK_BT_NULLS_FIRST)
1441  {
1442  /*
1443  * Since NULLs are sorted before non-NULLs, we know we have
1444  * reached the lower limit of the range of values for this
1445  * index attr. On a backward scan, we can stop if this qual
1446  * is one of the "must match" subset. We can stop regardless
1447  * of whether the qual is > or <, so long as it's required,
1448  * because it's not possible for any future tuples to pass. On
1449  * a forward scan, however, we must keep going, because we may
1450  * have initially positioned to the start of the index.
1451  */
1452  if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
1453  ScanDirectionIsBackward(dir))
1454  *continuescan = false;
1455  }
1456  else
1457  {
1458  /*
1459  * Since NULLs are sorted after non-NULLs, we know we have
1460  * reached the upper limit of the range of values for this
1461  * index attr. On a forward scan, we can stop if this qual is
1462  * one of the "must match" subset. We can stop regardless of
1463  * whether the qual is > or <, so long as it's required,
1464  * because it's not possible for any future tuples to pass. On
1465  * a backward scan, however, we must keep going, because we
1466  * may have initially positioned to the end of the index.
1467  */
1468  if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
1469  ScanDirectionIsForward(dir))
1470  *continuescan = false;
1471  }
1472 
1473  /*
1474  * In any case, this indextuple doesn't match the qual.
1475  */
1476  return false;
1477  }
1478 
1479  test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
1480  datum, key->sk_argument);
1481 
1482  if (!DatumGetBool(test))
1483  {
1484  /*
1485  * Tuple fails this qual. If it's a required qual for the current
1486  * scan direction, then we can conclude no further tuples will
1487  * pass, either.
1488  *
1489  * Note: because we stop the scan as soon as any required equality
1490  * qual fails, it is critical that equality quals be used for the
1491  * initial positioning in _bt_first() when they are available. See
1492  * comments in _bt_first().
1493  */
1494  if ((key->sk_flags & SK_BT_REQFWD) &&
1495  ScanDirectionIsForward(dir))
1496  *continuescan = false;
1497  else if ((key->sk_flags & SK_BT_REQBKWD) &&
1498  ScanDirectionIsBackward(dir))
1499  *continuescan = false;
1500 
1501  /*
1502  * In any case, this indextuple doesn't match the qual.
1503  */
1504  return false;
1505  }
1506  }
1507 
1508  /* If we get here, the tuple passes all index quals. */
1509  return true;
1510 }
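
The essential control flow is a conjunction with direction-aware early termination: any failing qual rejects the tuple, but only a qual that is required for the current scan direction lets the caller end the whole scan. A standalone sketch, with quals reduced to predicates over an int and only the forward direction modeled:

#include <stdbool.h>
#include <stdio.h>

typedef struct Qual
{
    bool (*test)(int value);
    bool required_fwd;          /* required to continue a forward scan? */
} Qual;

static bool lt30(int v) { return v < 30; }
static bool even(int v) { return v % 2 == 0; }

static bool
check_keys(int value, const Qual *quals, int nquals, bool *continuescan)
{
    *continuescan = true;       /* default assumption */
    for (int i = 0; i < nquals; i++)
    {
        if (!quals[i].test(value))
        {
            if (quals[i].required_fwd)
                *continuescan = false;  /* no later tuple can pass */
            return false;
        }
    }
    return true;
}

int
main(void)
{
    Qual quals[] = {{lt30, true}, {even, false}};

    for (int v = 26;; v++)
    {
        bool continuescan;
        bool match = check_keys(v, quals, 2, &continuescan);

        printf("value=%d match=%d\n", v, match);
        if (!continuescan)
            break;              /* v=30 fails the required "< 30" qual */
    }
    return 0;
}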

◆ _bt_checkpage()

void _bt_checkpage ( Relation  rel,
Buffer  buf 
)

Definition at line 718 of file nbtpage.c.

References BufferGetBlockNumber(), BufferGetPage, ereport, errcode(), errhint(), errmsg(), ERROR, MAXALIGN, PageGetSpecialSize, PageIsNew, and RelationGetRelationName.

Referenced by _bt_getbuf(), _bt_relandgetbuf(), _bt_search_insert(), btvacuumpage(), and palloc_btree_page().

719 {
720  Page page = BufferGetPage(buf);
721 
722  /*
723  * ReadBuffer verifies that every newly-read page passes
724  * PageHeaderIsValid, which means it either contains a reasonably sane
725  * page header or is all-zero. We have to defend against the all-zero
726  * case, however.
727  */
728  if (PageIsNew(page))
729  ereport(ERROR,
730  (errcode(ERRCODE_INDEX_CORRUPTED),
731  errmsg("index \"%s\" contains unexpected zero page at block %u",
732  RelationGetRelationName(rel),
733  BufferGetBlockNumber(buf)),
734  errhint("Please REINDEX it.")));
735 
736  /*
737  * Additionally check that the special area looks sane.
738  */
739  if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
740  ereport(ERROR,
741  (errcode(ERRCODE_INDEX_CORRUPTED),
742  errmsg("index \"%s\" contains corrupted page at block %u",
743  RelationGetRelationName(rel),
744  BufferGetBlockNumber(buf)),
745  errhint("Please REINDEX it.")));
746 }
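
Both checks are cheap structural tests: reject a page that is still all zeroes, and reject one whose special area is not exactly the size of BTPageOpaqueData. A simplified standalone sketch follows; the page layout here is a toy stand-in, not the bufpage.h one, and the names are illustrative.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define BLCKSZ 8192
#define SPECIAL_SIZE 16     /* assumed MAXALIGN(sizeof(BTPageOpaqueData)) */

typedef struct PageHeader
{
    unsigned short pd_special;  /* offset of the special space */
} PageHeader;

static bool
check_page(const char *page)
{
    static const char zeroes[BLCKSZ];   /* all-zero reference page */
    const PageHeader *hdr = (const PageHeader *) page;

    if (memcmp(page, zeroes, BLCKSZ) == 0)
    {
        fprintf(stderr, "unexpected zero page\n");
        return false;       /* _bt_checkpage() would ereport(ERROR) */
    }
    if (BLCKSZ - hdr->pd_special != SPECIAL_SIZE)
    {
        fprintf(stderr, "corrupted page: bad special size\n");
        return false;
    }
    return true;
}

int
main(void)
{
    static char page[BLCKSZ];   /* zero-initialized */

    check_page(page);                           /* rejected: zero page */
    ((PageHeader *) page)->pd_special = BLCKSZ - SPECIAL_SIZE;
    printf("ok: %d\n", check_page(page));       /* prints ok: 1 */
    return 0;
}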

◆ _bt_compare()

int32 _bt_compare ( Relation  rel,
BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)

Definition at line 649 of file nbtsearch.c.

References _bt_check_natts(), BTScanInsertData::allequalimage, Assert, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPosting(), DatumGetInt32, FunctionCall2Coll(), BTScanInsertData::heapkeyspace, i, index_getattr, IndexRelationGetNumberOfKeyAttributes, INVERT_COMPARE_RESULT, ItemPointerCompare(), BTScanInsertData::keysz, Min, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, BTScanInsertData::pivotsearch, RelationGetDescr, BTScanInsertData::scankeys, BTScanInsertData::scantid, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_binsrch_insert(), _bt_check_unique(), _bt_findinsertloc(), _bt_moveright(), _bt_search_insert(), bt_rootdescend(), invariant_g_offset(), invariant_l_nontarget_offset(), invariant_l_offset(), and invariant_leq_offset().

653 {
654  TupleDesc itupdesc = RelationGetDescr(rel);
655  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
656  IndexTuple itup;
657  ItemPointer heapTid;
658  ScanKey scankey;
659  int ncmpkey;
660  int ntupatts;
661  int32 result;
662 
663  Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
664  Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
665  Assert(key->heapkeyspace || key->scantid == NULL);
666 
667  /*
668  * Force result ">" if target item is first data item on an internal page
669  * --- see NOTE above.
670  */
671  if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
672  return 1;
673 
674  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
675  ntupatts = BTreeTupleGetNAtts(itup, rel);
676 
677  /*
678  * The scan key is set up with the attribute number associated with each
679  * term in the key. It is important that, if the index is multi-key, the
680  * scan contain the first k key attributes, and that they be in order. If
681  * you think about how multi-key ordering works, you'll understand why
682  * this is.
683  *
684  * We don't test for violation of this condition here, however. The
685  * initial setup for the index scan had better have gotten it right (see
686  * _bt_first).
687  */
688 
689  ncmpkey = Min(ntupatts, key->keysz);
690  Assert(key->heapkeyspace || ncmpkey == key->keysz);
691  Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
692  scankey = key->scankeys;
693  for (int i = 1; i <= ncmpkey; i++)
694  {
695  Datum datum;
696  bool isNull;
697 
698  datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
699 
700  if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
701  {
702  if (isNull)
703  result = 0; /* NULL "=" NULL */
704  else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
705  result = -1; /* NULL "<" NOT_NULL */
706  else
707  result = 1; /* NULL ">" NOT_NULL */
708  }
709  else if (isNull) /* key is NOT_NULL and item is NULL */
710  {
711  if (scankey->sk_flags & SK_BT_NULLS_FIRST)
712  result = 1; /* NOT_NULL ">" NULL */
713  else
714  result = -1; /* NOT_NULL "<" NULL */
715  }
716  else
717  {
718  /*
719  * The sk_func needs to be passed the index value as left arg and
720  * the sk_argument as right arg (they might be of different
721  * types). Since it is convenient for callers to think of
722  * _bt_compare as comparing the scankey to the index item, we have
723  * to flip the sign of the comparison result. (Unless it's a DESC
724  * column, in which case we *don't* flip the sign.)
725  */
726  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
727  scankey->sk_collation,
728  datum,
729  scankey->sk_argument));
730 
731  if (!(scankey->sk_flags & SK_BT_DESC))
732  INVERT_COMPARE_RESULT(result);
733  }
734 
735  /* if the keys are unequal, return the difference */
736  if (result != 0)
737  return result;
738 
739  scankey++;
740  }
741 
742  /*
743  * All non-truncated attributes (other than heap TID) were found to be
744  * equal. Treat truncated attributes as minus infinity when scankey has a
745  * key attribute value that would otherwise be compared directly.
746  *
747  * Note: it doesn't matter if ntupatts includes non-key attributes;
748  * scankey won't, so explicitly excluding non-key attributes isn't
749  * necessary.
750  */
751  if (key->keysz > ntupatts)
752  return 1;
753 
754  /*
755  * Use the heap TID attribute and scantid to try to break the tie. The
756  * rules are the same as any other key attribute -- only the
757  * representation differs.
758  */
759  heapTid = BTreeTupleGetHeapTID(itup);
760  if (key->scantid == NULL)
761  {
762  /*
763  * Most searches have a scankey that is considered greater than a
764  * truncated pivot tuple if and when the scankey has equal values for
765  * attributes up to and including the least significant untruncated
766  * attribute in tuple.
767  *
768  * For example, if an index has the minimum two attributes (single
769  * user key attribute, plus heap TID attribute), and a page's high key
770  * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
771  * will not descend to the page to the left. The search will descend
772  * right instead. The truncated attribute in pivot tuple means that
773  * all non-pivot tuples on the page to the left are strictly < 'foo',
774  * so it isn't necessary to descend left. In other words, search
775  * doesn't have to descend left because it isn't interested in a match
776  * that has a heap TID value of -inf.
777  *
778  * However, some searches (pivotsearch searches) actually require that
779  * we descend left when this happens. -inf is treated as a possible
780  * match for omitted scankey attribute(s). This is needed by page
781  * deletion, which must re-find leaf pages that are targets for
782  * deletion using their high keys.
783  *
784  * Note: the heap TID part of the test ensures that scankey is being
785  * compared to a pivot tuple with one or more truncated key
786  * attributes.
787  *
788  * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
789  * left here, since they have no heap TID attribute (and cannot have
790  * any -inf key values in any case, since truncation can only remove
791  * non-key attributes). !heapkeyspace searches must always be
792  * prepared to deal with matches on both sides of the pivot once the
793  * leaf level is reached.
794  */
795  if (key->heapkeyspace && !key->pivotsearch &&
796  key->keysz == ntupatts && heapTid == NULL)
797  return 1;
798 
799  /* All provided scankey arguments found to be equal */
800  return 0;
801  }
802 
803  /*
804  * Treat truncated heap TID as minus infinity, since scankey has a key
805  * attribute value (scantid) that would otherwise be compared directly
806  */
807  Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
808  if (heapTid == NULL)
809  return 1;
810 
811  /*
812  * Scankey must be treated as equal to a posting list tuple if its scantid
813  * value falls within the range of the posting list. In all other cases
814  * there can only be a single heap TID value, which is compared directly
815  * with scantid.
816  */
817  Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
818  result = ItemPointerCompare(key->scantid, heapTid);
819  if (result <= 0 || !BTreeTupleIsPosting(itup))
820  return result;
821  else
822  {
823  result = ItemPointerCompare(key->scantid,
824  BTreeTupleGetMaxHeapTID(itup));
825  if (result > 0)
826  return 1;
827  }
828 
829  return 0;
830 }
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:52
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:513
#define DatumGetInt32(X)
Definition: postgres.h:472
#define RelationGetDescr(relation)
Definition: rel.h:482
ItemPointer scantid
Definition: nbtree.h:671
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:244
#define Min(x, y)
Definition: c.h:920
bool allequalimage
Definition: nbtree.h:667
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1152
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:452
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
signed int int32
Definition: c.h:355
IndexTupleData * IndexTuple
Definition: itup.h:53
FmgrInfo sk_func
Definition: skey.h:71
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:475
#define SK_ISNULL
Definition: skey.h:115
bool pivotsearch
Definition: nbtree.h:670
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:963
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:539
uintptr_t Datum
Definition: postgres.h:367
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:369
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:2466
int sk_flags
Definition: skey.h:66
#define Assert(condition)
Definition: c.h:738
#define SK_BT_DESC
Definition: nbtree.h:962
bool heapkeyspace
Definition: nbtree.h:666
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define index_getattr(tup, attnum, tupleDesc, isnull)
Definition: itup.h:100
ScanKeyData scankeys[INDEX_MAX_KEYS]
Definition: nbtree.h:673
Oid sk_collation
Definition: skey.h:70
int i
Datum sk_argument
Definition: skey.h:72
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1088
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
AttrNumber sk_attno
Definition: skey.h:67
#define P_ISLEAF(opaque)
Definition: nbtree.h:214
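
A minimal sketch of how the three-way result convention is consumed by the binary-search callers listed under "Referenced by" above; the low/high bookkeeping is schematic, not the exact _bt_binsrch() code:

    /* Sketch only: interpreting _bt_compare()'s three-way result. */
    int32  cmp = _bt_compare(rel, key, page, offnum);

    if (cmp > 0)
        low = OffsetNumberNext(offnum);     /* scankey > tuple: search right */
    else
        high = offnum;                      /* scankey <= tuple: search left */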

◆ _bt_dedup_finish_pending()

Size _bt_dedup_finish_pending ( Page  newpage,
BTDedupState  state 
)

Definition at line 431 of file nbtdedup.c.

References _bt_form_posting(), Assert, BTDedupStateData::base, BTDedupInterval::baseoff, BTDedupStateData::baseoff, elog, ERROR, BTDedupStateData::htids, IndexTupleSize, BTDedupStateData::intervals, InvalidOffsetNumber, MAXALIGN, BTDedupStateData::nhtids, BTDedupStateData::nintervals, BTDedupInterval::nitems, BTDedupStateData::nitems, OffsetNumberNext, PageAddItem, PageGetMaxOffsetNumber, pfree(), and BTDedupStateData::phystupsize.

Referenced by _bt_dedup_one_page(), and btree_xlog_dedup().

432 {
433  OffsetNumber tupoff;
434  Size tuplesz;
435  Size spacesaving;
436 
437  Assert(state->nitems > 0);
438  Assert(state->nitems <= state->nhtids);
439  Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
440 
441  tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
442  if (state->nitems == 1)
443  {
444  /* Use original, unchanged base tuple */
445  tuplesz = IndexTupleSize(state->base);
446  if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
447  false, false) == InvalidOffsetNumber)
448  elog(ERROR, "deduplication failed to add tuple to page");
449 
450  spacesaving = 0;
451  }
452  else
453  {
454  IndexTuple final;
455 
456  /* Form a tuple with a posting list */
457  final = _bt_form_posting(state->base, state->htids, state->nhtids);
458  tuplesz = IndexTupleSize(final);
459  Assert(tuplesz <= state->maxpostingsize);
460 
461  /* Save final number of items for posting list */
462  state->intervals[state->nintervals].nitems = state->nitems;
463 
464  Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
465  if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
466  false) == InvalidOffsetNumber)
467  elog(ERROR, "deduplication failed to add tuple to page");
468 
469  pfree(final);
470  spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
471  /* Increment nintervals, since we wrote a new posting list tuple */
472  state->nintervals++;
473  Assert(spacesaving > 0 && spacesaving < BLCKSZ);
474  }
475 
476  /* Reset state for next pending posting list */
477  state->nhtids = 0;
478  state->nitems = 0;
479  state->phystupsize = 0;
480 
481  return spacesaving;
482 }
IndexTuple base
Definition: nbtree.h:752
IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
Definition: nbtdedup.c:601
OffsetNumber baseoff
Definition: nbtree.h:753
Pointer Item
Definition: item.h:17
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:416
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
uint16 OffsetNumber
Definition: off.h:24
ItemPointer htids
Definition: nbtree.h:757
void pfree(void *pointer)
Definition: mcxt.c:1056
Size phystupsize
Definition: nbtree.h:760
#define ERROR
Definition: elog.h:43
BTDedupInterval intervals[MaxIndexTuplesPerPage]
Definition: nbtree.h:769
struct ItemIdData ItemIdData
#define InvalidOffsetNumber
Definition: off.h:26
uint16 nitems
Definition: nbtree.h:724
#define Assert(condition)
Definition: c.h:738
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
size_t Size
Definition: c.h:466
#define MAXALIGN(LEN)
Definition: c.h:691
#define elog(elevel,...)
Definition: elog.h:214
OffsetNumber baseoff
Definition: nbtree.h:723
#define IndexTupleSize(itup)
Definition: itup.h:71
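
To make the space accounting concrete, a worked example with hypothetical numbers (single-column int4 index, 8-byte MAXALIGN); actual values depend on key type and alignment:

    /*
     * Three duplicate tuples, each an 8-byte header plus a 4-byte key
     * (index_form_tuple() stores a MAXALIGN'd size, so IndexTupleSize = 16):
     *   phystupsize = 3 * (16 + sizeof(ItemIdData)) = 3 * 20 = 60
     *
     * Replacement posting tuple carrying the 3 six-byte heap TIDs:
     *   tuplesz     = MAXALIGN(16 + 3 * 6) = MAXALIGN(34)   = 40
     *   spacesaving = 60 - (40 + sizeof(ItemIdData))        = 16 bytes
     */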

◆ _bt_dedup_one_page()

void _bt_dedup_one_page ( Relation  rel,
Buffer  buf,
Relation  heapRel,
IndexTuple  newitem,
Size  newitemsz,
bool  checkingunique 
)

Definition at line 56 of file nbtdedup.c.

References _bt_dedup_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_delitems_delete(), _bt_do_singleval(), _bt_keep_natts_fast(), _bt_singleval_fillfactor(), Assert, BTDedupStateData::base, BTDedupStateData::baseoff, BTDedupStateData::basetupsize, BTMaxItemSize, BTP_HAS_GARBAGE, BTPageOpaqueData::btpo_flags, BufferGetPage, BTDedupStateData::deduplicate, elog, END_CRIT_SECTION, ERROR, BTDedupStateData::htids, INDEX_SIZE_MASK, IndexRelationGetNumberOfKeyAttributes, BTDedupStateData::intervals, InvalidOffsetNumber, ItemIdGetLength, ItemIdIsDead, MarkBufferDirty(), MaxIndexTuplesPerPage, BTDedupStateData::maxpostingsize, Min, BTDedupStateData::nhtids, xl_btree_dedup::nintervals, BTDedupStateData::nintervals, BTDedupStateData::nitems, OffsetNumberNext, P_FIRSTDATAKEY, P_HAS_GARBAGE, P_HIKEY, P_RIGHTMOST, PageAddItem, PageGetExactFreeSpace(), PageGetFreeSpace(), PageGetItem, PageGetItemId, PageGetLSN, PageGetMaxOffsetNumber, PageGetSpecialPointer, PageGetTempPageCopySpecial(), PageRestoreTempPage(), PageSetLSN, palloc(), pfree(), BTDedupStateData::phystupsize, REGBUF_STANDARD, RelationNeedsWAL, SizeOfBtreeDedup, START_CRIT_SECTION, XLOG_BTREE_DEDUP, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_findinsertloc().

58 {
59  OffsetNumber offnum,
60  minoff,
61  maxoff;
62  Page page = BufferGetPage(buf);
63  BTPageOpaque opaque;
64  Page newpage;
65  int newpagendataitems = 0;
66  OffsetNumber deletable[MaxIndexTuplesPerPage];
67  BTDedupState state;
68  int ndeletable = 0;
69  Size pagesaving = 0;
70  bool singlevalstrat = false;
71  int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
72 
73  /*
74  * We can't assume that there are no LP_DEAD items. For one thing, VACUUM
75  * will clear the BTP_HAS_GARBAGE hint without reliably removing items
76  * that are marked LP_DEAD. We don't want to unnecessarily unset LP_DEAD
77  * bits when deduplicating items. Allowing it would be correct, though
78  * wasteful.
79  */
80  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
81  minoff = P_FIRSTDATAKEY(opaque);
82  maxoff = PageGetMaxOffsetNumber(page);
83  for (offnum = minoff;
84  offnum <= maxoff;
85  offnum = OffsetNumberNext(offnum))
86  {
87  ItemId itemid = PageGetItemId(page, offnum);
88 
89  if (ItemIdIsDead(itemid))
90  deletable[ndeletable++] = offnum;
91  }
92 
93  if (ndeletable > 0)
94  {
95  _bt_delitems_delete(rel, buf, deletable, ndeletable, heapRel);
96 
97  /*
98  * Return when a split will be avoided. This is equivalent to
99  * avoiding a split using the usual _bt_vacuum_one_page() path.
100  */
101  if (PageGetFreeSpace(page) >= newitemsz)
102  return;
103 
104  /*
105  * Reconsider number of items on page, in case _bt_delitems_delete()
106  * managed to delete an item or two
107  */
108  minoff = P_FIRSTDATAKEY(opaque);
109  maxoff = PageGetMaxOffsetNumber(page);
110  }
111 
112  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
113  newitemsz += sizeof(ItemIdData);
114 
115  /*
116  * By here, it's clear that deduplication will definitely be attempted.
117  * Initialize deduplication state.
118  *
119  * It would be possible for maxpostingsize (limit on posting list tuple
120  * size) to be set to one third of the page. However, it seems like a
121  * good idea to limit the size of posting lists to one sixth of a page.
122  * That ought to leave us with a good split point when pages full of
123  * duplicates can be split several times.
124  */
125  state = (BTDedupState) palloc(sizeof(BTDedupStateData));
126  state->deduplicate = true;
127  state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
128  /* Metadata about base tuple of current pending posting list */
129  state->base = NULL;
130  state->baseoff = InvalidOffsetNumber;
131  state->basetupsize = 0;
132  /* Metadata about current pending posting list TIDs */
133  state->htids = palloc(state->maxpostingsize);
134  state->nhtids = 0;
135  state->nitems = 0;
136  /* Size of all physical tuples to be replaced by pending posting list */
137  state->phystupsize = 0;
138  /* nintervals should be initialized to zero */
139  state->nintervals = 0;
140 
141  /* Determine if "single value" strategy should be used */
142  if (!checkingunique)
143  singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
144 
145  /*
146  * Deduplicate items from page, and write them to newpage.
147  *
148  * Copy the original page's LSN into newpage copy. This will become the
149  * updated version of the page. We need this because XLogInsert will
150  * examine the LSN and possibly dump it in a page image.
151  */
152  newpage = PageGetTempPageCopySpecial(page);
153  PageSetLSN(newpage, PageGetLSN(page));
154 
155  /* Copy high key, if any */
156  if (!P_RIGHTMOST(opaque))
157  {
158  ItemId hitemid = PageGetItemId(page, P_HIKEY);
159  Size hitemsz = ItemIdGetLength(hitemid);
160  IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid);
161 
162  if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
163  false, false) == InvalidOffsetNumber)
164  elog(ERROR, "deduplication failed to add highkey");
165  }
166 
167  for (offnum = minoff;
168  offnum <= maxoff;
169  offnum = OffsetNumberNext(offnum))
170  {
171  ItemId itemid = PageGetItemId(page, offnum);
172  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
173 
174  Assert(!ItemIdIsDead(itemid));
175 
176  if (offnum == minoff)
177  {
178  /*
179  * No previous/base tuple for the data item -- use the data item
180  * as base tuple of pending posting list
181  */
182  _bt_dedup_start_pending(state, itup, offnum);
183  }
184  else if (state->deduplicate &&
185  _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
186  _bt_dedup_save_htid(state, itup))
187  {
188  /*
189  * Tuple is equal to base tuple of pending posting list. Heap
190  * TID(s) for itup have been saved in state.
191  */
192  }
193  else
194  {
195  /*
196  * Tuple is not equal to pending posting list tuple, or
197  * _bt_dedup_save_htid() opted to not merge current item into
198  * pending posting list for some other reason (e.g., adding more
199  * TIDs would have caused posting list to exceed current
200  * maxpostingsize).
201  *
202  * If state contains pending posting list with more than one item,
203  * form new posting tuple, and actually update the page. Else
204  * reset the state and move on without modifying the page.
205  */
206  pagesaving += _bt_dedup_finish_pending(newpage, state);
207  newpagendataitems++;
208 
209  if (singlevalstrat)
210  {
211  /*
212  * Single value strategy's extra steps.
213  *
214  * Lower maxpostingsize for sixth and final item that might be
215  * deduplicated by current deduplication pass. When sixth
216  * item formed/observed, stop deduplicating items.
217  *
218  * Note: It's possible that this will be reached even when
219  * current deduplication pass has yet to merge together some
220  * existing items. It doesn't matter whether or not the
221  * current call generated the maxpostingsize-capped duplicate
222  * tuples at the start of the page.
223  */
224  if (newpagendataitems == 5)
225  _bt_singleval_fillfactor(page, state, newitemsz);
226  else if (newpagendataitems == 6)
227  {
228  state->deduplicate = false;
229  singlevalstrat = false; /* won't be back here */
230  }
231  }
232 
233  /* itup starts new pending posting list */
234  _bt_dedup_start_pending(state, itup, offnum);
235  }
236  }
237 
238  /* Handle the last item */
239  pagesaving += _bt_dedup_finish_pending(newpage, state);
240  newpagendataitems++;
241 
242  /*
243  * If no items suitable for deduplication were found, newpage must be
244  * exactly the same as the original page, so just return from function.
245  *
246  * We could determine whether or not to proceed on the basis the space
247  * savings being sufficient to avoid an immediate page split instead. We
248  * don't do that because there is some small value in nbtsplitloc.c always
249  * operating against a page that is fully deduplicated (apart from
250  * newitem). Besides, most of the cost has already been paid.
251  */
252  if (state->nintervals == 0)
253  {
254  /* cannot leak memory here */
255  pfree(newpage);
256  pfree(state->htids);
257  pfree(state);
258  return;
259  }
260 
261  /*
262  * By here, it's clear that deduplication will definitely go ahead.
263  *
264  * Clear the BTP_HAS_GARBAGE page flag in the unlikely event that it is
265  * still falsely set, just to keep things tidy. (We can't rely on
266  * _bt_vacuum_one_page() having done this already, and we can't rely on a
267  * page split or VACUUM getting to it in the near future.)
268  */
269  if (P_HAS_GARBAGE(opaque))
270  {
271  BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(newpage);
272 
273  nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
274  }
275 
276  START_CRIT_SECTION();
277 
278  PageRestoreTempPage(newpage, page);
279 
280  MarkBufferDirty(buf);
281  /* XLOG stuff */
282  if (RelationNeedsWAL(rel))
283  {
284  XLogRecPtr recptr;
285  xl_btree_dedup xlrec_dedup;
286 
287  xlrec_dedup.nintervals = state->nintervals;
288 
289  XLogBeginInsert();
291  XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);
292 
293  /*
294  * The intervals array is not in the buffer, but pretend that it is.
295  * When XLogInsert stores the whole buffer, the array need not be
296  * stored too.
297  */
298  XLogRegisterBufData(0, (char *) state->intervals,
299  state->nintervals * sizeof(BTDedupInterval));
300 
301  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
302 
303  PageSetLSN(page, recptr);
304  }
305 
306  END_CRIT_SECTION();
307 
308  /* Local space accounting should agree with page accounting */
309  Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
310 
311  /* cannot leak memory here */
312  pfree(state->htids);
313  pfree(state);
314 }
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:362
IndexTuple base
Definition: nbtree.h:752
void PageRestoreTempPage(Page tempPage, Page oldPage)
Definition: bufpage.c:403
uint16 nintervals
Definition: nbtxlog.h:172
OffsetNumber baseoff
Definition: nbtree.h:753
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1468
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:214
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:244
#define Min(x, y)
Definition: c.h:920
#define END_CRIT_SECTION()
Definition: miscadmin.h:134
Pointer Item
Definition: item.h:17
#define INDEX_SIZE_MASK
Definition: itup.h:65
#define P_HAS_GARBAGE(opaque)
Definition: nbtree.h:220
#define START_CRIT_SECTION()
Definition: miscadmin.h:132
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:416
void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, OffsetNumber baseoff)
Definition: nbtdedup.c:326
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
Size _bt_dedup_finish_pending(Page newpage, BTDedupState state)
Definition: nbtdedup.c:431
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:574
uint16 OffsetNumber
Definition: off.h:24
Page PageGetTempPageCopySpecial(Page page)
Definition: bufpage.c:381
ItemPointer htids
Definition: nbtree.h:757
void pfree(void *pointer)
Definition: mcxt.c:1056
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
Size phystupsize
Definition: nbtree.h:760
#define ERROR
Definition: elog.h:43
BTDedupInterval intervals[MaxIndexTuplesPerPage]
Definition: nbtree.h:769
static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state, OffsetNumber minoff, IndexTuple newitem)
Definition: nbtdedup.c:519
void _bt_delitems_delete(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, Relation heapRel)
Definition: nbtpage.c:1176
static char * buf
Definition: pg_test_fsync.c:67
bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
Definition: nbtdedup.c:377
IndexTupleData * IndexTuple
Definition: itup.h:53
#define REGBUF_STANDARD
Definition: xloginsert.h:35
struct ItemIdData ItemIdData
#define SizeOfBtreeDedup
Definition: nbtxlog.h:177
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:475
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define XLOG_BTREE_DEDUP
Definition: nbtxlog.h:32
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:324
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:416
#define InvalidOffsetNumber
Definition: off.h:26
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:738
Definition: regguts.h:298
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
size_t Size
Definition: c.h:466
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
Size basetupsize
Definition: nbtree.h:754
#define RelationNeedsWAL(relation)
Definition: rel.h:562
Size maxpostingsize
Definition: nbtree.h:749
#define PageGetLSN(page)
Definition: bufpage.h:366
Size PageGetExactFreeSpace(Page page)
Definition: bufpage.c:625
#define BTMaxItemSize(page)
Definition: nbtree.h:157
#define P_HIKEY
Definition: nbtree.h:242
int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
Definition: nbtutils.c:2419
#define MaxIndexTuplesPerPage
Definition: itup.h:145
void * palloc(Size size)
Definition: mcxt.c:949
bool deduplicate
Definition: nbtree.h:748
#define elog(elevel,...)
Definition: elog.h:214
BTDedupStateData * BTDedupState
Definition: nbtree.h:772
void XLogBeginInsert(void)
Definition: xloginsert.c:121
uint16 btpo_flags
Definition: nbtree.h:65
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:213
#define BTP_HAS_GARBAGE
Definition: nbtree.h:78
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78
static void _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
Definition: nbtdedup.c:559
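
The maxpostingsize cap set near the top of the function works out as follows under the default 8 kB BLCKSZ (hypothetical but representative numbers):

    /*
     * BTMaxItemSize(page) is roughly 2704 bytes with 8 kB pages (the
     * "1/3 of a page" tuple size limit), so:
     *   maxpostingsize = Min(2704 / 2, INDEX_SIZE_MASK) = 1352 bytes
     * i.e. about one sixth of the page, matching the in-function comment.
     */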

◆ _bt_dedup_save_htid()

bool _bt_dedup_save_htid ( BTDedupState  state,
IndexTuple  itup 
)

Definition at line 377 of file nbtdedup.c.

References Assert, BTDedupStateData::basetupsize, BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTDedupStateData::htids, IndexTupleSize, MAXALIGN, BTDedupStateData::maxpostingsize, BTDedupStateData::nhtids, BTDedupStateData::nitems, BTDedupStateData::phystupsize, and IndexTupleData::t_tid.

Referenced by _bt_dedup_one_page(), _bt_load(), and btree_xlog_dedup().

378 {
379  int nhtids;
380  ItemPointer htids;
381  Size mergedtupsz;
382 
383  Assert(!BTreeTupleIsPivot(itup));
384 
385  if (!BTreeTupleIsPosting(itup))
386  {
387  nhtids = 1;
388  htids = &itup->t_tid;
389  }
390  else
391  {
392  nhtids = BTreeTupleGetNPosting(itup);
393  htids = BTreeTupleGetPosting(itup);
394  }
395 
396  /*
397  * Don't append (have caller finish pending posting list as-is) if
398  * appending heap TID(s) from itup would put us over maxpostingsize limit.
399  *
400  * This calculation needs to match the code used within _bt_form_posting()
401  * for new posting list tuples.
402  */
403  mergedtupsz = MAXALIGN(state->basetupsize +
404  (state->nhtids + nhtids) * sizeof(ItemPointerData));
405 
406  if (mergedtupsz > state->maxpostingsize)
407  return false;
408 
409  /*
410  * Save heap TIDs to pending posting list tuple -- itup can be merged into
411  * pending posting list
412  */
413  state->nitems++;
414  memcpy(state->htids + state->nhtids, htids,
415  sizeof(ItemPointerData) * nhtids);
416  state->nhtids += nhtids;
417  state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
418 
419  return true;
420 }
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:357
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition: nbtree.h:412
ItemPointerData t_tid
Definition: itup.h:37
ItemPointer htids
Definition: nbtree.h:757
Size phystupsize
Definition: nbtree.h:760
struct ItemIdData ItemIdData
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:369
#define Assert(condition)
Definition: c.h:738
size_t Size
Definition: c.h:466
#define MAXALIGN(LEN)
Definition: c.h:691
Size basetupsize
Definition: nbtree.h:754
Size maxpostingsize
Definition: nbtree.h:749
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:393
#define IndexTupleSize(itup)
Definition: itup.h:71
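
A worked example of the maxpostingsize guard, with hypothetical sizes (8-byte MAXALIGN; see _bt_dedup_one_page() for where maxpostingsize comes from):

    /*
     * Pending list: 16-byte base tuple, 200 heap TIDs already saved.
     * Caller offers a posting tuple carrying 30 more:
     *   mergedtupsz = MAXALIGN(16 + (200 + 30) * sizeof(ItemPointerData))
     *               = MAXALIGN(16 + 230 * 6) = MAXALIGN(1396) = 1400
     * With maxpostingsize at 1352, 1400 > 1352, so the function returns
     * false and the caller finishes the pending posting list as-is.
     */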

◆ _bt_dedup_start_pending()

void _bt_dedup_start_pending ( BTDedupState  state,
IndexTuple  base,
OffsetNumber  baseoff 
)

Definition at line 326 of file nbtdedup.c.

References Assert, BTDedupStateData::base, BTDedupInterval::baseoff, BTDedupStateData::baseoff, BTDedupStateData::basetupsize, BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTDedupStateData::htids, IndexTupleSize, BTDedupStateData::intervals, MAXALIGN, BTDedupStateData::nhtids, BTDedupStateData::nintervals, BTDedupStateData::nitems, BTDedupStateData::phystupsize, and IndexTupleData::t_tid.

Referenced by _bt_dedup_one_page(), _bt_load(), and btree_xlog_dedup().

328 {
329  Assert(state->nhtids == 0);
330  Assert(state->nitems == 0);
331  Assert(!BTreeTupleIsPivot(base));
332 
333  /*
334  * Copy heap TID(s) from new base tuple for new candidate posting list
335  * into working state's array
336  */
337  if (!BTreeTupleIsPosting(base))
338  {
339  memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
340  state->nhtids = 1;
341  state->basetupsize = IndexTupleSize(base);
342  }
343  else
344  {
345  int nposting;
346 
347  nposting = BTreeTupleGetNPosting(base);
348  memcpy(state->htids, BTreeTupleGetPosting(base),
349  sizeof(ItemPointerData) * nposting);
350  state->nhtids = nposting;
351  /* basetupsize should not include existing posting list */
352  state->basetupsize = BTreeTupleGetPostingOffset(base);
353  }
354 
355  /*
356  * Save new base tuple itself -- it'll be needed if we actually create a
357  * new posting list from new pending posting list.
358  *
359  * Must maintain physical size of all existing tuples (including line
360  * pointer overhead) so that we can calculate space savings on page.
361  */
362  state->nitems = 1;
363  state->base = base;
364  state->baseoff = baseoff;
365  state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
366  /* Also save baseoff in pending state for interval */
367  state->intervals[state->nintervals].baseoff = state->baseoff;
368 }
IndexTuple base
Definition: nbtree.h:752
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:357
OffsetNumber baseoff
Definition: nbtree.h:753
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition: nbtree.h:412
ItemPointerData t_tid
Definition: itup.h:37
ItemPointer htids
Definition: nbtree.h:757
Size phystupsize
Definition: nbtree.h:760
BTDedupInterval intervals[MaxIndexTuplesPerPage]
Definition: nbtree.h:769
struct ItemIdData ItemIdData
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:369
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:404
#define Assert(condition)
Definition: c.h:738
#define MAXALIGN(LEN)
Definition: c.h:691
Size basetupsize
Definition: nbtree.h:754
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:393
OffsetNumber baseoff
Definition: nbtree.h:723
#define IndexTupleSize(itup)
Definition: itup.h:71
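
The three _bt_dedup_* functions form a small protocol; this sketch follows the shape of the callers listed above, with the eligibility test reduced to the size check alone (the real _bt_dedup_one_page() also compares key attributes using _bt_keep_natts_fast()):

    /* Schematic driver for the pending-posting-list protocol. */
    _bt_dedup_start_pending(state, firstitup, firstoff);
    for (offnum = secondoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
    {
        IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

        if (!_bt_dedup_save_htid(state, itup))
        {
            /* itup doesn't fit -- flush pending list, start a new one */
            spacesaving += _bt_dedup_finish_pending(newpage, state);
            _bt_dedup_start_pending(state, itup, offnum);
        }
    }
    spacesaving += _bt_dedup_finish_pending(newpage, state);   /* last one */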

◆ _bt_delitems_delete()

void _bt_delitems_delete ( Relation  rel,
Buffer  buf,
OffsetNumber deletable,
int  ndeletable,
Relation  heapRel 
)

Definition at line 1176 of file nbtpage.c.

References _bt_xid_horizon(), Assert, BTP_HAS_GARBAGE, BTPageOpaqueData::btpo_flags, BufferGetPage, END_CRIT_SECTION, InvalidTransactionId, xl_btree_delete::latestRemovedXid, MarkBufferDirty(), xl_btree_delete::ndeleted, PageGetSpecialPointer, PageIndexMultiDelete(), PageSetLSN, REGBUF_STANDARD, RelationNeedsWAL, SizeOfBtreeDelete, START_CRIT_SECTION, XLOG_BTREE_DELETE, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), XLogRegisterData(), and XLogStandbyInfoActive.

Referenced by _bt_dedup_one_page(), and _bt_vacuum_one_page().

1179 {
1180  Page page = BufferGetPage(buf);
1181  BTPageOpaque opaque;
1182  TransactionId latestRemovedXid = InvalidTransactionId;
1183 
1184  /* Shouldn't be called unless there's something to do */
1185  Assert(ndeletable > 0);
1186 
1187  if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
1188  latestRemovedXid =
1189  _bt_xid_horizon(rel, heapRel, page, deletable, ndeletable);
1190 
1191  /* No ereport(ERROR) until changes are logged */
1192  START_CRIT_SECTION();
1193 
1194  /* Fix the page */
1195  PageIndexMultiDelete(page, deletable, ndeletable);
1196 
1197  /*
1198  * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID,
1199  * because this is not called by VACUUM. Just clear the BTP_HAS_GARBAGE
1200  * page flag, since we deleted all items with their LP_DEAD bit set.
1201  */
1202  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1203  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1204 
1205  MarkBufferDirty(buf);
1206 
1207  /* XLOG stuff */
1208  if (RelationNeedsWAL(rel))
1209  {
1210  XLogRecPtr recptr;
1211  xl_btree_delete xlrec_delete;
1212 
1213  xlrec_delete.latestRemovedXid = latestRemovedXid;
1214  xlrec_delete.ndeleted = ndeletable;
1215 
1216  XLogBeginInsert();
1217  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1218  XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);
1219 
1220  /*
1221  * The deletable array is not in the buffer, but pretend that it is.
1222  * When XLogInsert stores the whole buffer, the array need not be
1223  * stored too.
1224  */
1225  XLogRegisterBufData(0, (char *) deletable,
1226  ndeletable * sizeof(OffsetNumber));
1227 
1228  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
1229 
1230  PageSetLSN(page, recptr);
1231  }
1232 
1233  END_CRIT_SECTION();
1234 }
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:362
TransactionId latestRemovedXid
Definition: nbtxlog.h:189
uint32 TransactionId
Definition: c.h:513
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1468
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:214
#define END_CRIT_SECTION()
Definition: miscadmin.h:134
#define START_CRIT_SECTION()
Definition: miscadmin.h:132
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
uint32 ndeleted
Definition: nbtxlog.h:190
uint16 OffsetNumber
Definition: off.h:24
static char * buf
Definition: pg_test_fsync.c:67
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define InvalidTransactionId
Definition: transam.h:31
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
static TransactionId _bt_xid_horizon(Relation rel, Relation heapRel, Page page, OffsetNumber *deletable, int ndeletable)
Definition: nbtpage.c:1245
#define XLOG_BTREE_DELETE
Definition: nbtxlog.h:33
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:324
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:416
#define XLogStandbyInfoActive()
Definition: xlog.h:197
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:738
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:828
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define RelationNeedsWAL(relation)
Definition: rel.h:562
#define SizeOfBtreeDelete
Definition: nbtxlog.h:195
void XLogBeginInsert(void)
Definition: xloginsert.c:121
uint16 btpo_flags
Definition: nbtree.h:65
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
#define BTP_HAS_GARBAGE
Definition: nbtree.h:78
Pointer Page
Definition: bufpage.h:78
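
The WAL idiom above is worth calling out: the deletable[] array lives in local memory, yet registering it with XLogRegisterBufData() against block 0 means it is omitted from the record whenever XLogInsert() chooses to emit a full-page image of that buffer anyway. A condensed view of the sequence (same calls as in the function body, record fields elided):

    XLogBeginInsert();
    XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
    XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);
    XLogRegisterBufData(0, (char *) deletable,
                        ndeletable * sizeof(OffsetNumber));
    recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
    PageSetLSN(page, recptr);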

◆ _bt_delitems_vacuum()

void _bt_delitems_vacuum ( Relation  rel,
Buffer  buf,
OffsetNumber deletable,
int  ndeletable,
BTVacuumPosting updatable,
int  nupdatable 
)

Definition at line 1010 of file nbtpage.c.

References _bt_update_posting(), Assert, BTP_HAS_GARBAGE, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BufferGetBlockNumber(), BufferGetPage, BTVacuumPostingData::deletetids, elog, END_CRIT_SECTION, i, IndexTupleSize, BTVacuumPostingData::itup, MarkBufferDirty(), MAXALIGN, MaxIndexTuplesPerPage, xl_btree_vacuum::ndeleted, xl_btree_update::ndeletedtids, BTVacuumPostingData::ndeletedtids, xl_btree_vacuum::nupdated, PageGetSpecialPointer, PageIndexMultiDelete(), PageIndexTupleOverwrite(), PageSetLSN, palloc(), PANIC, pfree(), REGBUF_STANDARD, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeUpdate, SizeOfBtreeVacuum, START_CRIT_SECTION, BTVacuumPostingData::updatedoffset, XLOG_BTREE_VACUUM, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by btvacuumpage().

1013 {
1014  Page page = BufferGetPage(buf);
1015  BTPageOpaque opaque;
1016  Size itemsz;
1017  char *updatedbuf = NULL;
1018  Size updatedbuflen = 0;
1019  OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1020 
1021  /* Shouldn't be called unless there's something to do */
1022  Assert(ndeletable > 0 || nupdatable > 0);
1023 
1024  for (int i = 0; i < nupdatable; i++)
1025  {
1026  /* Replace work area IndexTuple with updated version */
1027  _bt_update_posting(updatable[i]);
1028 
1029  /* Maintain array of updatable page offsets for WAL record */
1030  updatedoffsets[i] = updatable[i]->updatedoffset;
1031  }
1032 
1033  /* XLOG stuff -- allocate and fill buffer before critical section */
1034  if (nupdatable > 0 && RelationNeedsWAL(rel))
1035  {
1036  Size offset = 0;
1037 
1038  for (int i = 0; i < nupdatable; i++)
1039  {
1040  BTVacuumPosting vacposting = updatable[i];
1041 
1042  itemsz = SizeOfBtreeUpdate +
1043  vacposting->ndeletedtids * sizeof(uint16);
1044  updatedbuflen += itemsz;
1045  }
1046 
1047  updatedbuf = palloc(updatedbuflen);
1048  for (int i = 0; i < nupdatable; i++)
1049  {
1050  BTVacuumPosting vacposting = updatable[i];
1051  xl_btree_update update;
1052 
1053  update.ndeletedtids = vacposting->ndeletedtids;
1054  memcpy(updatedbuf + offset, &update.ndeletedtids,
1055  SizeOfBtreeUpdate);
1056  offset += SizeOfBtreeUpdate;
1057 
1058  itemsz = update.ndeletedtids * sizeof(uint16);
1059  memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
1060  offset += itemsz;
1061  }
1062  }
1063 
1064  /* No ereport(ERROR) until changes are logged */
1065  START_CRIT_SECTION();
1066 
1067  /*
1068  * Handle posting tuple updates.
1069  *
1070  * Deliberately do this before handling simple deletes. If we did it the
1071  * other way around (i.e. WAL record order -- simple deletes before
1072  * updates) then we'd have to make compensating changes to the 'updatable'
1073  * array of offset numbers.
1074  *
1075  * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
1076  * happens to already be set. Although we unset the BTP_HAS_GARBAGE page
1077  * level flag, unsetting individual LP_DEAD bits should still be avoided.
1078  */
1079  for (int i = 0; i < nupdatable; i++)
1080  {
1081  OffsetNumber updatedoffset = updatedoffsets[i];
1082  IndexTuple itup;
1083 
1084  itup = updatable[i]->itup;
1085  itemsz = MAXALIGN(IndexTupleSize(itup));
1086  if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
1087  itemsz))
1088  elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1089  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1090  }
1091 
1092  /* Now handle simple deletes of entire tuples */
1093  if (ndeletable > 0)
1094  PageIndexMultiDelete(page, deletable, ndeletable);
1095 
1096  /*
1097  * We can clear the vacuum cycle ID since this page has certainly been
1098  * processed by the current vacuum scan.
1099  */
1100  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1101  opaque->btpo_cycleid = 0;
1102 
1103  /*
1104  * Mark the page as not containing any LP_DEAD items. This is not
1105  * certainly true (there might be some that have recently been marked, but
1106  * weren't targeted by VACUUM's heap scan), but it will be true often
1107  * enough. VACUUM does not delete items purely because they have their
1108  * LP_DEAD bit set, since doing so would necessitate explicitly logging a
1109  * latestRemovedXid cutoff (this is how _bt_delitems_delete works).
1110  *
1111  * The consequences of falsely unsetting BTP_HAS_GARBAGE should be fairly
1112  * limited, since we never falsely unset an LP_DEAD bit. Workloads that
1113  * are particularly dependent on LP_DEAD bits being set quickly will
1114  * usually manage to set the BTP_HAS_GARBAGE flag before the page fills up
1115  * again anyway. Furthermore, attempting a deduplication pass will remove
1116  * all LP_DEAD items, regardless of whether the BTP_HAS_GARBAGE hint bit
1117  * is set or not.
1118  */
1119  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1120 
1121  MarkBufferDirty(buf);
1122 
1123  /* XLOG stuff */
1124  if (RelationNeedsWAL(rel))
1125  {
1126  XLogRecPtr recptr;
1127  xl_btree_vacuum xlrec_vacuum;
1128 
1129  xlrec_vacuum.ndeleted = ndeletable;
1130  xlrec_vacuum.nupdated = nupdatable;
1131 
1132  XLogBeginInsert();
1133  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1134  XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
1135 
1136  if (ndeletable > 0)
1137  XLogRegisterBufData(0, (char *) deletable,
1138  ndeletable * sizeof(OffsetNumber));
1139 
1140  if (nupdatable > 0)
1141  {
1142  XLogRegisterBufData(0, (char *) updatedoffsets,
1143  nupdatable * sizeof(OffsetNumber));
1144  XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1145  }
1146 
1147  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
1148 
1149  PageSetLSN(page, recptr);
1150  }
1151 
1152  END_CRIT_SECTION();
1153 
1154  /* can't leak memory here */
1155  if (updatedbuf != NULL)
1156  pfree(updatedbuf);
1157  /* free tuples generated by calling _bt_update_posting() */
1158  for (int i = 0; i < nupdatable; i++)
1159  pfree(updatable[i]->itup);
1160 }
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:362
uint16 ndeletedtids
Definition: nbtree.h:788
void _bt_update_posting(BTVacuumPosting vacposting)
Definition: nbtdedup.c:661
#define SizeOfBtreeUpdate
Definition: nbtxlog.h:225
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1468
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:214
OffsetNumber updatedoffset
Definition: nbtree.h:785
#define END_CRIT_SECTION()
Definition: miscadmin.h:134
IndexTuple itup
Definition: nbtree.h:784
Pointer Item
Definition: item.h:17
#define START_CRIT_SECTION()
Definition: miscadmin.h:132
uint16 nupdated
Definition: nbtxlog.h:243
#define PANIC
Definition: elog.h:53
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
uint16 OffsetNumber
Definition: off.h:24
unsigned short uint16
Definition: c.h:366
void pfree(void *pointer)
Definition: mcxt.c:1056
#define SizeOfBtreeVacuum
Definition: nbtxlog.h:250
BTCycleId btpo_cycleid
Definition: nbtree.h:66
bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, Item newtup, Size newsize)
Definition: bufpage.c:1060
static char * buf
Definition: pg_test_fsync.c:67
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define XLOG_BTREE_VACUUM
Definition: nbtxlog.h:38
#define RelationGetRelationName(relation)
Definition: rel.h:490
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
uint16 ndeleted
Definition: nbtxlog.h:242
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:324
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:416
uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.h:789
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:738
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:828
size_t Size
Definition: c.h:466
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define MAXALIGN(LEN)
Definition: c.h:691
#define RelationNeedsWAL(relation)
Definition: rel.h:562
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2633
#define MaxIndexTuplesPerPage
Definition: itup.h:145
void * palloc(Size size)
Definition: mcxt.c:949
#define elog(elevel,...)
Definition: elog.h:214
int i
void XLogBeginInsert(void)
Definition: xloginsert.c:121
uint16 btpo_flags
Definition: nbtree.h:65
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
#define BTP_HAS_GARBAGE
Definition: nbtree.h:78
uint16 ndeletedtids
Definition: nbtxlog.h:220
Pointer Page
Definition: bufpage.h:78
#define IndexTupleSize(itup)
Definition: itup.h:71
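
The updatedbuf payload built in the WAL branch above has a simple repeating layout, one variable-length entry per updated posting list tuple (redo pairs these entries with the separately registered updatedoffsets[] array):

    /*
     * [xl_btree_update: ndeletedtids = N1][N1 x uint16 posting offsets]
     * [xl_btree_update: ndeletedtids = N2][N2 x uint16 posting offsets]
     * ...
     */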

◆ _bt_doinsert()

bool _bt_doinsert ( Relation  rel,
IndexTuple  itup,
IndexUniqueCheck  checkUnique,
Relation  heapRel 
)

Definition at line 82 of file nbtinsert.c.

References _bt_check_unique(), _bt_findinsertloc(), _bt_freestack(), _bt_insertonpg(), _bt_mkscankey(), _bt_relbuf(), _bt_search_insert(), BTScanInsertData::anynullkeys, Assert, BTInsertStateData::bounds_valid, BTInsertStateData::buf, BufferGetBlockNumber(), CheckForSerializableConflictIn(), BTScanInsertData::heapkeyspace, IndexTupleSize, InvalidBuffer, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, MAXALIGN, pfree(), BTInsertStateData::postingoff, BTScanInsertData::scantid, SpeculativeInsertionWait(), IndexTupleData::t_tid, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_NO, unlikely, XactLockTableWait(), and XLTW_InsertIndex.

Referenced by btinsert().

84 {
85  bool is_unique = false;
86  BTInsertStateData insertstate;
87  BTScanInsert itup_key;
88  BTStack stack;
89  bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
90 
91  /* we need an insertion scan key to do our search, so build one */
92  itup_key = _bt_mkscankey(rel, itup);
93 
94  if (checkingunique)
95  {
96  if (!itup_key->anynullkeys)
97  {
98  /* No (heapkeyspace) scantid until uniqueness established */
99  itup_key->scantid = NULL;
100  }
101  else
102  {
103  /*
104  * Scan key for new tuple contains NULL key values. Bypass
105  * checkingunique steps. They are unnecessary because core code
106  * considers NULL unequal to every value, including NULL.
107  *
108  * This optimization avoids O(N^2) behavior within the
109  * _bt_findinsertloc() heapkeyspace path when a unique index has a
110  * large number of "duplicates" with NULL key values.
111  */
112  checkingunique = false;
113  /* Tuple is unique in the sense that core code cares about */
114  Assert(checkUnique != UNIQUE_CHECK_EXISTING);
115  is_unique = true;
116  }
117  }
118 
119  /*
120  * Fill in the BTInsertState working area, to track the current page and
121  * position within the page to insert on.
122  *
123  * Note that itemsz is passed down to lower level code that deals with
124  * inserting the item. It must be MAXALIGN()'d. This ensures that space
125  * accounting code consistently considers the alignment overhead that we
126  * expect PageAddItem() will add later. (Actually, index_form_tuple() is
127  * already conservative about alignment, but we don't rely on that from
128  * this distance. Besides, preserving the "true" tuple size in index
129  * tuple headers for the benefit of nbtsplitloc.c might happen someday.
130  * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
131  */
132  insertstate.itup = itup;
133  insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
134  insertstate.itup_key = itup_key;
135  insertstate.bounds_valid = false;
136  insertstate.buf = InvalidBuffer;
137  insertstate.postingoff = 0;
138 
139 search:
140 
141  /*
142  * Find and lock the leaf page that the tuple should be added to by
143  * searching from the root page. insertstate.buf will hold a buffer that
144  * is locked in exclusive mode afterwards.
145  */
146  stack = _bt_search_insert(rel, &insertstate);
147 
148  /*
149  * checkingunique inserts are not allowed to go ahead when two tuples with
150  * equal key attribute values would be visible to new MVCC snapshots once
151  * the xact commits. Check for conflicts in the locked page/buffer (if
152  * needed) here.
153  *
154  * It might be necessary to check a page to the right in _bt_check_unique,
155  * though that should be very rare. In practice the first page the value
156  * could be on (with scantid omitted) is almost always also the only page
157  * that a matching tuple might be found on. This is due to the behavior
158  * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
159  * only be allowed to cross a page boundary when there is no candidate
160  * leaf page split point that avoids it. Also, _bt_check_unique can use
161  * the leaf page high key to determine that there will be no duplicates on
162  * the right sibling without actually visiting it (it uses the high key in
163  * cases where the new item happens to belong at the far right of the leaf
164  * page).
165  *
166  * NOTE: obviously, _bt_check_unique can only detect keys that are already
167  * in the index; so it cannot defend against concurrent insertions of the
168  * same key. We protect against that by means of holding a write lock on
169  * the first page the value could be on, with omitted/-inf value for the
170  * implicit heap TID tiebreaker attribute. Any other would-be inserter of
171  * the same key must acquire a write lock on the same page, so only one
172  * would-be inserter can be making the check at one time. Furthermore,
173  * once we are past the check we hold write locks continuously until we
174  * have performed our insertion, so no later inserter can fail to see our
175  * insertion. (This requires some care in _bt_findinsertloc.)
176  *
177  * If we must wait for another xact, we release the lock while waiting,
178  * and then must perform a new search.
179  *
180  * For a partial uniqueness check, we don't wait for the other xact. Just
181  * let the tuple in and return false for possibly non-unique, or true for
182  * definitely unique.
183  */
184  if (checkingunique)
185  {
186  TransactionId xwait;
187  uint32 speculativeToken;
188 
189  xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
190  &is_unique, &speculativeToken);
191 
192  if (unlikely(TransactionIdIsValid(xwait)))
193  {
194  /* Have to wait for the other guy ... */
195  _bt_relbuf(rel, insertstate.buf);
196  insertstate.buf = InvalidBuffer;
197 
198  /*
199  * If it's a speculative insertion, wait for it to finish (ie. to
200  * go ahead with the insertion, or kill the tuple). Otherwise
201  * wait for the transaction to finish as usual.
202  */
203  if (speculativeToken)
204  SpeculativeInsertionWait(xwait, speculativeToken);
205  else
206  XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
207 
208  /* start over... */
209  if (stack)
210  _bt_freestack(stack);
211  goto search;
212  }
213 
214  /* Uniqueness is established -- restore heap tid as scantid */
215  if (itup_key->heapkeyspace)
216  itup_key->scantid = &itup->t_tid;
217  }
218 
219  if (checkUnique != UNIQUE_CHECK_EXISTING)
220  {
221  OffsetNumber newitemoff;
222 
223  /*
224  * The only conflict predicate locking cares about for indexes is when
225  * an index tuple insert conflicts with an existing lock. We don't
226  * know the actual page we're going to insert on for sure just yet in
227  * checkingunique and !heapkeyspace cases, but it's okay to use the
228  * first page the value could be on (with scantid omitted) instead.
229  */
230  CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
231 
232  /*
233  * Do the insertion. Note that insertstate contains cached binary
234  * search bounds established within _bt_check_unique when insertion is
235  * checkingunique.
236  */
237  newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
238  stack, heapRel);
239  _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
240  itup, insertstate.itemsz, newitemoff,
241  insertstate.postingoff, false);
242  }
243  else
244  {
245  /* just release the buffer */
246  _bt_relbuf(rel, insertstate.buf);
247  }
248 
249  /* be tidy */
250  if (stack)
251  _bt_freestack(stack);
252  pfree(itup_key);
253 
254  return is_unique;
255 }
static void _bt_insertonpg(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, Size itemsz, OffsetNumber newitemoff, int postingoff, bool split_only_page)
Definition: nbtinsert.c:1100
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:175
bool bounds_valid
Definition: nbtree.h:703
uint32 TransactionId
Definition: c.h:513
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:90
ItemPointer scantid
Definition: nbtree.h:671
ItemPointerData t_tid
Definition: itup.h:37
static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, BTStack stack, Relation heapRel)
Definition: nbtinsert.c:789
#define InvalidBuffer
Definition: buf.h:25
uint16 OffsetNumber
Definition: off.h:24
void pfree(void *pointer)
Definition: mcxt.c:1056
void SpeculativeInsertionWait(TransactionId xid, uint32 token)
Definition: lmgr.c:781
unsigned int uint32
Definition: c.h:367
IndexTuple itup
Definition: nbtree.h:691
static BTStack _bt_search_insert(Relation rel, BTInsertState insertstate)
Definition: nbtinsert.c:296
bool anynullkeys
Definition: nbtree.h:668
void CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno)
Definition: predicate.c:4374
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:940
void XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, XLTW_Oper oper)
Definition: lmgr.c:624
BTScanInsert itup_key
Definition: nbtree.h:693
#define Assert(condition)
Definition: c.h:738
bool heapkeyspace
Definition: nbtree.h:666
#define MAXALIGN(LEN)
Definition: c.h:691
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2633
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition: nbtinsert.c:387
#define unlikely(x)
Definition: c.h:206
#define TransactionIdIsValid(xid)
Definition: transam.h:41
#define IndexTupleSize(itup)
Definition: itup.h:71
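
For context, a sketch of the caller's side, following the shape of btinsert() (the only "Referenced by" entry); error handling and result bookkeeping are elided:

    /* Sketch of how btinsert() drives _bt_doinsert(). */
    itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
    itup->t_tid = *ht_ctid;                 /* point index tuple at heap row */
    result = _bt_doinsert(rel, itup, checkUnique, heapRel);
    pfree(itup);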

◆ _bt_end_vacuum()

void _bt_end_vacuum ( Relation  rel)

Definition at line 2021 of file nbtutils.c.

References LockRelId::dbId, i, LockInfoData::lockRelId, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTVacInfo::num_vacuums, RelationData::rd_lockInfo, LockRelId::relId, BTOneVacInfo::relid, and BTVacInfo::vacuums.

Referenced by _bt_end_vacuum_callback(), and btbulkdelete().

2022 {
2023  int i;
2024 
2025  LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
2026 
2027  /* Find the array entry */
2028  for (i = 0; i < btvacinfo->num_vacuums; i++)
2029  {
2030  BTOneVacInfo *vac = &btvacinfo->vacuums[i];
2031 
2032  if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
2033  vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
2034  {
2035  /* Remove it by shifting down the last entry */
2036  *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
2037  btvacinfo->num_vacuums--;
2038  break;
2039  }
2040  }
2041 
2042  LWLockRelease(BtreeVacuumLock);
2043 }
LockRelId lockRelId
Definition: rel.h:44
static BTVacInfo * btvacinfo
Definition: nbtutils.c:1917
LockRelId relid
Definition: nbtutils.c:1905
Oid dbId
Definition: rel.h:39
BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtutils.c:1914
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1727
LockInfoData rd_lockInfo
Definition: rel.h:112
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1123
int num_vacuums
Definition: nbtutils.c:1912
int i
Oid relId
Definition: rel.h:38
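
The removal step above is the classic swap-with-last idiom for an unordered array; in schematic form:

    /* O(1) removal; element order is not preserved, which is fine here. */
    arr[i] = arr[n - 1];
    n--;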

◆ _bt_end_vacuum_callback()

void _bt_end_vacuum_callback ( int  code,
Datum  arg 
)

Definition at line 2049 of file nbtutils.c.

References _bt_end_vacuum(), and DatumGetPointer.

Referenced by btbulkdelete().

2050 {
2051  _bt_end_vacuum((Relation) DatumGetPointer(arg));
2052 }
void _bt_end_vacuum(Relation rel)
Definition: nbtutils.c:2021
#define DatumGetPointer(X)
Definition: postgres.h:549
void * arg

◆ _bt_findsplitloc()

OffsetNumber _bt_findsplitloc ( Relation  rel,
Page  page,
OffsetNumber  newitemoff,
Size  newitemsz,
IndexTuple  newitem,
bool newitemonleft 
)

Definition at line 127 of file nbtsplitloc.c.

References _bt_afternewitemoff(), _bt_bestsplitloc(), _bt_deltasortsplits(), _bt_recsplitloc(), _bt_strategy(), Assert, BTGetFillFactor, BTREE_NONLEAF_FILLFACTOR, BTREE_SINGLEVAL_FILLFACTOR, BTreeTupleIsPosting(), elog, ERROR, SplitPoint::firstoldonright, i, IndexRelationGetNumberOfKeyAttributes, FindSplitData::interval, FindSplitData::is_leaf, FindSplitData::is_rightmost, ItemIdGetLength, FindSplitData::leftspace, Max, MAX_INTERNAL_INTERVAL, MAX_LEAF_INTERVAL, MAXALIGN, FindSplitData::maxsplits, Min, FindSplitData::minfirstrightsz, FindSplitData::newitem, FindSplitData::newitemoff, SplitPoint::newitemonleft, FindSplitData::newitemsz, FindSplitData::nsplits, OffsetNumberNext, FindSplitData::olddataitemstotal, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_RIGHTMOST, FindSplitData::page, PageGetExactFreeSpace(), PageGetItemId, PageGetMaxOffsetNumber, PageGetPageSize, PageGetSpecialPointer, palloc(), pfree(), FindSplitData::rel, RelationGetRelationName, FindSplitData::rightspace, SizeOfPageHeaderData, SPLIT_DEFAULT, SPLIT_MANY_DUPLICATES, SPLIT_SINGLE_VALUE, and FindSplitData::splits.

Referenced by _bt_split().

133 {
134  BTPageOpaque opaque;
135  int leftspace,
136  rightspace,
137  olddataitemstotal,
138  olddataitemstoleft,
139  perfectpenalty,
140  leaffillfactor;
141  FindSplitData state;
142  FindSplitStrat strategy;
143  ItemId itemid;
144  OffsetNumber offnum,
145  maxoff,
146  foundfirstright;
147  double fillfactormult;
148  bool usemult;
149  SplitPoint leftpage,
150  rightpage;
151 
152  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
153  maxoff = PageGetMaxOffsetNumber(page);
154 
155  /* Total free space available on a btree page, after fixed overhead */
156  leftspace = rightspace =
157  PageGetPageSize(page) - SizeOfPageHeaderData -
158  MAXALIGN(sizeof(BTPageOpaqueData));
159 
160  /* The right page will have the same high key as the old page */
161  if (!P_RIGHTMOST(opaque))
162  {
163  itemid = PageGetItemId(page, P_HIKEY);
164  rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
165  sizeof(ItemIdData));
166  }
167 
168  /* Count up total space in data items before actually scanning 'em */
169  olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page);
170  leaffillfactor = BTGetFillFactor(rel);
171 
172  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
173  newitemsz += sizeof(ItemIdData);
174  state.rel = rel;
175  state.page = page;
176  state.newitem = newitem;
177  state.newitemsz = newitemsz;
178  state.is_leaf = P_ISLEAF(opaque);
179  state.is_rightmost = P_RIGHTMOST(opaque);
180  state.leftspace = leftspace;
181  state.rightspace = rightspace;
182  state.olddataitemstotal = olddataitemstotal;
183  state.minfirstrightsz = SIZE_MAX;
184  state.newitemoff = newitemoff;
185 
186  /* newitem cannot be a posting list item */
187  Assert(!BTreeTupleIsPosting(newitem));
188 
189  /*
190  * maxsplits should never exceed maxoff because there will be at most as
191  * many candidate split points as there are points _between_ tuples, once
192  * you imagine that the new item is already on the original page (the
193  * final number of splits may be slightly lower because not all points
194  * between tuples will be legal).
195  */
196  state.maxsplits = maxoff;
197  state.splits = palloc(sizeof(SplitPoint) * state.maxsplits);
198  state.nsplits = 0;
199 
200  /*
201  * Scan through the data items and calculate space usage for a split at
202  * each possible position
203  */
204  olddataitemstoleft = 0;
205 
206  for (offnum = P_FIRSTDATAKEY(opaque);
207  offnum <= maxoff;
208  offnum = OffsetNumberNext(offnum))
209  {
210  Size itemsz;
211 
212  itemid = PageGetItemId(page, offnum);
213  itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
214 
215  /*
216  * When item offset number is not newitemoff, neither side of the
217  * split can be newitem. Record a split after the previous data item
218  * from original page, but before the current data item from original
219  * page. (_bt_recsplitloc() will reject the split when there are no
220  * previous items, which we rely on.)
221  */
222  if (offnum < newitemoff)
223  _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
224  else if (offnum > newitemoff)
225  _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
226  else
227  {
228  /*
229  * Record a split after all "offnum < newitemoff" original page
230  * data items, but before newitem
231  */
232  _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
233 
234  /*
235  * Record a split after newitem, but before data item from
236  * original page at offset newitemoff/current offset
237  */
238  _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
239  }
240 
241  olddataitemstoleft += itemsz;
242  }
243 
244  /*
245  * Record a split after all original page data items, but before newitem.
246  * (Though only when it's possible that newitem will end up alone on new
247  * right page.)
248  */
249  Assert(olddataitemstoleft == olddataitemstotal);
250  if (newitemoff > maxoff)
251  _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0);
252 
253  /*
254  * I believe it is not possible to fail to find a feasible split, but just
255  * in case ...
256  */
257  if (state.nsplits == 0)
258  elog(ERROR, "could not find a feasible split point for index \"%s\"",
259  RelationGetRelationName(rel));
260 
261  /*
262  * Start search for a split point among list of legal split points. Give
263  * primary consideration to equalizing available free space in each half
264  * of the split initially (start with default strategy), while applying
265  * rightmost and split-after-new-item optimizations where appropriate.
266  * Either of the two other fallback strategies may be required for cases
267  * with a large number of duplicates around the original/space-optimal
268  * split point.
269  *
270  * Default strategy gives some weight to suffix truncation in deciding a
271  * split point on leaf pages. It attempts to select a split point where a
272  * distinguishing attribute appears earlier in the new high key for the
273  * left side of the split, in order to maximize the number of trailing
274  * attributes that can be truncated away. Only candidate split points
275  * that imply an acceptable balance of free space on each side are
276  * considered.
277  */
278  if (!state.is_leaf)
279  {
280  /* fillfactormult only used on rightmost page */
281  usemult = state.is_rightmost;
282  fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0;
283  }
284  else if (state.is_rightmost)
285  {
286  /* Rightmost leaf page -- fillfactormult always used */
287  usemult = true;
288  fillfactormult = leaffillfactor / 100.0;
289  }
290  else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
291  {
292  /*
293  * New item inserted at rightmost point among a localized grouping on
294  * a leaf page -- apply "split after new item" optimization, either by
295  * applying leaf fillfactor multiplier, or by choosing the exact split
296  * point that leaves the new item as last on the left. (usemult is set
297  * for us.)
298  */
299  if (usemult)
300  {
301  /* fillfactormult should be set based on leaf fillfactor */
302  fillfactormult = leaffillfactor / 100.0;
303  }
304  else
305  {
306  /* find precise split point after newitemoff */
307  for (int i = 0; i < state.nsplits; i++)
308  {
309  SplitPoint *split = state.splits + i;
310 
311  if (split->newitemonleft &&
312  newitemoff == split->firstoldonright)
313  {
314  pfree(state.splits);
315  *newitemonleft = true;
316  return newitemoff;
317  }
318  }
319 
320  /*
321  * Cannot legally split after newitemoff; proceed with split
322  * without using fillfactor multiplier. This is defensive, and
323  * should never be needed in practice.
324  */
325  fillfactormult = 0.50;
326  }
327  }
328  else
329  {
330  /* Other leaf page. 50:50 page split. */
331  usemult = false;
332  /* fillfactormult not used, but be tidy */
333  fillfactormult = 0.50;
334  }
335 
336  /*
337  * Set an initial limit on the split interval/number of candidate split
338  * points as appropriate. The "Prefix B-Trees" paper refers to this as
339  * sigma l for leaf splits and sigma b for internal ("branch") splits.
340  * It's hard to provide a theoretical justification for the initial size
341  * of the split interval, though it's clear that a small split interval
342  * makes suffix truncation much more effective without noticeably
343  * affecting space utilization over time.
344  */
345  state.interval = Min(Max(1, state.nsplits * 0.05),
346  state.is_leaf ? MAX_LEAF_INTERVAL :
347  MAX_INTERNAL_INTERVAL);
348 
349  /*
350  * Save leftmost and rightmost splits for page before original ordinal
351  * sort order is lost by delta/fillfactormult sort
352  */
353  leftpage = state.splits[0];
354  rightpage = state.splits[state.nsplits - 1];
355 
356  /* Give split points a fillfactormult-wise delta, and sort on deltas */
357  _bt_deltasortsplits(&state, fillfactormult, usemult);
358 
359  /*
360  * Determine if default strategy/split interval will produce a
361  * sufficiently distinguishing split, or if we should change strategies.
362  * Alternative strategies change the range of split points that are
363  * considered acceptable (split interval), and possibly change
364  * fillfactormult, in order to deal with pages with a large number of
365  * duplicates gracefully.
366  *
367  * Pass low and high splits for the entire page (actually, they're for an
368  * imaginary version of the page that includes newitem). These are used
369  * when the initial split interval encloses split points that are full of
370  * duplicates, and we need to consider if it's even possible to avoid
371  * appending a heap TID.
372  */
373  perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);
374 
375  if (strategy == SPLIT_DEFAULT)
376  {
377  /*
378  * Default strategy worked out (always works out with internal page).
379  * Original split interval still stands.
380  */
381  }
382 
383  /*
384  * Many duplicates strategy is used when a heap TID would otherwise be
385  * appended, but the page isn't completely full of logical duplicates.
386  *
387  * The split interval is widened to include all legal candidate split
388  * points. There might be as few as two distinct values in the whole-page
389  * split interval, though it's also possible that most of the values on
390  * the page are unique. The final split point will either be to the
391  * immediate left or to the immediate right of the group of duplicate
392  * tuples that enclose the first/delta-optimal split point (perfect
393  * penalty was set so that the lowest delta split point that avoids
394  * appending a heap TID will be chosen). Maximizing the number of
395  * attributes that can be truncated away is not a goal of the many
396  * duplicates strategy.
397  *
398  * Single value strategy is used when it is impossible to avoid appending
399  * a heap TID. It arranges to leave the left page very full. This
400  * maximizes space utilization in cases where tuples with the same
401  * attribute values span many pages. Newly inserted duplicates will tend
402  * to have higher heap TID values, so we'll end up splitting to the right
403  * consistently. (Single value strategy is harmless though not
404  * particularly useful with !heapkeyspace indexes.)
405  */
406  else if (strategy == SPLIT_MANY_DUPLICATES)
407  {
408  Assert(state.is_leaf);
409  /* Shouldn't try to truncate away extra user attributes */
410  Assert(perfectpenalty ==
411  IndexRelationGetNumberOfKeyAttributes(state.rel));
412  /* No need to resort splits -- no change in fillfactormult/deltas */
413  state.interval = state.nsplits;
414  }
415  else if (strategy == SPLIT_SINGLE_VALUE)
416  {
417  Assert(state.is_leaf);
418  /* Split near the end of the page */
419  usemult = true;
420  fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
421  /* Resort split points with new delta */
422  _bt_deltasortsplits(&state, fillfactormult, usemult);
423  /* Appending a heap TID is unavoidable, so interval of 1 is fine */
424  state.interval = 1;
425  }
426 
427  /*
428  * Search among acceptable split points (using final split interval) for
429  * the entry that has the lowest penalty, and is therefore expected to
430  * maximize fan-out. Sets *newitemonleft for us.
431  */
432  foundfirstright = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
433  strategy);
434  pfree(state.splits);
435 
436  return foundfirstright;
437 }
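
Usage note: _bt_split() is the sole caller. A minimal sketch of the call site, with variable names assumed rather than quoted from nbtinsert.c:

    bool         newitemonleft;
    OffsetNumber firstright;

    /* pick the first tuple that will move to the new right page */
    firstright = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
                                  newitem, &newitemonleft);

As a worked example of the initial split interval above: with 400 legal split points on a leaf page, Min(Max(1, 400 * 0.05), MAX_LEAF_INTERVAL) yields 9, since MAX_LEAF_INTERVAL is defined as 9 in nbtsplitloc.c.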

◆ _bt_finish_split()

void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)

Definition at line 2172 of file nbtinsert.c.

References _bt_getbuf(), _bt_insert_parent(), _bt_relbuf(), Assert, BT_WRITE, BTMetaPageData::btm_root, BTPageGetMeta, BTPageOpaqueData::btpo_next, BTREE_METAPAGE, BufferGetBlockNumber(), BufferGetPage, DEBUG1, elog, P_INCOMPLETE_SPLIT, P_LEFTMOST, P_RIGHTMOST, and PageGetSpecialPointer.

Referenced by _bt_getstackbuf(), _bt_moveright(), and _bt_stepright().

2173 {
2174  Page lpage = BufferGetPage(lbuf);
2175  BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
2176  Buffer rbuf;
2177  Page rpage;
2178  BTPageOpaque rpageop;
2179  bool was_root;
2180  bool was_only;
2181 
2182  Assert(P_INCOMPLETE_SPLIT(lpageop));
2183 
2184  /* Lock right sibling, the one missing the downlink */
2185  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2186  rpage = BufferGetPage(rbuf);
2187  rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
2188 
2189  /* Could this be a root split? */
2190  if (!stack)
2191  {
2192  Buffer metabuf;
2193  Page metapg;
2194  BTMetaPageData *metad;
2195 
2196  /* acquire lock on the metapage */
2197  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2198  metapg = BufferGetPage(metabuf);
2199  metad = BTPageGetMeta(metapg);
2200 
2201  was_root = (metad->btm_root == BufferGetBlockNumber(lbuf));
2202 
2203  _bt_relbuf(rel, metabuf);
2204  }
2205  else
2206  was_root = false;
2207 
2208  /* Was this the only page on the level before split? */
2209  was_only = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2210 
2211  elog(DEBUG1, "finishing incomplete split of %u/%u",
2212  BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2213 
2214  _bt_insert_parent(rel, lbuf, rbuf, stack, was_root, was_only);
2215 }
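
Usage note: callers such as _bt_moveright() notice an interrupted split via the BTP_INCOMPLETE_SPLIT flag and complete it before doing anything else with the page. A hedged sketch of that pattern (lock upgrade and relocking details omitted; buf and stack assumed from the caller's context):

    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(BufferGetPage(buf));

    if (P_INCOMPLETE_SPLIT(opaque))
    {
        /* buf must be write-locked; the split machinery releases it */
        _bt_finish_split(rel, buf, stack);
    }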

◆ _bt_first()

bool _bt_first(IndexScanDesc scan, ScanDirection dir)

Definition at line 853 of file nbtsearch.c.

References _bt_binsrch(), _bt_drop_lock_and_maybe_pin(), _bt_endpoint(), _bt_freestack(), _bt_initialize_more_data(), _bt_metaversion(), _bt_parallel_done(), _bt_parallel_readpage(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readpage(), _bt_search(), _bt_steppage(), Assert, BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosInvalidate, BTScanPosIsValid, buf, BTScanPosData::buf, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferIsValid, cur, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, DatumGetPointer, elog, ERROR, get_opfamily_proc(), i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, InvalidBlockNumber, InvalidOid, InvalidStrategy, BTScanPosData::itemIndex, BTScanPosData::items, BTScanOpaqueData::keyData, LockBuffer(), BTScanOpaqueData::numberOfKeys, OffsetNumberPrev, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockPage(), PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, ScanKeyData::sk_subtype, status(), IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

854 {
855  Relation rel = scan->indexRelation;
856  BTScanOpaque so = (BTScanOpaque) scan->opaque;
857  Buffer buf;
858  BTStack stack;
859  OffsetNumber offnum;
860  StrategyNumber strat;
861  bool nextkey;
862  bool goback;
863  BTScanInsertData inskey;
864  ScanKey startKeys[INDEX_MAX_KEYS];
865  ScanKeyData notnullkeys[INDEX_MAX_KEYS];
866  int keysCount = 0;
867  int i;
868  bool status = true;
869  StrategyNumber strat_total;
870  BTScanPosItem *currItem;
871  BlockNumber blkno;
872 
873 
874  Assert(!BTScanPosIsValid(so->currPos));
875 
876  pgstat_count_index_scan(rel);
877  /*
878  * Examine the scan keys and eliminate any redundant keys; also mark the
879  * keys that must be matched to continue the scan.
880  */
881  _bt_preprocess_keys(scan);
882 
883  /*
884  * Quit now if _bt_preprocess_keys() discovered that the scan keys can
885  * never be satisfied (eg, x == 1 AND x > 2).
886  */
887  if (!so->qual_ok)
888  return false;
889 
890  /*
891  * For parallel scans, get the starting page from shared state. If the
892  * scan has not started, proceed to find the first leaf page in the usual
893  * way while keeping other participating processes waiting. If the scan
894  * has already begun, use the page number from the shared structure.
895  */
896  if (scan->parallel_scan != NULL)
897  {
898  status = _bt_parallel_seize(scan, &blkno);
899  if (!status)
900  return false;
901  else if (blkno == P_NONE)
902  {
903  _bt_parallel_done(scan);
904  return false;
905  }
906  else if (blkno != InvalidBlockNumber)
907  {
908  if (!_bt_parallel_readpage(scan, blkno, dir))
909  return false;
910  goto readcomplete;
911  }
912  }
913 
914  /*----------
915  * Examine the scan keys to discover where we need to start the scan.
916  *
917  * We want to identify the keys that can be used as starting boundaries;
918  * these are =, >, or >= keys for a forward scan or =, <, <= keys for
919  * a backwards scan. We can use keys for multiple attributes so long as
920  * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
921  * a > or < boundary or find an attribute with no boundary (which can be
922  * thought of as the same as "> -infinity"), we can't use keys for any
923  * attributes to its right, because it would break our simplistic notion
924  * of what initial positioning strategy to use.
925  *
926  * When the scan keys include cross-type operators, _bt_preprocess_keys
927  * may not be able to eliminate redundant keys; in such cases we will
928  * arbitrarily pick a usable one for each attribute. This is correct
929  * but possibly not optimal behavior. (For example, with keys like
930  * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
931  * x=5 would be more efficient.) Since the situation only arises given
932  * a poorly-worded query plus an incomplete opfamily, live with it.
933  *
934  * When both equality and inequality keys appear for a single attribute
935  * (again, only possible when cross-type operators appear), we *must*
936  * select one of the equality keys for the starting point, because
937  * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
938  * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
939  * start at x=4, we will fail and stop before reaching x=10. If multiple
940  * equality quals survive preprocessing, however, it doesn't matter which
941  * one we use --- by definition, they are either redundant or
942  * contradictory.
943  *
944  * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
945  * If the index stores nulls at the end of the index we'll be starting
946  * from, and we have no boundary key for the column (which means the key
947  * we deduced NOT NULL from is an inequality key that constrains the other
948  * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
949  * use as a boundary key. If we didn't do this, we might find ourselves
950  * traversing a lot of null entries at the start of the scan.
951  *
952  * In this loop, row-comparison keys are treated the same as keys on their
953  * first (leftmost) columns. We'll add on lower-order columns of the row
954  * comparison below, if possible.
955  *
956  * The selected scan keys (at most one per index column) are remembered by
957  * storing their addresses into the local startKeys[] array.
958  *----------
959  */
960  strat_total = BTEqualStrategyNumber;
961  if (so->numberOfKeys > 0)
962  {
963  AttrNumber curattr;
964  ScanKey chosen;
965  ScanKey impliesNN;
966  ScanKey cur;
967 
968  /*
969  * chosen is the so-far-chosen key for the current attribute, if any.
970  * We don't cast the decision in stone until we reach keys for the
971  * next attribute.
972  */
973  curattr = 1;
974  chosen = NULL;
975  /* Also remember any scankey that implies a NOT NULL constraint */
976  impliesNN = NULL;
977 
978  /*
979  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
980  * pass to handle after-last-key processing. Actual exit from the
981  * loop is at one of the "break" statements below.
982  */
983  for (cur = so->keyData, i = 0;; cur++, i++)
984  {
985  if (i >= so->numberOfKeys || cur->sk_attno != curattr)
986  {
987  /*
988  * Done looking at keys for curattr. If we didn't find a
989  * usable boundary key, see if we can deduce a NOT NULL key.
990  */
991  if (chosen == NULL && impliesNN != NULL &&
992  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
993  ScanDirectionIsForward(dir) :
994  ScanDirectionIsBackward(dir)))
995  {
996  /* Yes, so build the key in notnullkeys[keysCount] */
997  chosen = &notnullkeys[keysCount];
998  ScanKeyEntryInitialize(chosen,
999  (SK_SEARCHNOTNULL | SK_ISNULL |
1000  (impliesNN->sk_flags &
1001  (SK_BT_DESC | SK_BT_NULLS_FIRST))),
1002  curattr,
1003  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1004  BTGreaterStrategyNumber :
1005  BTLessStrategyNumber),
1006  InvalidOid,
1007  InvalidOid,
1008  InvalidOid,
1009  (Datum) 0);
1010  }
1011 
1012  /*
1013  * If we still didn't find a usable boundary key, quit; else
1014  * save the boundary key pointer in startKeys.
1015  */
1016  if (chosen == NULL)
1017  break;
1018  startKeys[keysCount++] = chosen;
1019 
1020  /*
1021  * Adjust strat_total, and quit if we have stored a > or <
1022  * key.
1023  */
1024  strat = chosen->sk_strategy;
1025  if (strat != BTEqualStrategyNumber)
1026  {
1027  strat_total = strat;
1028  if (strat == BTGreaterStrategyNumber ||
1029  strat == BTLessStrategyNumber)
1030  break;
1031  }
1032 
1033  /*
1034  * Done if that was the last attribute, or if next key is not
1035  * in sequence (implying no boundary key is available for the
1036  * next attribute).
1037  */
1038  if (i >= so->numberOfKeys ||
1039  cur->sk_attno != curattr + 1)
1040  break;
1041 
1042  /*
1043  * Reset for next attr.
1044  */
1045  curattr = cur->sk_attno;
1046  chosen = NULL;
1047  impliesNN = NULL;
1048  }
1049 
1050  /*
1051  * Can we use this key as a starting boundary for this attr?
1052  *
1053  * If not, does it imply a NOT NULL constraint? (Because
1054  * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1055  * *any* inequality key works for that; we need not test.)
1056  */
1057  switch (cur->sk_strategy)
1058  {
1059  case BTLessStrategyNumber:
1060  case BTLessEqualStrategyNumber:
1061  if (chosen == NULL)
1062  {
1063  if (ScanDirectionIsBackward(dir))
1064  chosen = cur;
1065  else
1066  impliesNN = cur;
1067  }
1068  break;
1069  case BTEqualStrategyNumber:
1070  /* override any non-equality choice */
1071  chosen = cur;
1072  break;
1073  case BTGreaterEqualStrategyNumber:
1074  case BTGreaterStrategyNumber:
1075  if (chosen == NULL)
1076  {
1077  if (ScanDirectionIsForward(dir))
1078  chosen = cur;
1079  else
1080  impliesNN = cur;
1081  }
1082  break;
1083  }
1084  }
1085  }
1086 
1087  /*
1088  * If we found no usable boundary keys, we have to start from one end of
1089  * the tree. Walk down that edge to the first or last key, and scan from
1090  * there.
1091  */
1092  if (keysCount == 0)
1093  {
1094  bool match;
1095 
1096  match = _bt_endpoint(scan, dir);
1097 
1098  if (!match)
1099  {
1100  /* No match, so mark (parallel) scan finished */
1101  _bt_parallel_done(scan);
1102  }
1103 
1104  return match;
1105  }
1106 
1107  /*
1108  * We want to start the scan somewhere within the index. Set up an
1109  * insertion scankey we can use to search for the boundary point we
1110  * identified above. The insertion scankey is built using the keys
1111  * identified by startKeys[]. (Remaining insertion scankey fields are
1112  * initialized after initial-positioning strategy is finalized.)
1113  */
1114  Assert(keysCount <= INDEX_MAX_KEYS);
1115  for (i = 0; i < keysCount; i++)
1116  {
1117  ScanKey cur = startKeys[i];
1118 
1119  Assert(cur->sk_attno == i + 1);
1120 
1121  if (cur->sk_flags & SK_ROW_HEADER)
1122  {
1123  /*
1124  * Row comparison header: look to the first row member instead.
1125  *
1126  * The member scankeys are already in insertion format (ie, they
1127  * have sk_func = 3-way-comparison function), but we have to watch
1128  * out for nulls, which _bt_preprocess_keys didn't check. A null
1129  * in the first row member makes the condition unmatchable, just
1130  * like qual_ok = false.
1131  */
1132  ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1133 
1134  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1135  if (subkey->sk_flags & SK_ISNULL)
1136  {
1137  _bt_parallel_done(scan);
1138  return false;
1139  }
1140  memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1141 
1142  /*
1143  * If the row comparison is the last positioning key we accepted,
1144  * try to add additional keys from the lower-order row members.
1145  * (If we accepted independent conditions on additional index
1146  * columns, we use those instead --- doesn't seem worth trying to
1147  * determine which is more restrictive.) Note that this is OK
1148  * even if the row comparison is of ">" or "<" type, because the
1149  * condition applied to all but the last row member is effectively
1150  * ">=" or "<=", and so the extra keys don't break the positioning
1151  * scheme. But, by the same token, if we aren't able to use all
1152  * the row members, then the part of the row comparison that we
1153  * did use has to be treated as just a ">=" or "<=" condition, and
1154  * so we'd better adjust strat_total accordingly.
1155  */
1156  if (i == keysCount - 1)
1157  {
1158  bool used_all_subkeys = false;
1159 
1160  Assert(!(subkey->sk_flags & SK_ROW_END));
1161  for (;;)
1162  {
1163  subkey++;
1164  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1165  if (subkey->sk_attno != keysCount + 1)
1166  break; /* out-of-sequence, can't use it */
1167  if (subkey->sk_strategy != cur->sk_strategy)
1168  break; /* wrong direction, can't use it */
1169  if (subkey->sk_flags & SK_ISNULL)
1170  break; /* can't use null keys */
1171  Assert(keysCount < INDEX_MAX_KEYS);
1172  memcpy(inskey.scankeys + keysCount, subkey,
1173  sizeof(ScanKeyData));
1174  keysCount++;
1175  if (subkey->sk_flags & SK_ROW_END)
1176  {
1177  used_all_subkeys = true;
1178  break;
1179  }
1180  }
1181  if (!used_all_subkeys)
1182  {
1183  switch (strat_total)
1184  {
1185  case BTLessStrategyNumber:
1186  strat_total = BTLessEqualStrategyNumber;
1187  break;
1188  case BTGreaterStrategyNumber:
1189  strat_total = BTGreaterEqualStrategyNumber;
1190  break;
1191  }
1192  }
1193  break; /* done with outer loop */
1194  }
1195  }
1196  else
1197  {
1198  /*
1199  * Ordinary comparison key. Transform the search-style scan key
1200  * to an insertion scan key by replacing the sk_func with the
1201  * appropriate btree comparison function.
1202  *
1203  * If scankey operator is not a cross-type comparison, we can use
1204  * the cached comparison function; otherwise gotta look it up in
1205  * the catalogs. (That can't lead to infinite recursion, since no
1206  * indexscan initiated by syscache lookup will use cross-data-type
1207  * operators.)
1208  *
1209  * We support the convention that sk_subtype == InvalidOid means
1210  * the opclass input type; this is a hack to simplify life for
1211  * ScanKeyInit().
1212  */
1213  if (cur->sk_subtype == rel->rd_opcintype[i] ||
1214  cur->sk_subtype == InvalidOid)
1215  {
1216  FmgrInfo *procinfo;
1217 
1218  procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1219  ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1220  cur->sk_flags,
1221  cur->sk_attno,
1222  InvalidStrategy,
1223  cur->sk_subtype,
1224  cur->sk_collation,
1225  procinfo,
1226  cur->sk_argument);
1227  }
1228  else
1229  {
1230  RegProcedure cmp_proc;
1231 
1232  cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1233  rel->rd_opcintype[i],
1234  cur->sk_subtype,
1235  BTORDER_PROC);
1236  if (!RegProcedureIsValid(cmp_proc))
1237  elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1238  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1239  cur->sk_attno, RelationGetRelationName(rel));
1240  ScanKeyEntryInitialize(inskey.scankeys + i,
1241  cur->sk_flags,
1242  cur->sk_attno,
1243  InvalidStrategy,
1244  cur->sk_subtype,
1245  cur->sk_collation,
1246  cmp_proc,
1247  cur->sk_argument);
1248  }
1249  }
1250  }
1251 
1252  /*----------
1253  * Examine the selected initial-positioning strategy to determine exactly
1254  * where we need to start the scan, and set flag variables to control the
1255  * code below.
1256  *
1257  * If nextkey = false, _bt_search and _bt_binsrch will locate the first
1258  * item >= scan key. If nextkey = true, they will locate the first
1259  * item > scan key.
1260  *
1261  * If goback = true, we will then step back one item, while if
1262  * goback = false, we will start the scan on the located item.
1263  *----------
1264  */
1265  switch (strat_total)
1266  {
1267  case BTLessStrategyNumber:
1268 
1269  /*
1270  * Find first item >= scankey, then back up one to arrive at last
1271  * item < scankey. (Note: this positioning strategy is only used
1272  * for a backward scan, so that is always the correct starting
1273  * position.)
1274  */
1275  nextkey = false;
1276  goback = true;
1277  break;
1278 
1279  case BTLessEqualStrategyNumber:
1280 
1281  /*
1282  * Find first item > scankey, then back up one to arrive at last
1283  * item <= scankey. (Note: this positioning strategy is only used
1284  * for a backward scan, so that is always the correct starting
1285  * position.)
1286  */
1287  nextkey = true;
1288  goback = true;
1289  break;
1290 
1291  case BTEqualStrategyNumber:
1292 
1293  /*
1294  * If a backward scan was specified, need to start with last equal
1295  * item not first one.
1296  */
1297  if (ScanDirectionIsBackward(dir))
1298  {
1299  /*
1300  * This is the same as the <= strategy. We will check at the
1301  * end whether the found item is actually =.
1302  */
1303  nextkey = true;
1304  goback = true;
1305  }
1306  else
1307  {
1308  /*
1309  * This is the same as the >= strategy. We will check at the
1310  * end whether the found item is actually =.
1311  */
1312  nextkey = false;
1313  goback = false;
1314  }
1315  break;
1316 
1317  case BTGreaterEqualStrategyNumber:
1318 
1319  /*
1320  * Find first item >= scankey. (This is only used for forward
1321  * scans.)
1322  */
1323  nextkey = false;
1324  goback = false;
1325  break;
1326 
1327  case BTGreaterStrategyNumber:
1328 
1329  /*
1330  * Find first item > scankey. (This is only used for forward
1331  * scans.)
1332  */
1333  nextkey = true;
1334  goback = false;
1335  break;
1336 
1337  default:
1338  /* can't get here, but keep compiler quiet */
1339  elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1340  return false;
1341  }
1342 
1343  /* Initialize remaining insertion scan key fields */
1344  _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1345  inskey.anynullkeys = false; /* unused */
1346  inskey.nextkey = nextkey;
1347  inskey.pivotsearch = false;
1348  inskey.scantid = NULL;
1349  inskey.keysz = keysCount;
1350 
1351  /*
1352  * Use the manufactured insertion scan key to descend the tree and
1353  * position ourselves on the target leaf page.
1354  */
1355  stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);
1356 
1357  /* don't need to keep the stack around... */
1358  _bt_freestack(stack);
1359 
1360  if (!BufferIsValid(buf))
1361  {
1362  /*
1363  * We only get here if the index is completely empty. Lock relation
1364  * because nothing finer to lock exists.
1365  */
1366  PredicateLockRelation(rel, scan->xs_snapshot);
1367 
1368  /*
1369  * mark parallel scan as done, so that all the workers can finish
1370  * their scan
1371  */
1372  _bt_parallel_done(scan);
1373  BTScanPosInvalidate(so->currPos);
1374 
1375  return false;
1376  }
1377  else
1378  PredicateLockPage(rel, BufferGetBlockNumber(buf),
1379  scan->xs_snapshot);
1380 
1381  _bt_initialize_more_data(so, dir);
1382 
1383  /* position to the precise item on the page */
1384  offnum = _bt_binsrch(rel, &inskey, buf);
1385 
1386  /*
1387  * If nextkey = false, we are positioned at the first item >= scan key, or
1388  * possibly at the end of a page on which all the existing items are less
1389  * than the scan key and we know that everything on later pages is greater
1390  * than or equal to scan key.
1391  *
1392  * If nextkey = true, we are positioned at the first item > scan key, or
1393  * possibly at the end of a page on which all the existing items are less
1394  * than or equal to the scan key and we know that everything on later
1395  * pages is greater than scan key.
1396  *
1397  * The actually desired starting point is either this item or the prior
1398  * one, or in the end-of-page case it's the first item on the next page or
1399  * the last item on this page. Adjust the starting offset if needed. (If
1400  * this results in an offset before the first item or after the last one,
1401  * _bt_readpage will report no items found, and then we'll step to the
1402  * next page as needed.)
1403  */
1404  if (goback)
1405  offnum = OffsetNumberPrev(offnum);
1406 
1407  /* remember which buffer we have pinned, if any */
1408  Assert(!BTScanPosIsValid(so->currPos));
1409  so->currPos.buf = buf;
1410 
1411  /*
1412  * Now load data from the first page of the scan.
1413  */
1414  if (!_bt_readpage(scan, dir, offnum))
1415  {
1416  /*
1417  * There's no actually-matching data on this page. Try to advance to
1418  * the next page. Return false if there's no matching data at all.
1419  */
1420  LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1421  if (!_bt_steppage(scan, dir))
1422  return false;
1423  }
1424  else
1425  {
1426  /* Drop the lock, and maybe the pin, on the current page */
1427  _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1428  }
1429 
1430 readcomplete:
1431  /* OK, itemIndex says what to return */
1432  currItem = &so->currPos.items[so->currPos.itemIndex];
1433  scan->xs_heaptid = currItem->heapTid;
1434  if (scan->xs_want_itup)
1435  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1436 
1437  return true;
1438 }
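
Usage note: btgettuple() and btgetbitmap() call _bt_first() once to position the scan, then advance with _bt_next(). A rough sketch of the btgettuple() dispatch (simplified):

    if (!BTScanPosIsValid(so->currPos))
        res = _bt_first(scan, dir);
    else
        res = _bt_next(scan, dir);

In summary, the strat_total switch above maps < to (nextkey=false, goback=true), <= and backward = to (true, true), >= and forward = to (false, false), and > to (true, false).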

◆ _bt_form_posting()

IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)

Definition at line 601 of file nbtdedup.c.

References Assert, BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTreeTupleSetPosting(), INDEX_ALT_TID_MASK, INDEX_SIZE_MASK, IndexTupleSize, ItemPointerCopy, ItemPointerIsValid, MAXALIGN, palloc0(), PG_UINT16_MAX, IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_dedup_finish_pending(), _bt_sort_dedup_finish_pending(), and bt_posting_plain_tuple().

602 {
603  uint32 keysize,
604  newsize;
605  IndexTuple itup;
606 
607  if (BTreeTupleIsPosting(base))
608  keysize = BTreeTupleGetPostingOffset(base);
609  else
610  keysize = IndexTupleSize(base);
611 
612  Assert(!BTreeTupleIsPivot(base));
613  Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
614  Assert(keysize == MAXALIGN(keysize));
615 
616  /* Determine final size of new tuple */
617  if (nhtids > 1)
618  newsize = MAXALIGN(keysize +
619  nhtids * sizeof(ItemPointerData));
620  else
621  newsize = keysize;
622 
623  Assert(newsize <= INDEX_SIZE_MASK);
624  Assert(newsize == MAXALIGN(newsize));
625 
626  /* Allocate memory using palloc0() (matches index_form_tuple()) */
627  itup = palloc0(newsize);
628  memcpy(itup, base, keysize);
629  itup->t_info &= ~INDEX_SIZE_MASK;
630  itup->t_info |= newsize;
631  if (nhtids > 1)
632  {
633  /* Form posting list tuple */
634  BTreeTupleSetPosting(itup, nhtids, keysize);
635  memcpy(BTreeTupleGetPosting(itup), htids,
636  sizeof(ItemPointerData) * nhtids);
637  Assert(_bt_posting_valid(itup));
638  }
639  else
640  {
641  /* Form standard non-pivot tuple */
642  itup->t_info &= ~INDEX_ALT_TID_MASK;
643  ItemPointerCopy(htids, &itup->t_tid);
644  Assert(ItemPointerIsValid(&itup->t_tid));
645  }
646 
647  return itup;
648 }
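
Usage note: a minimal sketch of forming a posting list tuple during deduplication, assuming itup1 and itup2 are duplicates with equal key values whose heap TIDs are already in ascending order (names hypothetical):

    ItemPointerData htids[2];
    IndexTuple      posting;

    htids[0] = itup1->t_tid;
    htids[1] = itup2->t_tid;
    posting = _bt_form_posting(itup1, htids, 2);  /* key values copied from itup1 */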

◆ _bt_freestack()

void _bt_freestack(BTStack stack)

Definition at line 175 of file nbtutils.c.

References BTStackData::bts_parent, and pfree().

Referenced by _bt_doinsert(), _bt_first(), and bt_rootdescend().

176 {
177  BTStack ostack;
178 
179  while (stack != NULL)
180  {
181  ostack = stack;
182  stack = stack->bts_parent;
183  pfree(ostack);
184  }
185 }
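
Usage note: the descent stack comes from _bt_search(); callers free it as soon as parent pages no longer need to be revisited. A minimal sketch of the pattern (snapshot passed as NULL, as in the insertion paths; names assumed):

    BTStack stack = _bt_search(rel, itup_key, &buf, BT_WRITE, NULL);
    /* ... work that may consult stack to re-find parent pages ... */
    _bt_freestack(stack);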

◆ _bt_get_endpoint()

Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, Snapshot snapshot)

Definition at line 2296 of file nbtsearch.c.

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), buf, BufferGetPage, BufferIsValid, elog, ereport, errcode(), errmsg_internal(), ERROR, InvalidBuffer, BTPageOpaqueData::level, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, RelationGetRelationName, and TestForOldSnapshot().

Referenced by _bt_endpoint(), and _bt_insert_parent().

2298 {
2299  Buffer buf;
2300  Page page;
2301  BTPageOpaque opaque;
2302  OffsetNumber offnum;
2303  BlockNumber blkno;
2304  IndexTuple itup;
2305 
2306  /*
2307  * If we are looking for a leaf page, okay to descend from fast root;
2308  * otherwise better descend from true root. (There is no point in being
2309  * smarter about intermediate levels.)
2310  */
2311  if (level == 0)
2312  buf = _bt_getroot(rel, BT_READ);
2313  else
2314  buf = _bt_gettrueroot(rel);
2315 
2316  if (!BufferIsValid(buf))
2317  return InvalidBuffer;
2318 
2319  page = BufferGetPage(buf);
2320  TestForOldSnapshot(snapshot, rel, page);
2321  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2322 
2323  for (;;)
2324  {
2325  /*
2326  * If we landed on a deleted page, step right to find a live page
2327  * (there must be one). Also, if we want the rightmost page, step
2328  * right if needed to get to it (this could happen if the page split
2329  * since we obtained a pointer to it).
2330  */
2331  while (P_IGNORE(opaque) ||
2332  (rightmost && !P_RIGHTMOST(opaque)))
2333  {
2334  blkno = opaque->btpo_next;
2335  if (blkno == P_NONE)
2336  elog(ERROR, "fell off the end of index \"%s\"",
2337  RelationGetRelationName(rel));
2338  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2339  page = BufferGetPage(buf);
2340  TestForOldSnapshot(snapshot, rel, page);
2341  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2342  }
2343 
2344  /* Done? */
2345  if (opaque->btpo.level == level)
2346  break;
2347  if (opaque->btpo.level < level)
2348  ereport(ERROR,
2349  (errcode(ERRCODE_INDEX_CORRUPTED),
2350  errmsg_internal("btree level %u not found in index \"%s\"",
2351  level, RelationGetRelationName(rel))));
2352 
2353  /* Descend to leftmost or rightmost child page */
2354  if (rightmost)
2355  offnum = PageGetMaxOffsetNumber(page);
2356  else
2357  offnum = P_FIRSTDATAKEY(opaque);
2358 
2359  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2360  blkno = BTreeTupleGetDownLink(itup);
2361 
2362  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2363  page = BufferGetPage(buf);
2364  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2365  }
2366 
2367  return buf;
2368 }
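
Usage note: _bt_endpoint() passes level 0 to land directly on the leftmost or rightmost leaf page, depending on scan direction. A hedged sketch:

    buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir),
                           scan->xs_snapshot);
    if (!BufferIsValid(buf))
        return false;    /* index is completely empty */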

◆ _bt_getbuf()

Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)

Definition at line 785 of file nbtpage.c.

References _bt_checkpage(), _bt_log_reuse_page(), _bt_page_recyclable(), _bt_pageinit(), _bt_relbuf(), Assert, BT_WRITE, BTPageOpaqueData::btpo, buf, BufferGetPage, BufferGetPageSize, ConditionalLockBuffer(), DEBUG2, elog, ExclusiveLock, GetFreeIndexPage(), InvalidBlockNumber, LockBuffer(), LockRelationForExtension(), P_NEW, PageGetSpecialPointer, PageIsNew, ReadBuffer(), RELATION_IS_LOCAL, RelationNeedsWAL, ReleaseBuffer(), UnlockRelationForExtension(), BTPageOpaqueData::xact, and XLogStandbyInfoActive.

Referenced by _bt_finish_split(), _bt_getroot(), _bt_getrootheight(), _bt_getstackbuf(), _bt_gettrueroot(), _bt_insertonpg(), _bt_is_page_halfdead(), _bt_killitems(), _bt_lock_branch_parent(), _bt_metaversion(), _bt_moveright(), _bt_newroot(), _bt_pagedel(), _bt_readnextpage(), _bt_split(), _bt_unlink_halfdead_page(), _bt_update_meta_cleanup_info(), _bt_vacuum_needs_cleanup(), and _bt_walk_left().

786 {
787  Buffer buf;
788 
789  if (blkno != P_NEW)
790  {
791  /* Read an existing block of the relation */
792  buf = ReadBuffer(rel, blkno);
793  LockBuffer(buf, access);
794  _bt_checkpage(rel, buf);
795  }
796  else
797  {
798  bool needLock;
799  Page page;
800 
801  Assert(access == BT_WRITE);
802 
803  /*
804  * First see if the FSM knows of any free pages.
805  *
806  * We can't trust the FSM's report unreservedly; we have to check that
807  * the page is still free. (For example, an already-free page could
808  * have been re-used between the time the last VACUUM scanned it and
809  * the time the VACUUM made its FSM updates.)
810  *
811  * In fact, it's worse than that: we can't even assume that it's safe
812  * to take a lock on the reported page. If somebody else has a lock
813  * on it, or even worse our own caller does, we could deadlock. (The
814  * own-caller scenario is actually not improbable. Consider an index
815  * on a serial or timestamp column. Nearly all splits will be at the
816  * rightmost page, so it's entirely likely that _bt_split will call us
817  * while holding a lock on the page most recently acquired from FSM. A
818  * VACUUM running concurrently with the previous split could well have
819  * placed that page back in FSM.)
820  *
821  * To get around that, we ask for only a conditional lock on the
822  * reported page. If we fail, then someone else is using the page,
823  * and we may reasonably assume it's not free. (If we happen to be
824  * wrong, the worst consequence is the page will be lost to use till
825  * the next VACUUM, which is no big problem.)
826  */
827  for (;;)
828  {
829  blkno = GetFreeIndexPage(rel);
830  if (blkno == InvalidBlockNumber)
831  break;
832  buf = ReadBuffer(rel, blkno);
833  if (ConditionalLockBuffer(buf))
834  {
835  page = BufferGetPage(buf);
836  if (_bt_page_recyclable(page))
837  {
838  /*
839  * If we are generating WAL for Hot Standby then create a
840  * WAL record that will allow us to conflict with queries
841  * running on standby, in case they have snapshots older
842  * than btpo.xact. This can only apply if the page does
843  * have a valid btpo.xact value, ie not if it's new. (We
844  * must check that because an all-zero page has no special
845  * space.)
846  */
847  if (XLogStandbyInfoActive() && RelationNeedsWAL(rel) &&
848  !PageIsNew(page))
849  {
850  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
851 
852  _bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
853  }
854 
855  /* Okay to use page. Re-initialize and return it */
856  _bt_pageinit(page, BufferGetPageSize(buf));
857  return buf;
858  }
859  elog(DEBUG2, "FSM returned nonrecyclable page");
860  _bt_relbuf(rel, buf);
861  }
862  else
863  {
864  elog(DEBUG2, "FSM returned nonlockable page");
865  /* couldn't get lock, so just drop pin */
866  ReleaseBuffer(buf);
867  }
868  }
869 
870  /*
871  * Extend the relation by one page.
872  *
873  * We have to use a lock to ensure no one else is extending the rel at
874  * the same time, else we will both try to initialize the same new
875  * page. We can skip locking for new or temp relations, however,
876  * since no one else could be accessing them.
877  */
878  needLock = !RELATION_IS_LOCAL(rel);
879 
880  if (needLock)
881  LockRelationForExtension(rel, ExclusiveLock);
882 
883  buf = ReadBuffer(rel, P_NEW);
884 
885  /* Acquire buffer lock on new page */
886  LockBuffer(buf, BT_WRITE);
887 
888  /*
889  * Release the file-extension lock; it's now OK for someone else to
890  * extend the relation some more. Note that we cannot release this
891  * lock before we have buffer lock on the new page, or we risk a race
892  * condition against btvacuumscan --- see comments therein.
893  */
894  if (needLock)