PostgreSQL Source Code  git master
nbtree.h File Reference
#include "access/amapi.h"
#include "access/itup.h"
#include "access/sdir.h"
#include "access/xlogreader.h"
#include "catalog/pg_am_d.h"
#include "catalog/pg_index.h"
#include "lib/stringinfo.h"
#include "storage/bufmgr.h"
#include "storage/shm_toc.h"
Include dependency graph for nbtree.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures

struct  BTPageOpaqueData
 
struct  BTMetaPageData
 
struct  BTStackData
 
struct  BTScanInsertData
 
struct  BTInsertStateData
 
struct  BTDedupInterval
 
struct  BTDedupStateData
 
struct  BTVacuumPostingData
 
struct  BTScanPosItem
 
struct  BTScanPosData
 
struct  BTArrayKeyInfo
 
struct  BTScanOpaqueData
 
struct  BTOptions
 

Macros

#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */
 
#define BTP_ROOT   (1 << 1) /* root page (has no parent) */
 
#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */
 
#define BTP_META   (1 << 3) /* meta-page */
 
#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */
 
#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */
 
#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples */
 
#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */
 
#define MAX_BT_CYCLE_ID   0xFF7F
 
#define BTPageGetMeta(p)   ((BTMetaPageData *) PageGetContents(p))
 
#define BTREE_METAPAGE   0 /* first page is meta */
 
#define BTREE_MAGIC   0x053162 /* magic number in metapage */
 
#define BTREE_VERSION   4 /* current version number */
 
#define BTREE_MIN_VERSION   2 /* minimum supported version */
 
#define BTREE_NOVAC_VERSION   3 /* version with all meta fields set */
 
#define BTMaxItemSize(page)
 
#define BTMaxItemSizeNoHeapTid(page)
 
#define MaxTIDsPerBTreePage
 
#define BTREE_MIN_FILLFACTOR   10
 
#define BTREE_DEFAULT_FILLFACTOR   90
 
#define BTREE_NONLEAF_FILLFACTOR   70
 
#define BTREE_SINGLEVAL_FILLFACTOR   96
 
#define P_NONE   0
 
#define P_LEFTMOST(opaque)   ((opaque)->btpo_prev == P_NONE)
 
#define P_RIGHTMOST(opaque)   ((opaque)->btpo_next == P_NONE)
 
#define P_ISLEAF(opaque)   (((opaque)->btpo_flags & BTP_LEAF) != 0)
 
#define P_ISROOT(opaque)   (((opaque)->btpo_flags & BTP_ROOT) != 0)
 
#define P_ISDELETED(opaque)   (((opaque)->btpo_flags & BTP_DELETED) != 0)
 
#define P_ISMETA(opaque)   (((opaque)->btpo_flags & BTP_META) != 0)
 
#define P_ISHALFDEAD(opaque)   (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)
 
#define P_IGNORE(opaque)   (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
 
#define P_HAS_GARBAGE(opaque)   (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
 
#define P_INCOMPLETE_SPLIT(opaque)   (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
 
#define P_HIKEY   ((OffsetNumber) 1)
 
#define P_FIRSTKEY   ((OffsetNumber) 2)
 
#define P_FIRSTDATAKEY(opaque)   (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
 
#define INDEX_ALT_TID_MASK   INDEX_AM_RESERVED_BIT
 
#define BT_OFFSET_MASK   0x0FFF
 
#define BT_STATUS_OFFSET_MASK   0xF000
 
#define BT_PIVOT_HEAP_TID_ATTR   0x1000
 
#define BT_IS_POSTING   0x2000
 
#define BTreeTupleGetNAtts(itup, rel)
 
#define BTCommuteStrategyNumber(strat)   (BTMaxStrategyNumber + 1 - (strat))
 
#define BTORDER_PROC   1
 
#define BTSORTSUPPORT_PROC   2
 
#define BTINRANGE_PROC   3
 
#define BTEQUALIMAGE_PROC   4
 
#define BTOPTIONS_PROC   5
 
#define BTNProcs   5
 
#define BT_READ   BUFFER_LOCK_SHARE
 
#define BT_WRITE   BUFFER_LOCK_EXCLUSIVE
 
#define BTScanPosIsPinned(scanpos)
 
#define BTScanPosUnpin(scanpos)
 
#define BTScanPosUnpinIfPinned(scanpos)
 
#define BTScanPosIsValid(scanpos)
 
#define BTScanPosInvalidate(scanpos)
 
#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */
 
#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */
 
#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */
 
#define SK_BT_DESC   (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
 
#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
 
#define BTGetFillFactor(relation)
 
#define BTGetTargetPageFreeSpace(relation)   (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)
 
#define BTGetDeduplicateItems(relation)
 
#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4
 
#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5
 

Typedefs

typedef uint16 BTCycleId
 
typedef struct BTPageOpaqueData BTPageOpaqueData
 
typedef BTPageOpaqueData * BTPageOpaque
 
typedef struct BTMetaPageData BTMetaPageData
 
typedef struct BTStackData BTStackData
 
typedef BTStackData * BTStack
 
typedef struct BTScanInsertData BTScanInsertData
 
typedef BTScanInsertData * BTScanInsert
 
typedef struct BTInsertStateData BTInsertStateData
 
typedef BTInsertStateData * BTInsertState
 
typedef struct BTDedupInterval BTDedupInterval
 
typedef struct BTDedupStateData BTDedupStateData
 
typedef BTDedupStateData * BTDedupState
 
typedef struct BTVacuumPostingData BTVacuumPostingData
 
typedef BTVacuumPostingData * BTVacuumPosting
 
typedef struct BTScanPosItem BTScanPosItem
 
typedef struct BTScanPosData BTScanPosData
 
typedef BTScanPosData * BTScanPos
 
typedef struct BTArrayKeyInfo BTArrayKeyInfo
 
typedef struct BTScanOpaqueData BTScanOpaqueData
 
typedef BTScanOpaqueData * BTScanOpaque
 
typedef struct BTOptions BTOptions
 

Functions

static bool BTreeTupleIsPivot (IndexTuple itup)
 
static bool BTreeTupleIsPosting (IndexTuple itup)
 
static void BTreeTupleSetPosting (IndexTuple itup, uint16 nhtids, int postingoffset)
 
static uint16 BTreeTupleGetNPosting (IndexTuple posting)
 
static uint32 BTreeTupleGetPostingOffset (IndexTuple posting)
 
static ItemPointer BTreeTupleGetPosting (IndexTuple posting)
 
static ItemPointer BTreeTupleGetPostingN (IndexTuple posting, int n)
 
static BlockNumber BTreeTupleGetDownLink (IndexTuple pivot)
 
static void BTreeTupleSetDownLink (IndexTuple pivot, BlockNumber blkno)
 
static void BTreeTupleSetNAtts (IndexTuple itup, uint16 nkeyatts, bool heaptid)
 
static BlockNumber BTreeTupleGetTopParent (IndexTuple leafhikey)
 
static void BTreeTupleSetTopParent (IndexTuple leafhikey, BlockNumber blkno)
 
static ItemPointer BTreeTupleGetHeapTID (IndexTuple itup)
 
static ItemPointer BTreeTupleGetMaxHeapTID (IndexTuple itup)
 
void btbuildempty (Relation index)
 
bool btinsert (Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, struct IndexInfo *indexInfo)
 
IndexScanDesc btbeginscan (Relation rel, int nkeys, int norderbys)
 
Size btestimateparallelscan (void)
 
void btinitparallelscan (void *target)
 
bool btgettuple (IndexScanDesc scan, ScanDirection dir)
 
int64 btgetbitmap (IndexScanDesc scan, TIDBitmap *tbm)
 
void btrescan (IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
 
void btparallelrescan (IndexScanDesc scan)
 
void btendscan (IndexScanDesc scan)
 
void btmarkpos (IndexScanDesc scan)
 
void btrestrpos (IndexScanDesc scan)
 
IndexBulkDeleteResult * btbulkdelete (IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
 
IndexBulkDeleteResult * btvacuumcleanup (IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
bool btcanreturn (Relation index, int attno)
 
bool _bt_parallel_seize (IndexScanDesc scan, BlockNumber *pageno)
 
void _bt_parallel_release (IndexScanDesc scan, BlockNumber scan_page)
 
void _bt_parallel_done (IndexScanDesc scan)
 
void _bt_parallel_advance_array_keys (IndexScanDesc scan)
 
void _bt_dedup_one_page (Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem, Size newitemsz, bool checkingunique)
 
void _bt_dedup_start_pending (BTDedupState state, IndexTuple base, OffsetNumber baseoff)
 
bool _bt_dedup_save_htid (BTDedupState state, IndexTuple itup)
 
Size _bt_dedup_finish_pending (Page newpage, BTDedupState state)
 
IndexTuple _bt_form_posting (IndexTuple base, ItemPointer htids, int nhtids)
 
void _bt_update_posting (BTVacuumPosting vacposting)
 
IndexTuple _bt_swap_posting (IndexTuple newitem, IndexTuple oposting, int postingoff)
 
bool _bt_doinsert (Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, Relation heapRel)
 
void _bt_finish_split (Relation rel, Buffer lbuf, BTStack stack)
 
Buffer _bt_getstackbuf (Relation rel, BTStack stack, BlockNumber child)
 
OffsetNumber _bt_findsplitloc (Relation rel, Page origpage, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
 
void _bt_initmetapage (Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
 
void _bt_update_meta_cleanup_info (Relation rel, TransactionId oldestBtpoXact, float8 numHeapTuples)
 
void _bt_upgrademetapage (Page page)
 
Buffer _bt_getroot (Relation rel, int access)
 
Buffer _bt_gettrueroot (Relation rel)
 
int _bt_getrootheight (Relation rel)
 
void _bt_metaversion (Relation rel, bool *heapkeyspace, bool *allequalimage)
 
void _bt_checkpage (Relation rel, Buffer buf)
 
Buffer _bt_getbuf (Relation rel, BlockNumber blkno, int access)
 
Buffer _bt_relandgetbuf (Relation rel, Buffer obuf, BlockNumber blkno, int access)
 
void _bt_relbuf (Relation rel, Buffer buf)
 
void _bt_lockbuf (Relation rel, Buffer buf, int access)
 
void _bt_unlockbuf (Relation rel, Buffer buf)
 
bool _bt_conditionallockbuf (Relation rel, Buffer buf)
 
void _bt_upgradelockbufcleanup (Relation rel, Buffer buf)
 
void _bt_pageinit (Page page, Size size)
 
bool _bt_page_recyclable (Page page)
 
void _bt_delitems_vacuum (Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
 
void _bt_delitems_delete (Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, Relation heapRel)
 
uint32 _bt_pagedel (Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
 
BTStack _bt_search (Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
 
Buffer _bt_moveright (Relation rel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access, Snapshot snapshot)
 
OffsetNumber _bt_binsrch_insert (Relation rel, BTInsertState insertstate)
 
int32 _bt_compare (Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost, Snapshot snapshot)
 
BTScanInsert _bt_mkscankey (Relation rel, IndexTuple itup)
 
void _bt_freestack (BTStack stack)
 
void _bt_preprocess_array_keys (IndexScanDesc scan)
 
void _bt_start_array_keys (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_advance_array_keys (IndexScanDesc scan, ScanDirection dir)
 
void _bt_mark_array_keys (IndexScanDesc scan)
 
void _bt_restore_array_keys (IndexScanDesc scan)
 
void _bt_preprocess_keys (IndexScanDesc scan)
 
bool _bt_checkkeys (IndexScanDesc scan, IndexTuple tuple, int tupnatts, ScanDirection dir, bool *continuescan)
 
void _bt_killitems (IndexScanDesc scan)
 
BTCycleId _bt_vacuum_cycleid (Relation rel)
 
BTCycleId _bt_start_vacuum (Relation rel)
 
void _bt_end_vacuum (Relation rel)
 
void _bt_end_vacuum_callback (int code, Datum arg)
 
Size BTreeShmemSize (void)
 
void BTreeShmemInit (void)
 
bytea * btoptions (Datum reloptions, bool validate)
 
bool btproperty (Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull)
 
char * btbuildphasename (int64 phasenum)
 
IndexTuple _bt_truncate (Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
 
int _bt_keep_natts_fast (Relation rel, IndexTuple lastleft, IndexTuple firstright)
 
bool _bt_check_natts (Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
 
void _bt_check_third_page (Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
 
bool _bt_allequalimage (Relation rel, bool debugmessage)
 
bool btvalidate (Oid opclassoid)
 
void btadjustmembers (Oid opfamilyoid, Oid opclassoid, List *operators, List *functions)
 
IndexBuildResult * btbuild (Relation heap, Relation index, struct IndexInfo *indexInfo)
 
void _bt_parallel_build_main (dsm_segment *seg, shm_toc *toc)
 

Macro Definition Documentation

◆ BT_IS_POSTING

#define BT_IS_POSTING   0x2000

Definition at line 341 of file nbtree.h.

Referenced by BTreeTupleIsPivot(), BTreeTupleIsPosting(), and BTreeTupleSetPosting().

◆ BT_OFFSET_MASK

#define BT_OFFSET_MASK   0x0FFF

Definition at line 337 of file nbtree.h.

Referenced by _bt_check_natts(), and BTreeTupleGetNPosting().

◆ BT_PIVOT_HEAP_TID_ATTR

#define BT_PIVOT_HEAP_TID_ATTR   0x1000

Definition at line 340 of file nbtree.h.

Referenced by _bt_check_natts(), BTreeTupleGetHeapTID(), and BTreeTupleSetNAtts().

◆ BT_READ

◆ BT_STATUS_OFFSET_MASK

#define BT_STATUS_OFFSET_MASK   0xF000

Definition at line 338 of file nbtree.h.

Referenced by BTreeTupleSetNAtts(), and BTreeTupleSetPosting().

◆ BT_WRITE

◆ BTCommuteStrategyNumber

#define BTCommuteStrategyNumber (   strat)    (BTMaxStrategyNumber + 1 - (strat))

Definition at line 553 of file nbtree.h.

Referenced by _bt_compare_scankey_args(), and _bt_fix_scankey_strategy().

◆ BTEQUALIMAGE_PROC

#define BTEQUALIMAGE_PROC   4

Definition at line 578 of file nbtree.h.

Referenced by _bt_allequalimage(), assignProcTypes(), and btvalidate().

◆ BTGetDeduplicateItems

#define BTGetDeduplicateItems (   relation)
Value:
(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
relation->rd_rel->relam == BTREE_AM_OID), \
((relation)->rd_options ? \
((BTOptions *) (relation)->rd_options)->deduplicate_items : true))
#define AssertMacro(condition)
Definition: c.h:746

Definition at line 976 of file nbtree.h.

Referenced by _bt_findinsertloc(), and _bt_load().

◆ BTGetFillFactor

#define BTGetFillFactor (   relation)
Value:
(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
relation->rd_rel->relam == BTREE_AM_OID), \
(relation)->rd_options ? \
((BTOptions *) (relation)->rd_options)->fillfactor : \
BTREE_DEFAULT_FILLFACTOR)
#define AssertMacro(condition)
Definition: c.h:746

Definition at line 968 of file nbtree.h.

Referenced by _bt_findsplitloc().

◆ BTGetTargetPageFreeSpace

#define BTGetTargetPageFreeSpace (   relation)    (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)

Definition at line 974 of file nbtree.h.

Referenced by _bt_pagestate().

◆ BTINRANGE_PROC

#define BTINRANGE_PROC   3

Definition at line 577 of file nbtree.h.

Referenced by assignProcTypes(), btvalidate(), and transformFrameOffset().

◆ BTMaxItemSize

#define BTMaxItemSize (   page)
Value:
MAXALIGN_DOWN((PageGetPageSize(page) - \
MAXALIGN(SizeOfPageHeaderData + \
3*sizeof(ItemIdData) + \
3*sizeof(ItemPointerData)) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
#define SizeOfPageHeaderData
Definition: bufpage.h:216
#define PageGetPageSize(page)
Definition: bufpage.h:268
#define MAXALIGN(LEN)
Definition: c.h:698
#define MAXALIGN_DOWN(LEN)
Definition: c.h:710

Definition at line 157 of file nbtree.h.

Referenced by _bt_buildadd(), _bt_check_third_page(), _bt_dedup_one_page(), _bt_findinsertloc(), _bt_load(), bt_target_page_check(), and btree_xlog_dedup().

◆ BTMaxItemSizeNoHeapTid

#define BTMaxItemSizeNoHeapTid (   page)
Value:
MAXALIGN_DOWN((PageGetPageSize(page) - \
MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
#define SizeOfPageHeaderData
Definition: bufpage.h:216
#define PageGetPageSize(page)
Definition: bufpage.h:268
#define MAXALIGN(LEN)
Definition: c.h:698
#define MAXALIGN_DOWN(LEN)
Definition: c.h:710

Definition at line 163 of file nbtree.h.

Referenced by _bt_check_third_page(), and bt_target_page_check().

◆ BTNProcs

#define BTNProcs   5

Definition at line 580 of file nbtree.h.

Referenced by bthandler().

◆ BTOPTIONS_PROC

#define BTOPTIONS_PROC   5

Definition at line 579 of file nbtree.h.

Referenced by bthandler(), and btvalidate().

◆ BTORDER_PROC

◆ BTP_DELETED

#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */

Definition at line 74 of file nbtree.h.

Referenced by _bt_unlink_halfdead_page(), and btree_xlog_unlink_page().

◆ BTP_HALF_DEAD

#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */

◆ BTP_HAS_GARBAGE

#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples */

◆ BTP_INCOMPLETE_SPLIT

#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */

◆ BTP_LEAF

#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */

◆ BTP_META

#define BTP_META   (1 << 3) /* meta-page */

Definition at line 75 of file nbtree.h.

Referenced by _bt_initmetapage(), _bt_restore_meta(), and _bt_upgrademetapage().

◆ BTP_ROOT

#define BTP_ROOT   (1 << 1) /* root page (has no parent) */

Definition at line 73 of file nbtree.h.

Referenced by _bt_getroot(), _bt_newroot(), _bt_split(), _bt_uppershutdown(), and btree_xlog_newroot().

◆ BTP_SPLIT_END

#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */

Definition at line 77 of file nbtree.h.

Referenced by _bt_split(), btree_mask(), and btvacuumpage().

◆ BTPageGetMeta

◆ BTREE_DEFAULT_FILLFACTOR

#define BTREE_DEFAULT_FILLFACTOR   90

Definition at line 194 of file nbtree.h.

◆ BTREE_MAGIC

#define BTREE_MAGIC   0x053162 /* magic number in metapage */

◆ BTREE_METAPAGE

◆ BTREE_MIN_FILLFACTOR

#define BTREE_MIN_FILLFACTOR   10

Definition at line 193 of file nbtree.h.

◆ BTREE_MIN_VERSION

#define BTREE_MIN_VERSION   2 /* minimum supported version */

◆ BTREE_NONLEAF_FILLFACTOR

#define BTREE_NONLEAF_FILLFACTOR   70

Definition at line 195 of file nbtree.h.

Referenced by _bt_findsplitloc(), and _bt_pagestate().

◆ BTREE_NOVAC_VERSION

◆ BTREE_SINGLEVAL_FILLFACTOR

#define BTREE_SINGLEVAL_FILLFACTOR   96

Definition at line 196 of file nbtree.h.

Referenced by _bt_findsplitloc(), and _bt_singleval_fillfactor().

◆ BTREE_VERSION

#define BTREE_VERSION   4 /* current version number */

◆ BTreeTupleGetNAtts

#define BTreeTupleGetNAtts (   itup,
  rel 
)
Value:
( \
(BTreeTupleIsPivot(itup)) ? \
( \
) \
: \
)
#define ItemPointerGetOffsetNumberNoCheck(pointer)
Definition: itemptr.h:108
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:348
#define BT_OFFSET_MASK
Definition: nbtree.h:337
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:468

Definition at line 445 of file nbtree.h.

Referenced by _bt_buildadd(), _bt_check_natts(), _bt_checkkeys(), _bt_compare(), _bt_insertonpg(), _bt_mkscankey(), _bt_newroot(), _bt_readpage(), _bt_split(), _bt_uppershutdown(), and bt_target_page_check().

◆ BTScanPosInvalidate

#define BTScanPosInvalidate (   scanpos)
Value:
do { \
(scanpos).currPage = InvalidBlockNumber; \
(scanpos).nextPage = InvalidBlockNumber; \
(scanpos).buf = InvalidBuffer; \
(scanpos).lsn = InvalidXLogRecPtr; \
(scanpos).nextTupleOffset = 0; \
} while (0)
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
#define InvalidBuffer
Definition: buf.h:25
static char * buf
Definition: pg_test_fsync.c:68
#define InvalidBlockNumber
Definition: block.h:33

Definition at line 885 of file nbtree.h.

Referenced by _bt_endpoint(), _bt_first(), _bt_readnextpage(), _bt_steppage(), btbeginscan(), btmarkpos(), btrescan(), and btrestrpos().

◆ BTScanPosIsPinned

#define BTScanPosIsPinned (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BufferIsValid((scanpos).buf) \
)
static char * buf
Definition: pg_test_fsync.c:68
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123

Definition at line 862 of file nbtree.h.

Referenced by _bt_killitems(), _bt_readnextpage(), _bt_readpage(), _bt_steppage(), and btrestrpos().

◆ BTScanPosIsValid

#define BTScanPosIsValid (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BlockNumberIsValid((scanpos).currPage) \
)
static char * buf
Definition: pg_test_fsync.c:68
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123

Definition at line 879 of file nbtree.h.

Referenced by _bt_first(), _bt_killitems(), _bt_steppage(), btendscan(), btgettuple(), btmarkpos(), btrescan(), and btrestrpos().

◆ BTScanPosUnpin

#define BTScanPosUnpin (   scanpos)
Value:
do { \
ReleaseBuffer((scanpos).buf); \
(scanpos).buf = InvalidBuffer; \
} while (0)
#define InvalidBuffer
Definition: buf.h:25
static char * buf
Definition: pg_test_fsync.c:68

Definition at line 868 of file nbtree.h.

◆ BTScanPosUnpinIfPinned

#define BTScanPosUnpinIfPinned (   scanpos)
Value:
do { \
if (BTScanPosIsPinned(scanpos)) \
BTScanPosUnpin(scanpos); \
} while (0)
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:862

Definition at line 873 of file nbtree.h.

Referenced by _bt_readnextpage(), _bt_steppage(), btendscan(), btmarkpos(), btrescan(), and btrestrpos().

◆ BTSORTSUPPORT_PROC

#define BTSORTSUPPORT_PROC   2

Definition at line 576 of file nbtree.h.

Referenced by assignProcTypes(), btvalidate(), FinishSortSupportFunction(), and MJExamineQuals().

◆ INDEX_ALT_TID_MASK

◆ MAX_BT_CYCLE_ID

#define MAX_BT_CYCLE_ID   0xFF7F

Definition at line 88 of file nbtree.h.

Referenced by _bt_start_vacuum().

◆ MaxTIDsPerBTreePage

#define MaxTIDsPerBTreePage
Value:
(int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \
sizeof(ItemPointerData))
#define SizeOfPageHeaderData
Definition: bufpage.h:216

Definition at line 179 of file nbtree.h.

Referenced by _bt_readpage(), bt_check_every_level(), and btgettuple().

◆ P_FIRSTDATAKEY

◆ P_FIRSTKEY

#define P_FIRSTKEY   ((OffsetNumber) 2)

Definition at line 243 of file nbtree.h.

Referenced by _bt_afternewitemoff(), _bt_buildadd(), _bt_newroot(), and _bt_slideleft().

◆ P_HAS_GARBAGE

#define P_HAS_GARBAGE (   opaque)    (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)

◆ P_HIKEY

◆ P_IGNORE

◆ P_INCOMPLETE_SPLIT

◆ P_ISDELETED

◆ P_ISHALFDEAD

#define P_ISHALFDEAD (   opaque)    (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)

◆ P_ISLEAF

◆ P_ISMETA

#define P_ISMETA (   opaque)    (((opaque)->btpo_flags & BTP_META) != 0)

Definition at line 217 of file nbtree.h.

Referenced by _bt_getmeta(), _bt_gettrueroot(), bt_page_items_bytea(), and palloc_btree_page().

◆ P_ISROOT

#define P_ISROOT (   opaque)    (((opaque)->btpo_flags & BTP_ROOT) != 0)

◆ P_LEFTMOST

#define P_LEFTMOST (   opaque)    ((opaque)->btpo_prev == P_NONE)

◆ P_NONE

◆ P_RIGHTMOST

◆ PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN

#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2

Definition at line 987 of file nbtree.h.

Referenced by _bt_spools_heapscan(), and btbuildphasename().

◆ PROGRESS_BTREE_PHASE_LEAF_LOAD

#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5

Definition at line 990 of file nbtree.h.

Referenced by _bt_leafbuild(), and btbuildphasename().

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_1

#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3

Definition at line 988 of file nbtree.h.

Referenced by _bt_leafbuild(), and btbuildphasename().

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_2

#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4

Definition at line 989 of file nbtree.h.

Referenced by _bt_leafbuild(), and btbuildphasename().

◆ SK_BT_DESC

◆ SK_BT_INDOPTION_SHIFT

#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */

Definition at line 955 of file nbtree.h.

Referenced by _bt_fix_scankey_strategy(), and _bt_mkscankey().

◆ SK_BT_NULLS_FIRST

◆ SK_BT_REQBKWD

#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */

Definition at line 954 of file nbtree.h.

Referenced by _bt_check_rowcompare(), _bt_checkkeys(), and _bt_mark_scankey_required().

◆ SK_BT_REQFWD

#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */

Definition at line 953 of file nbtree.h.

Referenced by _bt_check_rowcompare(), _bt_checkkeys(), and _bt_mark_scankey_required().

Typedef Documentation

◆ BTArrayKeyInfo

◆ BTCycleId

typedef uint16 BTCycleId

Definition at line 28 of file nbtree.h.

◆ BTDedupInterval

◆ BTDedupState

Definition at line 766 of file nbtree.h.

◆ BTDedupStateData

◆ BTInsertState

Definition at line 708 of file nbtree.h.

◆ BTInsertStateData

◆ BTMetaPageData

◆ BTOptions

typedef struct BTOptions BTOptions

◆ BTPageOpaque

Definition at line 69 of file nbtree.h.

◆ BTPageOpaqueData

◆ BTScanInsert

Definition at line 669 of file nbtree.h.

◆ BTScanInsertData

◆ BTScanOpaque

Definition at line 946 of file nbtree.h.

◆ BTScanOpaqueData

◆ BTScanPos

Definition at line 860 of file nbtree.h.

◆ BTScanPosData

typedef struct BTScanPosData BTScanPosData

◆ BTScanPosItem

typedef struct BTScanPosItem BTScanPosItem

◆ BTStack

typedef BTStackData* BTStack

Definition at line 607 of file nbtree.h.

◆ BTStackData

typedef struct BTStackData BTStackData

◆ BTVacuumPosting

Definition at line 786 of file nbtree.h.

◆ BTVacuumPostingData

Function Documentation

◆ _bt_advance_array_keys()

bool _bt_advance_array_keys ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 544 of file nbtutils.c.

References _bt_parallel_advance_array_keys(), BTScanOpaqueData::arrayKeyData, BTScanOpaqueData::arrayKeys, BTArrayKeyInfo::cur_elem, BTArrayKeyInfo::elem_values, i, BTArrayKeyInfo::num_elems, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, BTArrayKeyInfo::scan_key, ScanDirectionIsBackward, and ScanKeyData::sk_argument.

Referenced by btgetbitmap(), and btgettuple().

545 {
546  BTScanOpaque so = (BTScanOpaque) scan->opaque;
547  bool found = false;
548  int i;
549 
550  /*
551  * We must advance the last array key most quickly, since it will
552  * correspond to the lowest-order index column among the available
553  * qualifications. This is necessary to ensure correct ordering of output
554  * when there are multiple array keys.
555  */
556  for (i = so->numArrayKeys - 1; i >= 0; i--)
557  {
558  BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
559  ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key];
560  int cur_elem = curArrayKey->cur_elem;
561  int num_elems = curArrayKey->num_elems;
562 
563  if (ScanDirectionIsBackward(dir))
564  {
565  if (--cur_elem < 0)
566  {
567  cur_elem = num_elems - 1;
568  found = false; /* need to advance next array key */
569  }
570  else
571  found = true;
572  }
573  else
574  {
575  if (++cur_elem >= num_elems)
576  {
577  cur_elem = 0;
578  found = false; /* need to advance next array key */
579  }
580  else
581  found = true;
582  }
583 
584  curArrayKey->cur_elem = cur_elem;
585  skey->sk_argument = curArrayKey->elem_values[cur_elem];
586  if (found)
587  break;
588  }
589 
590  /* advance parallel scan */
591  if (scan->parallel_scan != NULL)
592  _bt_parallel_advance_array_keys(scan);
593 
594  return found;
595 }
Datum * elem_values
Definition: nbtree.h:901
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:946
int cur_elem
Definition: nbtree.h:898
int numArrayKeys
Definition: nbtree.h:913
ScanKey arrayKeyData
Definition: nbtree.h:912
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:163
int num_elems
Definition: nbtree.h:900
void _bt_parallel_advance_array_keys(IndexScanDesc scan)
Definition: nbtree.c:769
BTArrayKeyInfo * arrayKeys
Definition: nbtree.h:917
int i
Datum sk_argument
Definition: skey.h:72
int scan_key
Definition: nbtree.h:897

◆ _bt_allequalimage()

bool _bt_allequalimage ( Relation  rel,
bool  debugmessage 
)

Definition at line 2691 of file nbtutils.c.

References BTEQUALIMAGE_PROC, BTSortArrayContext::collation, DatumGetBool, DEBUG1, elog, get_opfamily_proc(), i, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, IsSystemRelation(), ObjectIdGetDatum, OidFunctionCall1Coll(), OidIsValid, RelationData::rd_indcollation, RelationData::rd_opcintype, RelationData::rd_opfamily, and RelationGetRelationName.

Referenced by _bt_leafbuild(), bt_index_check_internal(), and btbuildempty().

2692 {
2693  bool allequalimage = true;
2694 
2695  /* INCLUDE indexes don't support deduplication */
2696  if (IndexRelationGetNumberOfAttributes(rel) !=
2697  IndexRelationGetNumberOfKeyAttributes(rel))
2698  return false;
2699 
2700  /*
2701  * There is no special reason why deduplication cannot work with system
2702  * relations (i.e. with system catalog indexes and TOAST indexes). We
2703  * deem deduplication unsafe for these indexes all the same, since the
2704  * alternative is to force users to always use deduplication, without
2705  * being able to opt out. (ALTER INDEX is not supported with system
2706  * indexes, so users would have no way to set the deduplicate_items
2707  * storage parameter to 'off'.)
2708  */
2709  if (IsSystemRelation(rel))
2710  return false;
2711 
2712  for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++)
2713  {
2714  Oid opfamily = rel->rd_opfamily[i];
2715  Oid opcintype = rel->rd_opcintype[i];
2716  Oid collation = rel->rd_indcollation[i];
2717  Oid equalimageproc;
2718 
2719  equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype,
2720  BTEQUALIMAGE_PROC);
2721 
2722  /*
2723  * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to
2724  * be unsafe. Otherwise, actually call proc and see what it says.
2725  */
2726  if (!OidIsValid(equalimageproc) ||
2727  !DatumGetBool(OidFunctionCall1Coll(equalimageproc, collation,
2728  ObjectIdGetDatum(opcintype))))
2729  {
2730  allequalimage = false;
2731  break;
2732  }
2733  }
2734 
2735  /*
2736  * Don't elog() until here to avoid reporting on a system relation index
2737  * or an INCLUDE index
2738  */
2739  if (debugmessage)
2740  {
2741  if (allequalimage)
2742  elog(DEBUG1, "index \"%s\" can safely use deduplication",
2743  RelationGetRelationName(rel));
2744  else
2745  elog(DEBUG1, "index \"%s\" cannot use deduplication",
2746  RelationGetRelationName(rel));
2747  }
2748 
2749  return allequalimage;
2750 }
#define DEBUG1
Definition: elog.h:25
bool IsSystemRelation(Relation relation)
Definition: catalog.c:68
#define BTEQUALIMAGE_PROC
Definition: nbtree.h:578
unsigned int Oid
Definition: postgres_ext.h:31
#define OidIsValid(objectId)
Definition: c.h:651
Oid * rd_indcollation
Definition: rel.h:199
#define ObjectIdGetDatum(X)
Definition: postgres.h:507
#define DatumGetBool(X)
Definition: postgres.h:393
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:468
#define RelationGetRelationName(relation)
Definition: rel.h:490
Oid * rd_opfamily
Definition: rel.h:189
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:475
Datum OidFunctionCall1Coll(Oid functionId, Oid collation, Datum arg1)
Definition: fmgr.c:1414
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:794
#define elog(elevel,...)
Definition: elog.h:214
int i
Oid * rd_opcintype
Definition: rel.h:190

◆ _bt_binsrch_insert()

OffsetNumber _bt_binsrch_insert ( Relation  rel,
BTInsertState  insertstate 
)

Definition at line 447 of file nbtsearch.c.

References _bt_binsrch_posting(), _bt_compare(), Assert, BTInsertStateData::bounds_valid, BTInsertStateData::buf, BufferGetPage, InvalidOffsetNumber, BTInsertStateData::itup_key, sort-test::key, BTInsertStateData::low, BTScanInsertData::nextkey, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber, PageGetSpecialPointer, BTInsertStateData::postingoff, BTScanInsertData::scantid, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_check_unique(), _bt_findinsertloc(), and bt_rootdescend().

448 {
449  BTScanInsert key = insertstate->itup_key;
450  Page page;
451  BTPageOpaque opaque;
452  OffsetNumber low,
453  high,
454  stricthigh;
455  int32 result,
456  cmpval;
457 
458  page = BufferGetPage(insertstate->buf);
459  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
460 
461  Assert(P_ISLEAF(opaque));
462  Assert(!key->nextkey);
463  Assert(insertstate->postingoff == 0);
464 
465  if (!insertstate->bounds_valid)
466  {
467  /* Start new binary search */
468  low = P_FIRSTDATAKEY(opaque);
469  high = PageGetMaxOffsetNumber(page);
470  }
471  else
472  {
473  /* Restore result of previous binary search against same page */
474  low = insertstate->low;
475  high = insertstate->stricthigh;
476  }
477 
478  /* If there are no keys on the page, return the first available slot */
479  if (unlikely(high < low))
480  {
481  /* Caller can't reuse bounds */
482  insertstate->low = InvalidOffsetNumber;
483  insertstate->stricthigh = InvalidOffsetNumber;
484  insertstate->bounds_valid = false;
485  return low;
486  }
487 
488  /*
489  * Binary search to find the first key on the page >= scan key. (nextkey
490  * is always false when inserting).
491  *
492  * The loop invariant is: all slots before 'low' are < scan key, all slots
493  * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
494  * maintained to save additional search effort for caller.
495  *
496  * We can fall out when high == low.
497  */
498  if (!insertstate->bounds_valid)
499  high++; /* establish the loop invariant for high */
500  stricthigh = high; /* high initially strictly higher */
501 
502  cmpval = 1; /* !nextkey comparison value */
503 
504  while (high > low)
505  {
506  OffsetNumber mid = low + ((high - low) / 2);
507 
508  /* We have low <= mid < high, so mid points at a real slot */
509 
510  result = _bt_compare(rel, key, page, mid);
511 
512  if (result >= cmpval)
513  low = mid + 1;
514  else
515  {
516  high = mid;
517  if (result != 0)
518  stricthigh = high;
519  }
520 
521  /*
522  * If tuple at offset located by binary search is a posting list whose
523  * TID range overlaps with caller's scantid, perform posting list
524  * binary search to set postingoff for caller. Caller must split the
525  * posting list when postingoff is set. This should happen
526  * infrequently.
527  */
528  if (unlikely(result == 0 && key->scantid != NULL))
529  insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
530  }
531 
532  /*
533  * On a leaf page, a binary search always returns the first key >= scan
534  * key (at least in !nextkey case), which could be the last slot + 1. This
535  * is also the lower bound of cached search.
536  *
537  * stricthigh may also be the last slot + 1, which prevents caller from
538  * using bounds directly, but is still useful to us if we're called a
539  * second time with cached bounds (cached low will be < stricthigh when
540  * that happens).
541  */
542  insertstate->low = low;
543  insertstate->stricthigh = stricthigh;
544  insertstate->bounds_valid = true;
545 
546  return low;
547 }
bool bounds_valid
Definition: nbtree.h:696
ItemPointer scantid
Definition: nbtree.h:664
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:244
OffsetNumber stricthigh
Definition: nbtree.h:698
OffsetNumber low
Definition: nbtree.h:697
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
signed int int32
Definition: c.h:362
uint16 OffsetNumber
Definition: off.h:24
static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:558
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:644
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define InvalidOffsetNumber
Definition: off.h:26
BTScanInsert itup_key
Definition: nbtree.h:686
#define Assert(condition)
Definition: c.h:745
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define unlikely(x)
Definition: c.h:206
Pointer Page
Definition: bufpage.h:78
#define P_ISLEAF(opaque)
Definition: nbtree.h:214

◆ _bt_check_natts()

bool _bt_check_natts ( Relation  rel,
bool  heapkeyspace,
Page  page,
OffsetNumber  offnum 
)

Definition at line 2465 of file nbtutils.c.

References Assert, BT_OFFSET_MASK, BT_PIVOT_HEAP_TID_ATTR, BTreeTupleGetHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPivot(), BTreeTupleIsPosting(), FirstOffsetNumber, INDEX_MAX_KEYS, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, ItemPointerGetOffsetNumber, ItemPointerGetOffsetNumberNoCheck, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_ISLEAF, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, StaticAssertStmt, and IndexTupleData::t_tid.

Referenced by _bt_compare(), and bt_target_page_check().

2466 {
2467  int16 natts = IndexRelationGetNumberOfAttributes(rel);
2468  int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
2469  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2470  IndexTuple itup;
2471  int tupnatts;
2472 
2473  /*
2474  * We cannot reliably test a deleted or half-dead page, since they have
2475  * dummy high keys
2476  */
2477  if (P_IGNORE(opaque))
2478  return true;
2479 
2480  Assert(offnum >= FirstOffsetNumber &&
2481  offnum <= PageGetMaxOffsetNumber(page));
2482 
2483  /*
2484  * Mask allocated for number of keys in index tuple must be able to fit
2485  * maximum possible number of index attributes
2486  */
2487  StaticAssertStmt(BT_OFFSET_MASK >= INDEX_MAX_KEYS,
2488  "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS");
2489 
2490  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2491  tupnatts = BTreeTupleGetNAtts(itup, rel);
2492 
2493  /* !heapkeyspace indexes do not support deduplication */
2494  if (!heapkeyspace && BTreeTupleIsPosting(itup))
2495  return false;
2496 
2497  /* Posting list tuples should never have "pivot heap TID" bit set */
2498  if (BTreeTupleIsPosting(itup) &&
2499  (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
2500  BT_PIVOT_HEAP_TID_ATTR) != 0)
2501  return false;
2502 
2503  /* INCLUDE indexes do not support deduplication */
2504  if (natts != nkeyatts && BTreeTupleIsPosting(itup))
2505  return false;
2506 
2507  if (P_ISLEAF(opaque))
2508  {
2509  if (offnum >= P_FIRSTDATAKEY(opaque))
2510  {
2511  /*
2512  * Non-pivot tuple should never be explicitly marked as a pivot
2513  * tuple
2514  */
2515  if (BTreeTupleIsPivot(itup))
2516  return false;
2517 
2518  /*
2519  * Leaf tuples that are not the page high key (non-pivot tuples)
2520  * should never be truncated. (Note that tupnatts must have been
2521  * inferred, even with a posting list tuple, because only pivot
2522  * tuples store tupnatts directly.)
2523  */
2524  return tupnatts == natts;
2525  }
2526  else
2527  {
2528  /*
2529  * Rightmost page doesn't contain a page high key, so tuple was
2530  * checked above as ordinary leaf tuple
2531  */
2532  Assert(!P_RIGHTMOST(opaque));
2533 
2534  /*
2535  * !heapkeyspace high key tuple contains only key attributes. Note
2536  * that tupnatts will only have been explicitly represented in
2537  * !heapkeyspace indexes that happen to have non-key attributes.
2538  */
2539  if (!heapkeyspace)
2540  return tupnatts == nkeyatts;
2541 
2542  /* Use generic heapkeyspace pivot tuple handling */
2543  }
2544  }
2545  else /* !P_ISLEAF(opaque) */
2546  {
2547  if (offnum == P_FIRSTDATAKEY(opaque))
2548  {
2549  /*
2550  * The first tuple on any internal page (possibly the first after
2551  * its high key) is its negative infinity tuple. Negative
2552  * infinity tuples are always truncated to zero attributes. They
2553  * are a particular kind of pivot tuple.
2554  */
2555  if (heapkeyspace)
2556  return tupnatts == 0;
2557 
2558  /*
2559  * The number of attributes won't be explicitly represented if the
2560  * negative infinity tuple was generated during a page split that
2561  * occurred with a version of Postgres before v11. There must be
2562  * a problem when there is an explicit representation that is
2563  * non-zero, or when there is no explicit representation and the
2564  * tuple is evidently not a pre-pg_upgrade tuple.
2565  *
2566  * Prior to v11, downlinks always had P_HIKEY as their offset.
2567  * Accept that as an alternative indication of a valid
2568  * !heapkeyspace negative infinity tuple.
2569  */
2570  return tupnatts == 0 ||
2571  ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY;
2572  }
2573  else
2574  {
2575  /*
2576  * !heapkeyspace downlink tuple with separator key contains only
2577  * key attributes. Note that tupnatts will only have been
2578  * explicitly represented in !heapkeyspace indexes that happen to
2579  * have non-key attributes.
2580  */
2581  if (!heapkeyspace)
2582  return tupnatts == nkeyatts;
2583 
2584  /* Use generic heapkeyspace pivot tuple handling */
2585  }
2586 
2587  }
2588 
2589  /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
2590  Assert(heapkeyspace);
2591 
2592  /*
2593  * Explicit representation of the number of attributes is mandatory with
2594  * heapkeyspace index pivot tuples, regardless of whether or not there are
2595  * non-key attributes.
2596  */
2597  if (!BTreeTupleIsPivot(itup))
2598  return false;
2599 
2600  /* Pivot tuple should not use posting list representation (redundant) */
2601  if (BTreeTupleIsPosting(itup))
2602  return false;
2603 
2604  /*
2605  * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
2606  * when any other key attribute is truncated
2607  */
2608  if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
2609  return false;
2610 
2611  /*
2612  * Pivot tuple must have at least one untruncated key attribute (minus
2613  * infinity pivot tuples are the only exception). Pivot tuples can never
2614  * represent that there is a value present for a key attribute that
2615  * exceeds pg_index.indnkeyatts for the index.
2616  */
2617  return tupnatts > 0 && tupnatts <= nkeyatts;
2618 }
signed short int16
Definition: c.h:361
#define ItemPointerGetOffsetNumberNoCheck(pointer)
Definition: itemptr.h:108
#define P_IGNORE(opaque)
Definition: nbtree.h:219
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:348
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:506
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:244
ItemPointerData t_tid
Definition: itup.h:37
#define BT_OFFSET_MASK
Definition: nbtree.h:337
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:445
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:859
#define FirstOffsetNumber
Definition: off.h:27
IndexTupleData * IndexTuple
Definition: itup.h:53
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:468
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:475
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:360
#define Assert(condition)
Definition: c.h:745
#define INDEX_MAX_KEYS
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define ItemPointerGetOffsetNumber(pointer)
Definition: itemptr.h:117
#define P_HIKEY
Definition: nbtree.h:242
#define BT_PIVOT_HEAP_TID_ATTR
Definition: nbtree.h:340
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:213
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
#define P_ISLEAF(opaque)
Definition: nbtree.h:214

◆ _bt_check_third_page()

void _bt_check_third_page ( Relation  rel,
Relation  heap,
bool  needheaptidspace,
Page  page,
IndexTuple  newtup 
)

Definition at line 2633 of file nbtutils.c.

References BTMaxItemSize, BTMaxItemSizeNoHeapTid, BTREE_NOVAC_VERSION, BTREE_VERSION, BTreeTupleGetHeapTID(), elog, ereport, errcode(), errdetail(), errhint(), errmsg(), ERROR, errtableconstraint(), IndexTupleSize, ItemPointerGetBlockNumber, ItemPointerGetOffsetNumber, MAXALIGN, P_ISLEAF, PageGetSpecialPointer, and RelationGetRelationName.

Referenced by _bt_buildadd(), and _bt_findinsertloc().

2635 {
2636  Size itemsz;
2637  BTPageOpaque opaque;
2638 
2639  itemsz = MAXALIGN(IndexTupleSize(newtup));
2640 
2641  /* Double check item size against limit */
2642  if (itemsz <= BTMaxItemSize(page))
2643  return;
2644 
2645  /*
2646  * Tuple is probably too large to fit on page, but it's possible that the
2647  * index uses version 2 or version 3, or that page is an internal page, in
2648  * which case a slightly higher limit applies.
2649  */
2650  if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page))
2651  return;
2652 
2653  /*
2654  * Internal page insertions cannot fail here, because that would mean that
2655  * an earlier leaf level insertion that should have failed didn't
2656  */
2657  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2658  if (!P_ISLEAF(opaque))
2659  elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
2660  itemsz, RelationGetRelationName(rel));
2661 
2662  ereport(ERROR,
2663  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2664  errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
2665  itemsz,
2666  needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
2667  needheaptidspace ? BTMaxItemSize(page) :
2668  BTMaxItemSizeNoHeapTid(page),
2669  RelationGetRelationName(rel)),
2670  errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
2671  ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)),
2672  ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)),
2673  RelationGetRelationName(heap)),
2674  errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
2675  "Consider a function index of an MD5 hash of the value, "
2676  "or use full text indexing."),
2677  errtableconstraint(heap, RelationGetRelationName(rel))));
2678 }
int errhint(const char *fmt,...)
Definition: elog.c:1071
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:506
#define BTREE_VERSION
Definition: nbtree.h:143
#define BTMaxItemSizeNoHeapTid(page)
Definition: nbtree.h:163
int errcode(int sqlerrcode)
Definition: elog.c:610
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
int errtableconstraint(Relation rel, const char *conname)
Definition: relcache.c:5551
#define ERROR
Definition: elog.h:43
int errdetail(const char *fmt,...)
Definition: elog.c:957
#define RelationGetRelationName(relation)
Definition: rel.h:490
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:145
#define ereport(elevel,...)
Definition: elog.h:144
size_t Size
Definition: c.h:473
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define MAXALIGN(LEN)
Definition: c.h:698
#define ItemPointerGetOffsetNumber(pointer)
Definition: itemptr.h:117
#define BTMaxItemSize(page)
Definition: nbtree.h:157
int errmsg(const char *fmt,...)
Definition: elog.c:824
#define elog(elevel,...)
Definition: elog.h:214
#define ItemPointerGetBlockNumber(pointer)
Definition: itemptr.h:98
#define IndexTupleSize(itup)
Definition: itup.h:71
#define P_ISLEAF(opaque)
Definition: nbtree.h:214

◆ _bt_checkkeys()

bool _bt_checkkeys ( IndexScanDesc  scan,
IndexTuple  tuple,
int  tupnatts,
ScanDirection  dir,
bool continuescan 
)

Definition at line 1355 of file nbtutils.c.

References _bt_check_rowcompare(), Assert, BTreeTupleGetNAtts, BTreeTupleIsPivot(), DatumGetBool, FunctionCall2Coll(), index_getattr, IndexScanDescData::indexRelation, sort-test::key, BTScanOpaqueData::keyData, BTScanOpaqueData::numberOfKeys, IndexScanDescData::opaque, RelationGetDescr, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_NULLS_FIRST, SK_BT_REQBKWD, SK_BT_REQFWD, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, SK_ISNULL, SK_ROW_HEADER, SK_SEARCHNOTNULL, SK_SEARCHNULL, and test().

Referenced by _bt_readpage().

1357 {
1358  TupleDesc tupdesc;
1359  BTScanOpaque so;
1360  int keysz;
1361  int ikey;
1362  ScanKey key;
1363 
1364  Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
1365 
1366  *continuescan = true; /* default assumption */
1367 
1368  tupdesc = RelationGetDescr(scan->indexRelation);
1369  so = (BTScanOpaque) scan->opaque;
1370  keysz = so->numberOfKeys;
1371 
1372  for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
1373  {
1374  Datum datum;
1375  bool isNull;
1376  Datum test;
1377 
1378  if (key->sk_attno > tupnatts)
1379  {
1380  /*
1381  * This attribute is truncated (must be high key). The value for
1382  * this attribute in the first non-pivot tuple on the page to the
1383  * right could be any possible value. Assume that truncated
1384  * attribute passes the qual.
1385  */
1386  Assert(ScanDirectionIsForward(dir));
1387  Assert(BTreeTupleIsPivot(tuple));
1388  continue;
1389  }
1390 
1391  /* row-comparison keys need special processing */
1392  if (key->sk_flags & SK_ROW_HEADER)
1393  {
1394  if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir,
1395  continuescan))
1396  continue;
1397  return false;
1398  }
1399 
1400  datum = index_getattr(tuple,
1401  key->sk_attno,
1402  tupdesc,
1403  &isNull);
1404 
1405  if (key->sk_flags & SK_ISNULL)
1406  {
1407  /* Handle IS NULL/NOT NULL tests */
1408  if (key->sk_flags & SK_SEARCHNULL)
1409  {
1410  if (isNull)
1411  continue; /* tuple satisfies this qual */
1412  }
1413  else
1414  {
1415  Assert(key->sk_flags & SK_SEARCHNOTNULL);
1416  if (!isNull)
1417  continue; /* tuple satisfies this qual */
1418  }
1419 
1420  /*
1421  * Tuple fails this qual. If it's a required qual for the current
1422  * scan direction, then we can conclude no further tuples will
1423  * pass, either.
1424  */
1425  if ((key->sk_flags & SK_BT_REQFWD) &&
1426  ScanDirectionIsForward(dir))
1427  *continuescan = false;
1428  else if ((key->sk_flags & SK_BT_REQBKWD) &&
1429  ScanDirectionIsBackward(dir))
1430  *continuescan = false;
1431 
1432  /*
1433  * In any case, this indextuple doesn't match the qual.
1434  */
1435  return false;
1436  }
1437 
1438  if (isNull)
1439  {
1440  if (key->sk_flags & SK_BT_NULLS_FIRST)
1441  {
1442  /*
1443  * Since NULLs are sorted before non-NULLs, we know we have
1444  * reached the lower limit of the range of values for this
1445  * index attr. On a backward scan, we can stop if this qual
1446  * is one of the "must match" subset. We can stop regardless
1447  * of whether the qual is > or <, so long as it's required,
1448  * because it's not possible for any future tuples to pass. On
1449  * a forward scan, however, we must keep going, because we may
1450  * have initially positioned to the start of the index.
1451  */
1452  if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
1453  ScanDirectionIsBackward(dir))
1454  *continuescan = false;
1455  }
1456  else
1457  {
1458  /*
1459  * Since NULLs are sorted after non-NULLs, we know we have
1460  * reached the upper limit of the range of values for this
1461  * index attr. On a forward scan, we can stop if this qual is
1462  * one of the "must match" subset. We can stop regardless of
1463  * whether the qual is > or <, so long as it's required,
1464  * because it's not possible for any future tuples to pass. On
1465  * a backward scan, however, we must keep going, because we
1466  * may have initially positioned to the end of the index.
1467  */
1468  if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
1469  ScanDirectionIsForward(dir))
1470  *continuescan = false;
1471  }
1472 
1473  /*
1474  * In any case, this indextuple doesn't match the qual.
1475  */
1476  return false;
1477  }
1478 
1479  test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
1480  datum, key->sk_argument);
1481 
1482  if (!DatumGetBool(test))
1483  {
1484  /*
1485  * Tuple fails this qual. If it's a required qual for the current
1486  * scan direction, then we can conclude no further tuples will
1487  * pass, either.
1488  *
1489  * Note: because we stop the scan as soon as any required equality
1490  * qual fails, it is critical that equality quals be used for the
1491  * initial positioning in _bt_first() when they are available. See
1492  * comments in _bt_first().
1493  */
1494  if ((key->sk_flags & SK_BT_REQFWD) &&
1495  ScanDirectionIsForward(dir))
1496  *continuescan = false;
1497  else if ((key->sk_flags & SK_BT_REQBKWD) &&
1498  ScanDirectionIsBackward(dir))
1499  *continuescan = false;
1500 
1501  /*
1502  * In any case, this indextuple doesn't match the qual.
1503  */
1504  return false;
1505  }
1506  }
1507 
1508  /* If we get here, the tuple passes all index quals. */
1509  return true;
1510 }
static void test(void)
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:348
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
#define RelationGetDescr(relation)
Definition: rel.h:482
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1152
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:445
Relation indexRelation
Definition: relscan.h:115
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:946
FmgrInfo sk_func
Definition: skey.h:71
#define DatumGetBool(X)
Definition: postgres.h:393
#define SK_SEARCHNOTNULL
Definition: skey.h:122
#define SK_ISNULL
Definition: skey.h:115
#define SK_ROW_HEADER
Definition: skey.h:117
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:957
uintptr_t Datum
Definition: postgres.h:367
static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, ScanDirection dir, bool *continuescan)
Definition: nbtutils.c:1522
#define SK_BT_REQFWD
Definition: nbtree.h:953
int sk_flags
Definition: skey.h:66
#define Assert(condition)
Definition: c.h:745
int numberOfKeys
Definition: nbtree.h:908
#define index_getattr(tup, attnum, tupleDesc, isnull)
Definition: itup.h:100
ScanKey keyData
Definition: nbtree.h:909
Oid sk_collation
Definition: skey.h:70
#define SK_BT_REQBKWD
Definition: nbtree.h:954
Datum sk_argument
Definition: skey.h:72
#define SK_SEARCHNULL
Definition: skey.h:121
AttrNumber sk_attno
Definition: skey.h:67

◆ _bt_checkpage()

void _bt_checkpage ( Relation  rel,
Buffer  buf 
)

Definition at line 726 of file nbtpage.c.

References BufferGetBlockNumber(), BufferGetPage, ereport, errcode(), errhint(), errmsg(), ERROR, MAXALIGN, PageGetSpecialSize, PageIsNew, and RelationGetRelationName.

Referenced by _bt_getbuf(), _bt_relandgetbuf(), _bt_search_insert(), bt_recheck_sibling_links(), btvacuumpage(), and palloc_btree_page().

727 {
728  Page page = BufferGetPage(buf);
729 
730  /*
731  * ReadBuffer verifies that every newly-read page passes
732  * PageHeaderIsValid, which means it either contains a reasonably sane
733  * page header or is all-zero. We have to defend against the all-zero
734  * case, however.
735  */
736  if (PageIsNew(page))
737  ereport(ERROR,
738  (errcode(ERRCODE_INDEX_CORRUPTED),
739  errmsg("index \"%s\" contains unexpected zero page at block %u",
740  RelationGetRelationName(rel),
741  BufferGetBlockNumber(buf)),
742  errhint("Please REINDEX it.")));
743 
744  /*
745  * Additionally check that the special area looks sane.
746  */
747  if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
748  ereport(ERROR,
749  (errcode(ERRCODE_INDEX_CORRUPTED),
750  errmsg("index \"%s\" contains corrupted page at block %u",
751  RelationGetRelationName(rel),
752  BufferGetBlockNumber(buf)),
753  errhint("Please REINDEX it.")));
754 }
int errhint(const char *fmt,...)
Definition: elog.c:1071
int errcode(int sqlerrcode)
Definition: elog.c:610
#define ERROR
Definition: elog.h:43
static char * buf
Definition: pg_test_fsync.c:68
#define RelationGetRelationName(relation)
Definition: rel.h:490
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define ereport(elevel,...)
Definition: elog.h:144
#define MAXALIGN(LEN)
Definition: c.h:698
#define PageGetSpecialSize(page)
Definition: bufpage.h:300
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2661
#define PageIsNew(page)
Definition: bufpage.h:229
int errmsg(const char *fmt,...)
Definition: elog.c:824
Pointer Page
Definition: bufpage.h:78

◆ _bt_compare()

int32 _bt_compare ( Relation  rel,
BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)

Definition at line 644 of file nbtsearch.c.

References _bt_check_natts(), BTScanInsertData::allequalimage, Assert, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPosting(), DatumGetInt32, FunctionCall2Coll(), BTScanInsertData::heapkeyspace, i, index_getattr, IndexRelationGetNumberOfKeyAttributes, INVERT_COMPARE_RESULT, ItemPointerCompare(), BTScanInsertData::keysz, Min, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, BTScanInsertData::pivotsearch, RelationGetDescr, BTScanInsertData::scankeys, BTScanInsertData::scantid, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_binsrch_insert(), _bt_check_unique(), _bt_findinsertloc(), _bt_moveright(), _bt_search_insert(), bt_rootdescend(), invariant_g_offset(), invariant_l_nontarget_offset(), invariant_l_offset(), and invariant_leq_offset().

648 {
649  TupleDesc itupdesc = RelationGetDescr(rel);
650  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
651  IndexTuple itup;
652  ItemPointer heapTid;
653  ScanKey scankey;
654  int ncmpkey;
655  int ntupatts;
656  int32 result;
657 
658  Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
659  Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
660  Assert(key->heapkeyspace || key->scantid == NULL);
661 
662  /*
663  * Force result ">" if target item is first data item on an internal page
664  * --- see NOTE above.
665  */
666  if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
667  return 1;
668 
669  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
670  ntupatts = BTreeTupleGetNAtts(itup, rel);
671 
672  /*
673  * The scan key is set up with the attribute number associated with each
674  * term in the key. It is important that, if the index is multi-key, the
675  * scan contain the first k key attributes, and that they be in order. If
676  * you think about how multi-key ordering works, you'll understand why
677  * this is.
678  *
679  * We don't test for violation of this condition here, however. The
680  * initial setup for the index scan had better have gotten it right (see
681  * _bt_first).
682  */
683 
684  ncmpkey = Min(ntupatts, key->keysz);
685  Assert(key->heapkeyspace || ncmpkey == key->keysz);
686  Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
687  scankey = key->scankeys;
688  for (int i = 1; i <= ncmpkey; i++)
689  {
690  Datum datum;
691  bool isNull;
692 
693  datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
694 
695  if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
696  {
697  if (isNull)
698  result = 0; /* NULL "=" NULL */
699  else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
700  result = -1; /* NULL "<" NOT_NULL */
701  else
702  result = 1; /* NULL ">" NOT_NULL */
703  }
704  else if (isNull) /* key is NOT_NULL and item is NULL */
705  {
706  if (scankey->sk_flags & SK_BT_NULLS_FIRST)
707  result = 1; /* NOT_NULL ">" NULL */
708  else
709  result = -1; /* NOT_NULL "<" NULL */
710  }
711  else
712  {
713  /*
714  * The sk_func needs to be passed the index value as left arg and
715  * the sk_argument as right arg (they might be of different
716  * types). Since it is convenient for callers to think of
717  * _bt_compare as comparing the scankey to the index item, we have
718  * to flip the sign of the comparison result. (Unless it's a DESC
719  * column, in which case we *don't* flip the sign.)
720  */
721  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
722  scankey->sk_collation,
723  datum,
724  scankey->sk_argument));
725 
726  if (!(scankey->sk_flags & SK_BT_DESC))
727  INVERT_COMPARE_RESULT(result);
728  }
729 
730  /* if the keys are unequal, return the difference */
731  if (result != 0)
732  return result;
733 
734  scankey++;
735  }
736 
737  /*
738  * All non-truncated attributes (other than heap TID) were found to be
739  * equal. Treat truncated attributes as minus infinity when scankey has a
740  * key attribute value that would otherwise be compared directly.
741  *
742  * Note: it doesn't matter if ntupatts includes non-key attributes;
743  * scankey won't, so explicitly excluding non-key attributes isn't
744  * necessary.
745  */
746  if (key->keysz > ntupatts)
747  return 1;
748 
749  /*
750  * Use the heap TID attribute and scantid to try to break the tie. The
751  * rules are the same as any other key attribute -- only the
752  * representation differs.
753  */
754  heapTid = BTreeTupleGetHeapTID(itup);
755  if (key->scantid == NULL)
756  {
757  /*
758  * Most searches have a scankey that is considered greater than a
759  * truncated pivot tuple if and when the scankey has equal values for
760  * attributes up to and including the least significant untruncated
761  * attribute in tuple.
762  *
763  * For example, if an index has the minimum two attributes (single
764  * user key attribute, plus heap TID attribute), and a page's high key
765  * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
766  * will not descend to the page to the left. The search will descend
767  * right instead. The truncated attribute in pivot tuple means that
768  * all non-pivot tuples on the page to the left are strictly < 'foo',
769  * so it isn't necessary to descend left. In other words, search
770  * doesn't have to descend left because it isn't interested in a match
771  * that has a heap TID value of -inf.
772  *
773  * However, some searches (pivotsearch searches) actually require that
774  * we descend left when this happens. -inf is treated as a possible
775  * match for omitted scankey attribute(s). This is needed by page
776  * deletion, which must re-find leaf pages that are targets for
777  * deletion using their high keys.
778  *
779  * Note: the heap TID part of the test ensures that scankey is being
780  * compared to a pivot tuple with one or more truncated key
781  * attributes.
782  *
783  * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
784  * left here, since they have no heap TID attribute (and cannot have
785  * any -inf key values in any case, since truncation can only remove
786  * non-key attributes). !heapkeyspace searches must always be
787  * prepared to deal with matches on both sides of the pivot once the
788  * leaf level is reached.
789  */
790  if (key->heapkeyspace && !key->pivotsearch &&
791  key->keysz == ntupatts && heapTid == NULL)
792  return 1;
793 
794  /* All provided scankey arguments found to be equal */
795  return 0;
796  }
797 
798  /*
799  * Treat truncated heap TID as minus infinity, since scankey has a key
800  * attribute value (scantid) that would otherwise be compared directly
801  */
802  Assert(key->keysz == ntupatts);
803  if (heapTid == NULL)
804  return 1;
805 
806  /*
807  * Scankey must be treated as equal to a posting list tuple if its scantid
808  * value falls within the range of the posting list. In all other cases
809  * there can only be a single heap TID value, which is compared directly
810  * with scantid.
811  */
812  Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
813  result = ItemPointerCompare(key->scantid, heapTid);
814  if (result <= 0 || !BTreeTupleIsPosting(itup))
815  return result;
816  else
817  {
818  result = ItemPointerCompare(key->scantid,
819  BTreeTupleGetMaxHeapTID(itup));
820  if (result > 0)
821  return 1;
822  }
823 
824  return 0;
825 }
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:52
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:506
#define DatumGetInt32(X)
Definition: postgres.h:472
#define RelationGetDescr(relation)
Definition: rel.h:482
ItemPointer scantid
Definition: nbtree.h:664
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:244
#define Min(x, y)
Definition: c.h:927
bool allequalimage
Definition: nbtree.h:660
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1152
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:445
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
signed int int32
Definition: c.h:362
IndexTupleData * IndexTuple
Definition: itup.h:53
FmgrInfo sk_func
Definition: skey.h:71
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:475
#define SK_ISNULL
Definition: skey.h:115
bool pivotsearch
Definition: nbtree.h:663
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:957
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:532
uintptr_t Datum
Definition: postgres.h:367
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:360
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:2465
int sk_flags
Definition: skey.h:66
#define Assert(condition)
Definition: c.h:745
#define SK_BT_DESC
Definition: nbtree.h:956
bool heapkeyspace
Definition: nbtree.h:659
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define index_getattr(tup, attnum, tupleDesc, isnull)
Definition: itup.h:100
ScanKeyData scankeys[INDEX_MAX_KEYS]
Definition: nbtree.h:666
Oid sk_collation
Definition: skey.h:70
int i
Datum sk_argument
Definition: skey.h:72
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1066
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
AttrNumber sk_attno
Definition: skey.h:67
#define P_ISLEAF(opaque)
Definition: nbtree.h:214

◆ _bt_conditionallockbuf()

bool _bt_conditionallockbuf ( Relation  rel,
Buffer  buf 
)

Definition at line 1029 of file nbtpage.c.

References BufferGetPage, ConditionalLockBuffer(), RelationUsesLocalBuffers, and VALGRIND_MAKE_MEM_DEFINED.

Referenced by _bt_getbuf(), and _bt_search_insert().

1030 {
1031  /* ConditionalLockBuffer() asserts that pin is held by this backend */
1032  if (!ConditionalLockBuffer(buf))
1033  return false;
1034 
1035  if (!RelationUsesLocalBuffers(rel))
1037 
1038  return true;
1039 }
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26
static char * buf
Definition: pg_test_fsync.c:68
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:3776
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:572

◆ _bt_dedup_finish_pending()

Size _bt_dedup_finish_pending ( Page  newpage,
BTDedupState  state 
)

Definition at line 446 of file nbtdedup.c.

References _bt_form_posting(), Assert, BTDedupStateData::base, BTDedupInterval::baseoff, BTDedupStateData::baseoff, elog, ERROR, BTDedupStateData::htids, IndexTupleSize, BTDedupStateData::intervals, InvalidOffsetNumber, MAXALIGN, BTDedupStateData::nhtids, BTDedupStateData::nintervals, BTDedupInterval::nitems, BTDedupStateData::nitems, OffsetNumberNext, PageAddItem, PageGetMaxOffsetNumber, pfree(), and BTDedupStateData::phystupsize.

Referenced by _bt_dedup_one_page(), and btree_xlog_dedup().

447 {
448  OffsetNumber tupoff;
449  Size tuplesz;
450  Size spacesaving;
451 
452  Assert(state->nitems > 0);
453  Assert(state->nitems <= state->nhtids);
454  Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
455 
456  tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
457  if (state->nitems == 1)
458  {
459  /* Use original, unchanged base tuple */
460  tuplesz = IndexTupleSize(state->base);
461  if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
462  false, false) == InvalidOffsetNumber)
463  elog(ERROR, "deduplication failed to add tuple to page");
464 
465  spacesaving = 0;
466  }
467  else
468  {
469  IndexTuple final;
470 
471  /* Form a tuple with a posting list */
472  final = _bt_form_posting(state->base, state->htids, state->nhtids);
473  tuplesz = IndexTupleSize(final);
474  Assert(tuplesz <= state->maxpostingsize);
475 
476  /* Save final number of items for posting list */
477  state->intervals[state->nintervals].nitems = state->nitems;
478 
479  Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
480  if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
481  false) == InvalidOffsetNumber)
482  elog(ERROR, "deduplication failed to add tuple to page");
483 
484  pfree(final);
485  spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
486  /* Increment nintervals, since we wrote a new posting list tuple */
487  state->nintervals++;
488  Assert(spacesaving > 0 && spacesaving < BLCKSZ);
489  }
490 
491  /* Reset state for next pending posting list */
492  state->nhtids = 0;
493  state->nitems = 0;
494  state->phystupsize = 0;
495 
496  return spacesaving;
497 }
IndexTuple base
Definition: nbtree.h:746
IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
Definition: nbtdedup.c:616
OffsetNumber baseoff
Definition: nbtree.h:747
Pointer Item
Definition: item.h:17
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:410
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
uint16 OffsetNumber
Definition: off.h:24
ItemPointer htids
Definition: nbtree.h:751
void pfree(void *pointer)
Definition: mcxt.c:1057
Size phystupsize
Definition: nbtree.h:754
#define ERROR
Definition: elog.h:43
BTDedupInterval intervals[MaxIndexTuplesPerPage]
Definition: nbtree.h:763
struct ItemIdData ItemIdData
#define InvalidOffsetNumber
Definition: off.h:26
uint16 nitems
Definition: nbtree.h:717
#define Assert(condition)
Definition: c.h:745
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
size_t Size
Definition: c.h:473
#define MAXALIGN(LEN)
Definition: c.h:698
#define elog(elevel,...)
Definition: elog.h:214
OffsetNumber baseoff
Definition: nbtree.h:716
#define IndexTupleSize(itup)
Definition: itup.h:71

◆ _bt_dedup_one_page()

void _bt_dedup_one_page ( Relation  rel,
Buffer  buf,
Relation  heapRel,
IndexTuple  newitem,
Size  newitemsz,
bool  checkingunique 
)

Definition at line 56 of file nbtdedup.c.

References _bt_dedup_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_delitems_delete(), _bt_do_singleval(), _bt_keep_natts_fast(), _bt_singleval_fillfactor(), Assert, BTDedupStateData::base, BTDedupStateData::baseoff, BTDedupStateData::basetupsize, BTMaxItemSize, BTP_HAS_GARBAGE, BTPageOpaqueData::btpo_flags, BufferGetPage, BTDedupStateData::deduplicate, elog, END_CRIT_SECTION, ERROR, BTDedupStateData::htids, INDEX_SIZE_MASK, IndexRelationGetNumberOfKeyAttributes, BTDedupStateData::intervals, InvalidOffsetNumber, ItemIdGetLength, ItemIdIsDead, MarkBufferDirty(), MaxIndexTuplesPerPage, BTDedupStateData::maxpostingsize, Min, BTDedupStateData::nhtids, xl_btree_dedup::nintervals, BTDedupStateData::nintervals, BTDedupStateData::nitems, BTDedupStateData::nmaxitems, OffsetNumberNext, P_FIRSTDATAKEY, P_HAS_GARBAGE, P_HIKEY, P_RIGHTMOST, PageAddItem, PageGetExactFreeSpace(), PageGetFreeSpace(), PageGetItem, PageGetItemId, PageGetLSN, PageGetMaxOffsetNumber, PageGetSpecialPointer, PageGetTempPageCopySpecial(), PageRestoreTempPage(), PageSetLSN, palloc(), pfree(), BTDedupStateData::phystupsize, REGBUF_STANDARD, RelationNeedsWAL, SizeOfBtreeDedup, START_CRIT_SECTION, XLOG_BTREE_DEDUP, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_findinsertloc().

58 {
59  OffsetNumber offnum,
60  minoff,
61  maxoff;
62  Page page = BufferGetPage(buf);
63  BTPageOpaque opaque;
64  Page newpage;
67  int ndeletable = 0;
68  Size pagesaving = 0;
69  bool singlevalstrat = false;
70  int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
71 
72  /*
73  * We can't assume that there are no LP_DEAD items. For one thing, VACUUM
74  * will clear the BTP_HAS_GARBAGE hint without reliably removing items
75  * that are marked LP_DEAD. We don't want to unnecessarily unset LP_DEAD
76  * bits when deduplicating items. Allowing it would be correct, though
77  * wasteful.
78  */
79  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
80  minoff = P_FIRSTDATAKEY(opaque);
81  maxoff = PageGetMaxOffsetNumber(page);
82  for (offnum = minoff;
83  offnum <= maxoff;
84  offnum = OffsetNumberNext(offnum))
85  {
86  ItemId itemid = PageGetItemId(page, offnum);
87 
88  if (ItemIdIsDead(itemid))
89  deletable[ndeletable++] = offnum;
90  }
91 
92  if (ndeletable > 0)
93  {
94  _bt_delitems_delete(rel, buf, deletable, ndeletable, heapRel);
95 
96  /*
97  * Return when a split will be avoided. This is equivalent to
98  * avoiding a split using the usual _bt_vacuum_one_page() path.
99  */
100  if (PageGetFreeSpace(page) >= newitemsz)
101  return;
102 
103  /*
104  * Reconsider number of items on page, in case _bt_delitems_delete()
105  * managed to delete an item or two
106  */
107  minoff = P_FIRSTDATAKEY(opaque);
108  maxoff = PageGetMaxOffsetNumber(page);
109  }
110 
111  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
112  newitemsz += sizeof(ItemIdData);
113 
114  /*
115  * By here, it's clear that deduplication will definitely be attempted.
116  * Initialize deduplication state.
117  *
118  * It would be possible for maxpostingsize (limit on posting list tuple
119  * size) to be set to one third of the page. However, it seems like a
120  * good idea to limit the size of posting lists to one sixth of a page.
121  * That ought to leave us with a good split point when pages full of
122  * duplicates can be split several times.
123  */
124  state = (BTDedupState) palloc(sizeof(BTDedupStateData));
125  state->deduplicate = true;
126  state->nmaxitems = 0;
127  state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
128  /* Metadata about base tuple of current pending posting list */
129  state->base = NULL;
130  state->baseoff = InvalidOffsetNumber;
131  state->basetupsize = 0;
132  /* Metadata about current pending posting list TIDs */
133  state->htids = palloc(state->maxpostingsize);
134  state->nhtids = 0;
135  state->nitems = 0;
136  /* Size of all physical tuples to be replaced by pending posting list */
137  state->phystupsize = 0;
138  /* nintervals should be initialized to zero */
139  state->nintervals = 0;
140 
141  /* Determine if "single value" strategy should be used */
142  if (!checkingunique)
143  singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
144 
145  /*
146  * Deduplicate items from page, and write them to newpage.
147  *
148  * Copy the original page's LSN into newpage copy. This will become the
149  * updated version of the page. We need this because XLogInsert will
150  * examine the LSN and possibly dump it in a page image.
151  */
152  newpage = PageGetTempPageCopySpecial(page);
153  PageSetLSN(newpage, PageGetLSN(page));
154 
155  /* Copy high key, if any */
156  if (!P_RIGHTMOST(opaque))
157  {
158  ItemId hitemid = PageGetItemId(page, P_HIKEY);
159  Size hitemsz = ItemIdGetLength(hitemid);
160  IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid);
161 
162  if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
163  false, false) == InvalidOffsetNumber)
164  elog(ERROR, "deduplication failed to add highkey");
165  }
166 
167  for (offnum = minoff;
168  offnum <= maxoff;
169  offnum = OffsetNumberNext(offnum))
170  {
171  ItemId itemid = PageGetItemId(page, offnum);
172  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
173 
174  Assert(!ItemIdIsDead(itemid));
175 
176  if (offnum == minoff)
177  {
178  /*
179  * No previous/base tuple for the data item -- use the data item
180  * as base tuple of pending posting list
181  */
182  _bt_dedup_start_pending(state, itup, offnum);
183  }
184  else if (state->deduplicate &&
185  _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
186  _bt_dedup_save_htid(state, itup))
187  {
188  /*
189  * Tuple is equal to base tuple of pending posting list. Heap
190  * TID(s) for itup have been saved in state.
191  */
192  }
193  else
194  {
195  /*
196  * Tuple is not equal to pending posting list tuple, or
197  * _bt_dedup_save_htid() opted to not merge current item into
198  * pending posting list for some other reason (e.g., adding more
199  * TIDs would have caused posting list to exceed current
200  * maxpostingsize).
201  *
202  * If state contains pending posting list with more than one item,
203  * form new posting tuple, and actually update the page. Else
204  * reset the state and move on without modifying the page.
205  */
206  pagesaving += _bt_dedup_finish_pending(newpage, state);
207 
208  if (singlevalstrat)
209  {
210  /*
211  * Single value strategy's extra steps.
212  *
213  * Lower maxpostingsize for sixth and final large posting list
214  * tuple at the point where 5 maxpostingsize-capped tuples
215  * have either been formed or observed.
216  *
217  * When a sixth maxpostingsize-capped item is formed/observed,
218  * stop merging together tuples altogether. The few tuples
219  * that remain at the end of the page won't be merged together
220  * at all (at least not until after a future page split takes
221  * place).
222  */
223  if (state->nmaxitems == 5)
224  _bt_singleval_fillfactor(page, state, newitemsz);
225  else if (state->nmaxitems == 6)
226  {
227  state->deduplicate = false;
228  singlevalstrat = false; /* won't be back here */
229  }
230  }
231 
232  /* itup starts new pending posting list */
233  _bt_dedup_start_pending(state, itup, offnum);
234  }
235  }
236 
237  /* Handle the last item */
238  pagesaving += _bt_dedup_finish_pending(newpage, state);
239 
240  /*
241  * If no items suitable for deduplication were found, newpage must be
242  * exactly the same as the original page, so just return from function.
243  *
244  * We could determine whether or not to proceed on the basis the space
245  * savings being sufficient to avoid an immediate page split instead. We
246  * don't do that because there is some small value in nbtsplitloc.c always
247  * operating against a page that is fully deduplicated (apart from
248  * newitem). Besides, most of the cost has already been paid.
249  */
250  if (state->nintervals == 0)
251  {
252  /* cannot leak memory here */
253  pfree(newpage);
254  pfree(state->htids);
255  pfree(state);
256  return;
257  }
258 
259  /*
260  * By here, it's clear that deduplication will definitely go ahead.
261  *
262  * Clear the BTP_HAS_GARBAGE page flag in the unlikely event that it is
263  * still falsely set, just to keep things tidy. (We can't rely on
264  * _bt_vacuum_one_page() having done this already, and we can't rely on a
265  * page split or VACUUM getting to it in the near future.)
266  */
267  if (P_HAS_GARBAGE(opaque))
268  {
269  BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(newpage);
270 
271  nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
272  }
273 
275 
276  PageRestoreTempPage(newpage, page);
278 
279  /* XLOG stuff */
280  if (RelationNeedsWAL(rel))
281  {
282  XLogRecPtr recptr;
283  xl_btree_dedup xlrec_dedup;
284 
285  xlrec_dedup.nintervals = state->nintervals;
286 
287  XLogBeginInsert();
289  XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);
290 
291  /*
292  * The intervals array is not in the buffer, but pretend that it is.
293  * When XLogInsert stores the whole buffer, the array need not be
294  * stored too.
295  */
296  XLogRegisterBufData(0, (char *) state->intervals,
297  state->nintervals * sizeof(BTDedupInterval));
298 
299  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
300 
301  PageSetLSN(page, recptr);
302  }
303 
305 
306  /* Local space accounting should agree with page accounting */
307  Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
308 
309  /* cannot leak memory here */
310  pfree(state->htids);
311  pfree(state);
312 }
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:368
IndexTuple base
Definition: nbtree.h:746
void PageRestoreTempPage(Page tempPage, Page oldPage)
Definition: bufpage.c:403
uint16 nintervals
Definition: nbtxlog.h:172
OffsetNumber baseoff
Definition: nbtree.h:747
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1469
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:220
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:244
#define Min(x, y)
Definition: c.h:927
#define END_CRIT_SECTION()
Definition: miscadmin.h:134
Pointer Item
Definition: item.h:17
#define INDEX_SIZE_MASK
Definition: itup.h:65
#define P_HAS_GARBAGE(opaque)
Definition: nbtree.h:220
#define START_CRIT_SECTION()
Definition: miscadmin.h:132
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:410
void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, OffsetNumber baseoff)
Definition: nbtdedup.c:324
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
Size _bt_dedup_finish_pending(Page newpage, BTDedupState state)
Definition: nbtdedup.c:446
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
Size PageGetFreeSpace(Page page)
Definition: bufpage.c:782
uint16 OffsetNumber
Definition: off.h:24
Page PageGetTempPageCopySpecial(Page page)
Definition: bufpage.c:381
ItemPointer htids
Definition: nbtree.h:751
void pfree(void *pointer)
Definition: mcxt.c:1057
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
Size phystupsize
Definition: nbtree.h:754
#define ERROR
Definition: elog.h:43
BTDedupInterval intervals[MaxIndexTuplesPerPage]
Definition: nbtree.h:763
static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state, OffsetNumber minoff, IndexTuple newitem)
Definition: nbtdedup.c:534
void _bt_delitems_delete(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, Relation heapRel)
Definition: nbtpage.c:1290
static char * buf
Definition: pg_test_fsync.c:68
bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
Definition: nbtdedup.c:375
IndexTupleData * IndexTuple
Definition: itup.h:53
#define REGBUF_STANDARD
Definition: xloginsert.h:35
struct ItemIdData ItemIdData
#define SizeOfBtreeDedup
Definition: nbtxlog.h:177
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:475
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define XLOG_BTREE_DEDUP
Definition: nbtxlog.h:32
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:330
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:422
#define InvalidOffsetNumber
Definition: off.h:26
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:745
Definition: regguts.h:298
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
size_t Size
Definition: c.h:473
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
Size basetupsize
Definition: nbtree.h:748
#define RelationNeedsWAL(relation)
Definition: rel.h:562
Size maxpostingsize
Definition: nbtree.h:743
#define PageGetLSN(page)
Definition: bufpage.h:366
Size PageGetExactFreeSpace(Page page)
Definition: bufpage.c:833
#define BTMaxItemSize(page)
Definition: nbtree.h:157
#define P_HIKEY
Definition: nbtree.h:242
int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
Definition: nbtutils.c:2418
#define MaxIndexTuplesPerPage
Definition: itup.h:145
void * palloc(Size size)
Definition: mcxt.c:950
bool deduplicate
Definition: nbtree.h:741
#define elog(elevel,...)
Definition: elog.h:214
BTDedupStateData * BTDedupState
Definition: nbtree.h:766
void XLogBeginInsert(void)
Definition: xloginsert.c:123
uint16 btpo_flags
Definition: nbtree.h:65
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:213
#define BTP_HAS_GARBAGE
Definition: nbtree.h:78
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78
static void _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
Definition: nbtdedup.c:574

◆ _bt_dedup_save_htid()

bool _bt_dedup_save_htid ( BTDedupState  state,
IndexTuple  itup 
)

Definition at line 375 of file nbtdedup.c.

References Assert, BTDedupStateData::basetupsize, BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTDedupStateData::htids, IndexTupleSize, MAXALIGN, BTDedupStateData::maxpostingsize, BTDedupStateData::nhtids, BTDedupStateData::nitems, BTDedupStateData::nmaxitems, BTDedupStateData::phystupsize, and IndexTupleData::t_tid.

Referenced by _bt_dedup_one_page(), _bt_load(), and btree_xlog_dedup().

376 {
377  int nhtids;
378  ItemPointer htids;
379  Size mergedtupsz;
380 
381  Assert(!BTreeTupleIsPivot(itup));
382 
383  if (!BTreeTupleIsPosting(itup))
384  {
385  nhtids = 1;
386  htids = &itup->t_tid;
387  }
388  else
389  {
390  nhtids = BTreeTupleGetNPosting(itup);
391  htids = BTreeTupleGetPosting(itup);
392  }
393 
394  /*
395  * Don't append (have caller finish pending posting list as-is) if
396  * appending heap TID(s) from itup would put us over maxpostingsize limit.
397  *
398  * This calculation needs to match the code used within _bt_form_posting()
399  * for new posting list tuples.
400  */
401  mergedtupsz = MAXALIGN(state->basetupsize +
402  (state->nhtids + nhtids) * sizeof(ItemPointerData));
403 
404  if (mergedtupsz > state->maxpostingsize)
405  {
406  /*
407  * Count this as an oversized item for single value strategy, though
408  * only when there are 50 TIDs in the final posting list tuple. This
409  * limit (which is fairly arbitrary) avoids confusion about how many
410  * 1/6 of a page tuples have been encountered/created by the current
411  * deduplication pass.
412  *
413  * Note: We deliberately don't consider which deduplication pass
414  * merged together tuples to create this item (could be a previous
415  * deduplication pass, or current pass). See _bt_do_singleval()
416  * comments.
417  */
418  if (state->nhtids > 50)
419  state->nmaxitems++;
420 
421  return false;
422  }
423 
424  /*
425  * Save heap TIDs to pending posting list tuple -- itup can be merged into
426  * pending posting list
427  */
428  state->nitems++;
429  memcpy(state->htids + state->nhtids, htids,
430  sizeof(ItemPointerData) * nhtids);
431  state->nhtids += nhtids;
432  state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
433 
434  return true;
435 }
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:348
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition: nbtree.h:405
ItemPointerData t_tid
Definition: itup.h:37
ItemPointer htids
Definition: nbtree.h:751
Size phystupsize
Definition: nbtree.h:754
struct ItemIdData ItemIdData
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:360
#define Assert(condition)
Definition: c.h:745
size_t Size
Definition: c.h:473
#define MAXALIGN(LEN)
Definition: c.h:698
Size basetupsize
Definition: nbtree.h:748
Size maxpostingsize
Definition: nbtree.h:743
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:386
#define IndexTupleSize(itup)
Definition: itup.h:71

◆ _bt_dedup_start_pending()

void _bt_dedup_start_pending ( BTDedupState  state,
IndexTuple  base,
OffsetNumber  baseoff 
)

Definition at line 324 of file nbtdedup.c.

References Assert, BTDedupStateData::base, BTDedupInterval::baseoff, BTDedupStateData::baseoff, BTDedupStateData::basetupsize, BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTDedupStateData::htids, IndexTupleSize, BTDedupStateData::intervals, MAXALIGN, BTDedupStateData::nhtids, BTDedupStateData::nintervals, BTDedupStateData::nitems, BTDedupStateData::phystupsize, and IndexTupleData::t_tid.

Referenced by _bt_dedup_one_page(), _bt_load(), and btree_xlog_dedup().

326 {
327  Assert(state->nhtids == 0);
328  Assert(state->nitems == 0);
329  Assert(!BTreeTupleIsPivot(base));
330 
331  /*
332  * Copy heap TID(s) from new base tuple for new candidate posting list
333  * into working state's array
334  */
335  if (!BTreeTupleIsPosting(base))
336  {
337  memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
338  state->nhtids = 1;
339  state->basetupsize = IndexTupleSize(base);
340  }
341  else
342  {
343  int nposting;
344 
345  nposting = BTreeTupleGetNPosting(base);
346  memcpy(state->htids, BTreeTupleGetPosting(base),
347  sizeof(ItemPointerData) * nposting);
348  state->nhtids = nposting;
349  /* basetupsize should not include existing posting list */
351  }
352 
353  /*
354  * Save new base tuple itself -- it'll be needed if we actually create a
355  * new posting list from new pending posting list.
356  *
357  * Must maintain physical size of all existing tuples (including line
358  * pointer overhead) so that we can calculate space savings on page.
359  */
360  state->nitems = 1;
361  state->base = base;
362  state->baseoff = baseoff;
363  state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
364  /* Also save baseoff in pending state for interval */
365  state->intervals[state->nintervals].baseoff = state->baseoff;
366 }
IndexTuple base
Definition: nbtree.h:746
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:348
OffsetNumber baseoff
Definition: nbtree.h:747
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition: nbtree.h:405
ItemPointerData t_tid
Definition: itup.h:37
ItemPointer htids
Definition: nbtree.h:751
Size phystupsize
Definition: nbtree.h:754
BTDedupInterval intervals[MaxIndexTuplesPerPage]
Definition: nbtree.h:763
struct ItemIdData ItemIdData
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:360
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:397
#define Assert(condition)
Definition: c.h:745
#define MAXALIGN(LEN)
Definition: c.h:698
Size basetupsize
Definition: nbtree.h:748
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:386
OffsetNumber baseoff
Definition: nbtree.h:716
#define IndexTupleSize(itup)
Definition: itup.h:71

◆ _bt_delitems_delete()

void _bt_delitems_delete ( Relation  rel,
Buffer  buf,
OffsetNumber deletable,
int  ndeletable,
Relation  heapRel 
)

Definition at line 1290 of file nbtpage.c.

References _bt_xid_horizon(), Assert, BTP_HAS_GARBAGE, BTPageOpaqueData::btpo_flags, BufferGetPage, END_CRIT_SECTION, InvalidTransactionId, xl_btree_delete::latestRemovedXid, MarkBufferDirty(), xl_btree_delete::ndeleted, PageGetSpecialPointer, PageIndexMultiDelete(), PageSetLSN, REGBUF_STANDARD, RelationNeedsWAL, SizeOfBtreeDelete, START_CRIT_SECTION, XLOG_BTREE_DELETE, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), XLogRegisterData(), and XLogStandbyInfoActive.

Referenced by _bt_dedup_one_page(), and _bt_vacuum_one_page().

1293 {
1294  Page page = BufferGetPage(buf);
1295  BTPageOpaque opaque;
1296  TransactionId latestRemovedXid = InvalidTransactionId;
1297 
1298  /* Shouldn't be called unless there's something to do */
1299  Assert(ndeletable > 0);
1300 
1302  latestRemovedXid =
1303  _bt_xid_horizon(rel, heapRel, page, deletable, ndeletable);
1304 
1305  /* No ereport(ERROR) until changes are logged */
1307 
1308  /* Fix the page */
1309  PageIndexMultiDelete(page, deletable, ndeletable);
1310 
1311  /*
1312  * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID,
1313  * because this is not called by VACUUM. Just clear the BTP_HAS_GARBAGE
1314  * page flag, since we deleted all items with their LP_DEAD bit set.
1315  */
1316  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1317  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1318 
1320 
1321  /* XLOG stuff */
1322  if (RelationNeedsWAL(rel))
1323  {
1324  XLogRecPtr recptr;
1325  xl_btree_delete xlrec_delete;
1326 
1327  xlrec_delete.latestRemovedXid = latestRemovedXid;
1328  xlrec_delete.ndeleted = ndeletable;
1329 
1330  XLogBeginInsert();
1332  XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);
1333 
1334  /*
1335  * The deletable array is not in the buffer, but pretend that it is.
1336  * When XLogInsert stores the whole buffer, the array need not be
1337  * stored too.
1338  */
1339  XLogRegisterBufData(0, (char *) deletable,
1340  ndeletable * sizeof(OffsetNumber));
1341 
1342  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
1343 
1344  PageSetLSN(page, recptr);
1345  }
1346 
1347  END_CRIT_SECTION();
1348 }
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:368
TransactionId latestRemovedXid
Definition: nbtxlog.h:189
uint32 TransactionId
Definition: c.h:520
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1469
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:220
#define END_CRIT_SECTION()
Definition: miscadmin.h:134
#define START_CRIT_SECTION()
Definition: miscadmin.h:132
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
uint32 ndeleted
Definition: nbtxlog.h:190
uint16 OffsetNumber
Definition: off.h:24
static char * buf
Definition: pg_test_fsync.c:68
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define InvalidTransactionId
Definition: transam.h:31
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
static TransactionId _bt_xid_horizon(Relation rel, Relation heapRel, Page page, OffsetNumber *deletable, int ndeletable)
Definition: nbtpage.c:1359
#define XLOG_BTREE_DELETE
Definition: nbtxlog.h:33
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:330
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:422
#define XLogStandbyInfoActive()
Definition: xlog.h:205
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:745
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:1036
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define RelationNeedsWAL(relation)
Definition: rel.h:562
#define SizeOfBtreeDelete
Definition: nbtxlog.h:195
void XLogBeginInsert(void)
Definition: xloginsert.c:123
uint16 btpo_flags
Definition: nbtree.h:65
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
#define BTP_HAS_GARBAGE
Definition: nbtree.h:78
Pointer Page
Definition: bufpage.h:78

◆ _bt_delitems_vacuum()

void _bt_delitems_vacuum ( Relation  rel,
Buffer  buf,
OffsetNumber deletable,
int  ndeletable,
BTVacuumPosting updatable,
int  nupdatable 
)

Definition at line 1124 of file nbtpage.c.

References _bt_update_posting(), Assert, BTP_HAS_GARBAGE, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BufferGetBlockNumber(), BufferGetPage, BTVacuumPostingData::deletetids, elog, END_CRIT_SECTION, i, IndexTupleSize, BTVacuumPostingData::itup, MarkBufferDirty(), MAXALIGN, MaxIndexTuplesPerPage, xl_btree_vacuum::ndeleted, xl_btree_update::ndeletedtids, BTVacuumPostingData::ndeletedtids, xl_btree_vacuum::nupdated, PageGetSpecialPointer, PageIndexMultiDelete(), PageIndexTupleOverwrite(), PageSetLSN, palloc(), PANIC, pfree(), REGBUF_STANDARD, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeUpdate, SizeOfBtreeVacuum, START_CRIT_SECTION, BTVacuumPostingData::updatedoffset, XLOG_BTREE_VACUUM, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by btvacuumpage().

/*
 * _bt_delitems_vacuum() -- physically remove deletable index tuples and
 * overwrite updated posting-list tuples on a leaf page during VACUUM.
 * Clears the page's vacuum cycle ID and the BTP_HAS_GARBAGE hint, and
 * WAL-logs a single XLOG_BTREE_VACUUM record covering both kinds of change.
 *
 * NOTE(review): this is a Doxygen listing; several source lines whose text
 * was entirely a hyperlink were dropped during extraction. Each such gap is
 * flagged below — confirm against nbtpage.c before treating this listing as
 * authoritative.
 */
1127 {
1128  Page page = BufferGetPage(buf);
1129  BTPageOpaque opaque;
1130  Size itemsz;
1131  char *updatedbuf = NULL;
1132  Size updatedbuflen = 0;
1133  OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1134 
1135  /* Shouldn't be called unless there's something to do */
1136  Assert(ndeletable > 0 || nupdatable > 0);
1137 
1138  for (int i = 0; i < nupdatable; i++)
1139  {
1140  /* Replace work area IndexTuple with updated version */
1141  _bt_update_posting(updatable[i]);
1142 
1143  /* Maintain array of updatable page offsets for WAL record */
1144  updatedoffsets[i] = updatable[i]->updatedoffset;
1145  }
1146 
1147  /* XLOG stuff -- allocate and fill buffer before critical section */
1148  if (nupdatable > 0 && RelationNeedsWAL(rel))
1149  {
1150  Size offset = 0;
1151 
1152  for (int i = 0; i < nupdatable; i++)
1153  {
1154  BTVacuumPosting vacposting = updatable[i];
1155 
1156  itemsz = SizeOfBtreeUpdate +
1157  vacposting->ndeletedtids * sizeof(uint16);
1158  updatedbuflen += itemsz;
1159  }
1160 
1161  updatedbuf = palloc(updatedbuflen);
1162  for (int i = 0; i < nupdatable; i++)
1163  {
1164  BTVacuumPosting vacposting = updatable[i];
1165  xl_btree_update update;
1166 
1167  update.ndeletedtids = vacposting->ndeletedtids;
1168  memcpy(updatedbuf + offset, &update.ndeletedtids,
/* NOTE(review): line 1169 missing — presumably the memcpy length argument
 * "SizeOfBtreeUpdate);" was dropped by the doc extractor (the symbol is
 * hyperlinked elsewhere in this listing). TODO confirm against nbtpage.c. */
1170  offset += SizeOfBtreeUpdate;
1171 
1172  itemsz = update.ndeletedtids * sizeof(uint16);
1173  memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
1174  offset += itemsz;
1175  }
1176  }
1177 
1178  /* No ereport(ERROR) until changes are logged */
/* NOTE(review): content missing at line 1180 — presumably
 * "START_CRIT_SECTION();" (listed in this function's References) was
 * dropped because the whole line was a hyperlink. */
1180 
1181  /*
1182  * Handle posting tuple updates.
1183  *
1184  * Deliberately do this before handling simple deletes. If we did it the
1185  * other way around (i.e. WAL record order -- simple deletes before
1186  * updates) then we'd have to make compensating changes to the 'updatable'
1187  * array of offset numbers.
1188  *
1189  * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
1190  * happens to already be set. Although we unset the BTP_HAS_GARBAGE page
1191  * level flag, unsetting individual LP_DEAD bits should still be avoided.
1192  */
1193  for (int i = 0; i < nupdatable; i++)
1194  {
1195  OffsetNumber updatedoffset = updatedoffsets[i];
1196  IndexTuple itup;
1197 
1198  itup = updatable[i]->itup;
1199  itemsz = MAXALIGN(IndexTupleSize(itup));
1200  if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
1201  itemsz))
1202  elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
/* NOTE(review): line 1203 missing — presumably the elog arguments
 * "BufferGetBlockNumber(buf), RelationGetRelationName(rel));" (both symbols
 * appear in this function's References list). TODO confirm. */
1204  }
1205 
1206  /* Now handle simple deletes of entire tuples */
1207  if (ndeletable > 0)
1208  PageIndexMultiDelete(page, deletable, ndeletable);
1209 
1210  /*
1211  * We can clear the vacuum cycle ID since this page has certainly been
1212  * processed by the current vacuum scan.
1213  */
1214  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1215  opaque->btpo_cycleid = 0;
1216 
1217  /*
1218  * Mark the page as not containing any LP_DEAD items. This is not
1219  * certainly true (there might be some that have recently been marked, but
1220  * weren't targeted by VACUUM's heap scan), but it will be true often
1221  * enough. VACUUM does not delete items purely because they have their
1222  * LP_DEAD bit set, since doing so would necessitate explicitly logging a
1223  * latestRemovedXid cutoff (this is how _bt_delitems_delete works).
1224  *
1225  * The consequences of falsely unsetting BTP_HAS_GARBAGE should be fairly
1226  * limited, since we never falsely unset an LP_DEAD bit. Workloads that
1227  * are particularly dependent on LP_DEAD bits being set quickly will
1228  * usually manage to set the BTP_HAS_GARBAGE flag before the page fills up
1229  * again anyway. Furthermore, attempting a deduplication pass will remove
1230  * all LP_DEAD items, regardless of whether the BTP_HAS_GARBAGE hint bit
1231  * is set or not.
1232  */
1233  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1234 
/* NOTE(review): content missing around line 1235/1236 — presumably
 * "MarkBufferDirty(buf);" (MarkBufferDirty() is in the References list and
 * the page was just modified). TODO confirm. */
1236 
1237  /* XLOG stuff */
1238  if (RelationNeedsWAL(rel))
1239  {
1240  XLogRecPtr recptr;
1241  xl_btree_vacuum xlrec_vacuum;
1242 
1243  xlrec_vacuum.ndeleted = ndeletable;
1244  xlrec_vacuum.nupdated = nupdatable;
1245 
1246  XLogBeginInsert();
/* NOTE(review): line 1247 missing — presumably
 * "XLogRegisterBuffer(0, buf, REGBUF_STANDARD);" (both symbols are in the
 * References list, and block 0 buffer data is registered below). Confirm. */
1248  XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
1249 
1250  if (ndeletable > 0)
1251  XLogRegisterBufData(0, (char *) deletable,
1252  ndeletable * sizeof(OffsetNumber));
1253 
1254  if (nupdatable > 0)
1255  {
1256  XLogRegisterBufData(0, (char *) updatedoffsets,
1257  nupdatable * sizeof(OffsetNumber));
1258  XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1259  }
1260 
1261  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
1262 
1263  PageSetLSN(page, recptr);
1264  }
1265 
1266  END_CRIT_SECTION();
1267 
1268  /* can't leak memory here */
1269  if (updatedbuf != NULL)
1270  pfree(updatedbuf);
1271  /* free tuples generated by calling _bt_update_posting() */
1272  for (int i = 0; i < nupdatable; i++)
1273  pfree(updatable[i]->itup);
1274 }
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:368
uint16 ndeletedtids
Definition: nbtree.h:782
void _bt_update_posting(BTVacuumPosting vacposting)
Definition: nbtdedup.c:676
#define SizeOfBtreeUpdate
Definition: nbtxlog.h:225
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1469
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:220
OffsetNumber updatedoffset
Definition: nbtree.h:779
#define END_CRIT_SECTION()
Definition: miscadmin.h:134
IndexTuple itup
Definition: nbtree.h:778
Pointer Item
Definition: item.h:17
#define START_CRIT_SECTION()
Definition: miscadmin.h:132
uint16 nupdated
Definition: nbtxlog.h:243
#define PANIC
Definition: elog.h:53
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
uint16 OffsetNumber
Definition: off.h:24
unsigned short uint16
Definition: c.h:373
void pfree(void *pointer)
Definition: mcxt.c:1057
#define SizeOfBtreeVacuum
Definition: nbtxlog.h:250
BTCycleId btpo_cycleid
Definition: nbtree.h:66
bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, Item newtup, Size newsize)
Definition: bufpage.c:1280
static char * buf
Definition: pg_test_fsync.c:68
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define XLOG_BTREE_VACUUM
Definition: nbtxlog.h:38
#define RelationGetRelationName(relation)
Definition: rel.h:490
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
uint16 ndeleted
Definition: nbtxlog.h:242
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:330
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:422
uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.h:783
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:745
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:1036
size_t Size
Definition: c.h:473
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define MAXALIGN(LEN)
Definition: c.h:698
#define RelationNeedsWAL(relation)
Definition: rel.h:562
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2661
#define MaxIndexTuplesPerPage
Definition: itup.h:145
void * palloc(Size size)
Definition: mcxt.c:950
#define elog(elevel,...)
Definition: elog.h:214
int i
void XLogBeginInsert(void)
Definition: xloginsert.c:123
uint16 btpo_flags
Definition: nbtree.h:65
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
#define BTP_HAS_GARBAGE
Definition: nbtree.h:78
uint16 ndeletedtids
Definition: nbtxlog.h:220
Pointer Page
Definition: bufpage.h:78
#define IndexTupleSize(itup)
Definition: itup.h:71

◆ _bt_doinsert()

bool _bt_doinsert ( Relation  rel,
IndexTuple  itup,
IndexUniqueCheck  checkUnique,
Relation  heapRel 
)

Definition at line 82 of file nbtinsert.c.

References _bt_check_unique(), _bt_findinsertloc(), _bt_freestack(), _bt_insertonpg(), _bt_mkscankey(), _bt_relbuf(), _bt_search_insert(), BTScanInsertData::anynullkeys, Assert, BTInsertStateData::bounds_valid, BTInsertStateData::buf, BufferGetBlockNumber(), CheckForSerializableConflictIn(), BTScanInsertData::heapkeyspace, IndexTupleSize, InvalidBuffer, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, MAXALIGN, pfree(), BTInsertStateData::postingoff, BTScanInsertData::scantid, SpeculativeInsertionWait(), IndexTupleData::t_tid, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_NO, unlikely, XactLockTableWait(), and XLTW_InsertIndex.

Referenced by btinsert().

/*
 * _bt_doinsert() -- top-level btree tuple insertion, called from btinsert().
 * Builds an insertion scan key, finds and exclusive-locks the target leaf
 * page, optionally enforces uniqueness (waiting on and retrying after
 * conflicting in-progress transactions), then inserts via _bt_insertonpg().
 * Returns whether the tuple is known unique in the sense callers care about.
 *
 * NOTE(review): Doxygen listing; one hyperlink-only source line was dropped
 * during extraction (flagged below).
 */
84 {
85  bool is_unique = false;
86  BTInsertStateData insertstate;
87  BTScanInsert itup_key;
88  BTStack stack;
89  bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
90 
91  /* we need an insertion scan key to do our search, so build one */
92  itup_key = _bt_mkscankey(rel, itup);
93 
94  if (checkingunique)
95  {
96  if (!itup_key->anynullkeys)
97  {
98  /* No (heapkeyspace) scantid until uniqueness established */
99  itup_key->scantid = NULL;
100  }
101  else
102  {
103  /*
104  * Scan key for new tuple contains NULL key values. Bypass
105  * checkingunique steps. They are unnecessary because core code
106  * considers NULL unequal to every value, including NULL.
107  *
108  * This optimization avoids O(N^2) behavior within the
109  * _bt_findinsertloc() heapkeyspace path when a unique index has a
110  * large number of "duplicates" with NULL key values.
111  */
112  checkingunique = false;
113  /* Tuple is unique in the sense that core code cares about */
114  Assert(checkUnique != UNIQUE_CHECK_EXISTING);
115  is_unique = true;
116  }
117  }
118 
119  /*
120  * Fill in the BTInsertState working area, to track the current page and
121  * position within the page to insert on.
122  *
123  * Note that itemsz is passed down to lower level code that deals with
124  * inserting the item. It must be MAXALIGN()'d. This ensures that space
125  * accounting code consistently considers the alignment overhead that we
126  * expect PageAddItem() will add later. (Actually, index_form_tuple() is
127  * already conservative about alignment, but we don't rely on that from
128  * this distance. Besides, preserving the "true" tuple size in index
129  * tuple headers for the benefit of nbtsplitloc.c might happen someday.
130  * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
131  */
132  insertstate.itup = itup;
133  insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
134  insertstate.itup_key = itup_key;
135  insertstate.bounds_valid = false;
136  insertstate.buf = InvalidBuffer;
137  insertstate.postingoff = 0;
138 
139 search:
140 
141  /*
142  * Find and lock the leaf page that the tuple should be added to by
143  * searching from the root page. insertstate.buf will hold a buffer that
144  * is locked in exclusive mode afterwards.
145  */
146  stack = _bt_search_insert(rel, &insertstate);
147 
148  /*
149  * checkingunique inserts are not allowed to go ahead when two tuples with
150  * equal key attribute values would be visible to new MVCC snapshots once
151  * the xact commits. Check for conflicts in the locked page/buffer (if
152  * needed) here.
153  *
154  * It might be necessary to check a page to the right in _bt_check_unique,
155  * though that should be very rare. In practice the first page the value
156  * could be on (with scantid omitted) is almost always also the only page
157  * that a matching tuple might be found on. This is due to the behavior
158  * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
159  * only be allowed to cross a page boundary when there is no candidate
160  * leaf page split point that avoids it. Also, _bt_check_unique can use
161  * the leaf page high key to determine that there will be no duplicates on
162  * the right sibling without actually visiting it (it uses the high key in
163  * cases where the new item happens to belong at the far right of the leaf
164  * page).
165  *
166  * NOTE: obviously, _bt_check_unique can only detect keys that are already
167  * in the index; so it cannot defend against concurrent insertions of the
168  * same key. We protect against that by means of holding a write lock on
169  * the first page the value could be on, with omitted/-inf value for the
170  * implicit heap TID tiebreaker attribute. Any other would-be inserter of
171  * the same key must acquire a write lock on the same page, so only one
172  * would-be inserter can be making the check at one time. Furthermore,
173  * once we are past the check we hold write locks continuously until we
174  * have performed our insertion, so no later inserter can fail to see our
175  * insertion. (This requires some care in _bt_findinsertloc.)
176  *
177  * If we must wait for another xact, we release the lock while waiting,
178  * and then must perform a new search.
179  *
180  * For a partial uniqueness check, we don't wait for the other xact. Just
181  * let the tuple in and return false for possibly non-unique, or true for
182  * definitely unique.
183  */
184  if (checkingunique)
185  {
186  TransactionId xwait;
187  uint32 speculativeToken;
188 
189  xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
190  &is_unique, &speculativeToken);
191 
192  if (unlikely(TransactionIdIsValid(xwait)))
193  {
194  /* Have to wait for the other guy ... */
195  _bt_relbuf(rel, insertstate.buf);
196  insertstate.buf = InvalidBuffer;
197 
198  /*
199  * If it's a speculative insertion, wait for it to finish (ie. to
200  * go ahead with the insertion, or kill the tuple). Otherwise
201  * wait for the transaction to finish as usual.
202  */
203  if (speculativeToken)
204  SpeculativeInsertionWait(xwait, speculativeToken);
205  else
206  XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
207 
208  /* start over... */
209  if (stack)
210  _bt_freestack(stack);
211  goto search;
212  }
213 
214  /* Uniqueness is established -- restore heap tid as scantid */
215  if (itup_key->heapkeyspace)
216  itup_key->scantid = &itup->t_tid;
217  }
218 
219  if (checkUnique != UNIQUE_CHECK_EXISTING)
220  {
221  OffsetNumber newitemoff;
222 
223  /*
224  * The only conflict predicate locking cares about for indexes is when
225  * an index tuple insert conflicts with an existing lock. We don't
226  * know the actual page we're going to insert on for sure just yet in
227  * checkingunique and !heapkeyspace cases, but it's okay to use the
228  * first page the value could be on (with scantid omitted) instead.
229  */
/* NOTE(review): content missing at line 230/231 — presumably the call
 * "CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));"
 * (the function is in this block's References list and the comment above
 * describes exactly that call). TODO confirm against nbtinsert.c. */
231 
232  /*
233  * Do the insertion. Note that insertstate contains cached binary
234  * search bounds established within _bt_check_unique when insertion is
235  * checkingunique.
236  */
237  newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
238  stack, heapRel);
239  _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
240  itup, insertstate.itemsz, newitemoff,
241  insertstate.postingoff, false);
242  }
243  else
244  {
245  /* just release the buffer */
246  _bt_relbuf(rel, insertstate.buf);
247  }
248 
249  /* be tidy */
250  if (stack)
251  _bt_freestack(stack);
252  pfree(itup_key);
253 
254  return is_unique;
255 }
static void _bt_insertonpg(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, Size itemsz, OffsetNumber newitemoff, int postingoff, bool split_only_page)
Definition: nbtinsert.c:1101
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:175
bool bounds_valid
Definition: nbtree.h:696
uint32 TransactionId
Definition: c.h:520
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:90
ItemPointer scantid
Definition: nbtree.h:664
ItemPointerData t_tid
Definition: itup.h:37
static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, BTStack stack, Relation heapRel)
Definition: nbtinsert.c:790
#define InvalidBuffer
Definition: buf.h:25
uint16 OffsetNumber
Definition: off.h:24
void pfree(void *pointer)
Definition: mcxt.c:1057
void SpeculativeInsertionWait(TransactionId xid, uint32 token)
Definition: lmgr.c:796
unsigned int uint32
Definition: c.h:374
IndexTuple itup
Definition: nbtree.h:684
static BTStack _bt_search_insert(Relation rel, BTInsertState insertstate)
Definition: nbtinsert.c:296
bool anynullkeys
Definition: nbtree.h:661
void CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno)
Definition: predicate.c:4375
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:959
void XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, XLTW_Oper oper)
Definition: lmgr.c:639
BTScanInsert itup_key
Definition: nbtree.h:686
#define Assert(condition)
Definition: c.h:745
bool heapkeyspace
Definition: nbtree.h:659
#define MAXALIGN(LEN)
Definition: c.h:698
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2661
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition: nbtinsert.c:387
#define unlikely(x)
Definition: c.h:206
#define TransactionIdIsValid(xid)
Definition: transam.h:41
#define IndexTupleSize(itup)
Definition: itup.h:71

◆ _bt_end_vacuum()

void _bt_end_vacuum ( Relation  rel)

Definition at line 2025 of file nbtutils.c.

References LockRelId::dbId, i, LockInfoData::lockRelId, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTVacInfo::num_vacuums, RelationData::rd_lockInfo, LockRelId::relId, BTOneVacInfo::relid, and BTVacInfo::vacuums.

Referenced by _bt_end_vacuum_callback(), and btbulkdelete().

/*
 * _bt_end_vacuum() -- remove this relation's entry from the shared
 * btvacinfo array of in-progress btree vacuums, under BtreeVacuumLock.
 * The entry is removed by overwriting it with the array's last entry.
 *
 * NOTE(review): Doxygen listing with one extraction gap (flagged below).
 */
2026 {
2027  int i;
2028 
2029  LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
2030 
2031  /* Find the array entry */
2032  for (i = 0; i < btvacinfo->num_vacuums; i++)
2033  {
2034  BTOneVacInfo *vac = &btvacinfo->vacuums[i];
2035 
2036  if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
2037  vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
2038  {
2039  /* Remove it by shifting down the last entry */
2040  *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
/* NOTE(review): line 2041 missing — presumably
 * "btvacinfo->num_vacuums--;": without a decrement the "shift down the
 * last entry" removal above would leave a duplicate trailing entry.
 * Dropped by the doc extractor; confirm against nbtutils.c. */
2042  break;
2043  }
2044  }
2045 
2046  LWLockRelease(BtreeVacuumLock);
2047 }
LockRelId lockRelId
Definition: rel.h:44
static BTVacInfo * btvacinfo
Definition: nbtutils.c:1921
LockRelId relid
Definition: nbtutils.c:1909
Oid dbId
Definition: rel.h:39
BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtutils.c:1918
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1812
LockInfoData rd_lockInfo
Definition: rel.h:112
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1208
int num_vacuums
Definition: nbtutils.c:1916
int i
Oid relId
Definition: rel.h:38

◆ _bt_end_vacuum_callback()

void _bt_end_vacuum_callback ( int  code,
Datum  arg 
)

Definition at line 2053 of file nbtutils.c.

References _bt_end_vacuum(), and DatumGetPointer.

Referenced by btbulkdelete().

/*
 * _bt_end_vacuum_callback() -- PG_ENSURE-style error callback wrapper
 * around _bt_end_vacuum() (see "References" above), registered by
 * btbulkdelete() so the shared vacuum entry is cleaned up on error.
 *
 * NOTE(review): the entire body (line 2055) was dropped by the doc
 * extractor — presumably
 * "_bt_end_vacuum((Relation) DatumGetPointer(arg));", since the References
 * list names exactly _bt_end_vacuum() and DatumGetPointer. Confirm against
 * nbtutils.c.
 */
2054 {
2056 }
void _bt_end_vacuum(Relation rel)
Definition: nbtutils.c:2025
#define DatumGetPointer(X)
Definition: postgres.h:549
void * arg

◆ _bt_findsplitloc()

OffsetNumber _bt_findsplitloc ( Relation  rel,
Page  origpage,
OffsetNumber  newitemoff,
Size  newitemsz,
IndexTuple  newitem,
bool newitemonleft 
)

Definition at line 130 of file nbtsplitloc.c.

References _bt_afternewitemoff(), _bt_bestsplitloc(), _bt_defaultinterval(), _bt_deltasortsplits(), _bt_recsplitloc(), _bt_strategy(), Assert, BTGetFillFactor, BTREE_NONLEAF_FILLFACTOR, BTREE_SINGLEVAL_FILLFACTOR, BTreeTupleIsPosting(), elog, ERROR, SplitPoint::firstrightoff, i, IndexRelationGetNumberOfKeyAttributes, FindSplitData::interval, FindSplitData::is_leaf, FindSplitData::is_rightmost, ItemIdGetLength, FindSplitData::leftspace, MAXALIGN, FindSplitData::maxsplits, FindSplitData::minfirstrightsz, FindSplitData::newitem, FindSplitData::newitemoff, SplitPoint::newitemonleft, FindSplitData::newitemsz, FindSplitData::nsplits, OffsetNumberNext, FindSplitData::olddataitemstotal, FindSplitData::origpage, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_RIGHTMOST, PageGetExactFreeSpace(), PageGetItemId, PageGetMaxOffsetNumber, PageGetPageSize, PageGetSpecialPointer, palloc(), pfree(), FindSplitData::rel, RelationGetRelationName, FindSplitData::rightspace, SizeOfPageHeaderData, SPLIT_DEFAULT, SPLIT_MANY_DUPLICATES, SPLIT_SINGLE_VALUE, and FindSplitData::splits.

Referenced by _bt_split().

/*
 * _bt_findsplitloc() -- choose the page split point for _bt_split().
 * Enumerates every legal candidate split point on origpage (imagining
 * newitem already inserted), sorts candidates by free-space delta under a
 * strategy-dependent fillfactor multiplier, and returns the first-right
 * offset of the best split, setting *newitemonleft for the caller.
 *
 * NOTE(review): Doxygen listing; several hyperlink-only source lines were
 * dropped during extraction (flagged below). Confirm against nbtsplitloc.c.
 */
136 {
137  BTPageOpaque opaque;
138  int leftspace,
139  rightspace,
140  olddataitemstotal,
141  olddataitemstoleft,
142  perfectpenalty,
143  leaffillfactor;
/* NOTE(review): line 144 missing — evidently a "FindSplitData state;"
 * declaration, since "state" is assigned and used throughout below. */
145  FindSplitStrat strategy;
146  ItemId itemid;
147  OffsetNumber offnum,
148  maxoff,
149  firstrightoff;
150  double fillfactormult;
151  bool usemult;
152  SplitPoint leftpage,
153  rightpage;
154 
155  opaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
156  maxoff = PageGetMaxOffsetNumber(origpage);
157 
158  /* Total free space available on a btree page, after fixed overhead */
159  leftspace = rightspace =
/* NOTE(review): line 160 missing — presumably
 * "PageGetPageSize(origpage) - SizeOfPageHeaderData -" (both symbols are in
 * this block's References list, and line 161 below subtracts the special
 * space). TODO confirm. */
161  MAXALIGN(sizeof(BTPageOpaqueData));
162 
163  /* The right page will have the same high key as the old page */
164  if (!P_RIGHTMOST(opaque))
165  {
166  itemid = PageGetItemId(origpage, P_HIKEY);
167  rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
168  sizeof(ItemIdData));
169  }
170 
171  /* Count up total space in data items before actually scanning 'em */
172  olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(origpage);
173  leaffillfactor = BTGetFillFactor(rel);
174 
175  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
176  newitemsz += sizeof(ItemIdData);
177  state.rel = rel;
178  state.origpage = origpage;
179  state.newitem = newitem;
180  state.newitemsz = newitemsz;
181  state.is_leaf = P_ISLEAF(opaque);
182  state.is_rightmost = P_RIGHTMOST(opaque);
183  state.leftspace = leftspace;
184  state.rightspace = rightspace;
185  state.olddataitemstotal = olddataitemstotal;
186  state.minfirstrightsz = SIZE_MAX;
187  state.newitemoff = newitemoff;
188 
189  /* newitem cannot be a posting list item */
190  Assert(!BTreeTupleIsPosting(newitem));
191 
192  /*
193  * nsplits should never exceed maxoff because there will be at most as
194  * many candidate split points as there are points _between_ tuples, once
195  * you imagine that the new item is already on the original page (the
196  * final number of splits may be slightly lower because not all points
197  * between tuples will be legal).
198  */
199  state.maxsplits = maxoff;
200  state.splits = palloc(sizeof(SplitPoint) * state.maxsplits);
201  state.nsplits = 0;
202 
203  /*
204  * Scan through the data items and calculate space usage for a split at
205  * each possible position
206  */
207  olddataitemstoleft = 0;
208 
209  for (offnum = P_FIRSTDATAKEY(opaque);
210  offnum <= maxoff;
211  offnum = OffsetNumberNext(offnum))
212  {
213  Size itemsz;
214 
215  itemid = PageGetItemId(origpage, offnum);
216  itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
217 
218  /*
219  * When item offset number is not newitemoff, neither side of the
220  * split can be newitem. Record a split after the previous data item
221  * from original page, but before the current data item from original
222  * page. (_bt_recsplitloc() will reject the split when there are no
223  * previous items, which we rely on.)
224  */
225  if (offnum < newitemoff)
226  _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
227  else if (offnum > newitemoff)
228  _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
229  else
230  {
231  /*
232  * Record a split after all "offnum < newitemoff" original page
233  * data items, but before newitem
234  */
235  _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
236 
237  /*
238  * Record a split after newitem, but before data item from
239  * original page at offset newitemoff/current offset
240  */
241  _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
242  }
243 
244  olddataitemstoleft += itemsz;
245  }
246 
247  /*
248  * Record a split after all original page data items, but before newitem.
249  * (Though only when it's possible that newitem will end up alone on new
250  * right page.)
251  */
252  Assert(olddataitemstoleft == olddataitemstotal);
253  if (newitemoff > maxoff)
254  _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0);
255 
256  /*
257  * I believe it is not possible to fail to find a feasible split, but just
258  * in case ...
259  */
260  if (state.nsplits == 0)
261  elog(ERROR, "could not find a feasible split point for index \"%s\"",
/* NOTE(review): line 262 missing — presumably the elog argument
 * "RelationGetRelationName(rel));" (the macro is in this block's References
 * list and the format string needs a name argument). TODO confirm. */
263 
264  /*
265  * Start search for a split point among list of legal split points. Give
266  * primary consideration to equalizing available free space in each half
267  * of the split initially (start with default strategy), while applying
268  * rightmost and split-after-new-item optimizations where appropriate.
269  * Either of the two other fallback strategies may be required for cases
270  * with a large number of duplicates around the original/space-optimal
271  * split point.
272  *
273  * Default strategy gives some weight to suffix truncation in deciding a
274  * split point on leaf pages. It attempts to select a split point where a
275  * distinguishing attribute appears earlier in the new high key for the
276  * left side of the split, in order to maximize the number of trailing
277  * attributes that can be truncated away. Only candidate split points
278  * that imply an acceptable balance of free space on each side are
279  * considered. See _bt_defaultinterval().
280  */
281  if (!state.is_leaf)
282  {
283  /* fillfactormult only used on rightmost page */
284  usemult = state.is_rightmost;
285  fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0;
286  }
287  else if (state.is_rightmost)
288  {
289  /* Rightmost leaf page -- fillfactormult always used */
290  usemult = true;
291  fillfactormult = leaffillfactor / 100.0;
292  }
293  else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
294  {
295  /*
296  * New item inserted at rightmost point among a localized grouping on
297  * a leaf page -- apply "split after new item" optimization, either by
298  * applying leaf fillfactor multiplier, or by choosing the exact split
299  * point that leaves newitem as lastleft. (usemult is set for us.)
300  */
301  if (usemult)
302  {
303  /* fillfactormult should be set based on leaf fillfactor */
304  fillfactormult = leaffillfactor / 100.0;
305  }
306  else
307  {
308  /* find precise split point after newitemoff */
309  for (int i = 0; i < state.nsplits; i++)
310  {
311  SplitPoint *split = state.splits + i;
312 
313  if (split->newitemonleft &&
314  newitemoff == split->firstrightoff)
315  {
316  pfree(state.splits);
317  *newitemonleft = true;
318  return newitemoff;
319  }
320  }
321 
322  /*
323  * Cannot legally split after newitemoff; proceed with split
324  * without using fillfactor multiplier. This is defensive, and
325  * should never be needed in practice.
326  */
327  fillfactormult = 0.50;
328  }
329  }
330  else
331  {
332  /* Other leaf page. 50:50 page split. */
333  usemult = false;
334  /* fillfactormult not used, but be tidy */
335  fillfactormult = 0.50;
336  }
337 
338  /*
339  * Save leftmost and rightmost splits for page before original ordinal
340  * sort order is lost by delta/fillfactormult sort
341  */
342  leftpage = state.splits[0];
343  rightpage = state.splits[state.nsplits - 1];
344 
345  /* Give split points a fillfactormult-wise delta, and sort on deltas */
346  _bt_deltasortsplits(&state, fillfactormult, usemult);
347 
348  /* Determine split interval for default strategy */
349  state.interval = _bt_defaultinterval(&state);
350 
351  /*
352  * Determine if default strategy/split interval will produce a
353  * sufficiently distinguishing split, or if we should change strategies.
354  * Alternative strategies change the range of split points that are
355  * considered acceptable (split interval), and possibly change
356  * fillfactormult, in order to deal with pages with a large number of
357  * duplicates gracefully.
358  *
359  * Pass low and high splits for the entire page (actually, they're for an
360  * imaginary version of the page that includes newitem). These are used
361  * when the initial split interval encloses split points that are full of
362  * duplicates, and we need to consider if it's even possible to avoid
363  * appending a heap TID.
364  */
365  perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);
366 
367  if (strategy == SPLIT_DEFAULT)
368  {
369  /*
370  * Default strategy worked out (always works out with internal page).
371  * Original split interval still stands.
372  */
373  }
374 
375  /*
376  * Many duplicates strategy is used when a heap TID would otherwise be
377  * appended, but the page isn't completely full of logical duplicates.
378  *
379  * The split interval is widened to include all legal candidate split
380  * points. There might be as few as two distinct values in the whole-page
381  * split interval, though it's also possible that most of the values on
382  * the page are unique. The final split point will either be to the
383  * immediate left or to the immediate right of the group of duplicate
384  * tuples that enclose the first/delta-optimal split point (perfect
385  * penalty was set so that the lowest delta split point that avoids
386  * appending a heap TID will be chosen). Maximizing the number of
387  * attributes that can be truncated away is not a goal of the many
388  * duplicates strategy.
389  *
390  * Single value strategy is used when it is impossible to avoid appending
391  * a heap TID. It arranges to leave the left page very full. This
392  * maximizes space utilization in cases where tuples with the same
393  * attribute values span many pages. Newly inserted duplicates will tend
394  * to have higher heap TID values, so we'll end up splitting to the right
395  * consistently. (Single value strategy is harmless though not
396  * particularly useful with !heapkeyspace indexes.)
397  */
398  else if (strategy == SPLIT_MANY_DUPLICATES)
399  {
400  Assert(state.is_leaf);
401  /* Shouldn't try to truncate away extra user attributes */
402  Assert(perfectpenalty ==
/* NOTE(review): line 403 missing — presumably the Assert's right-hand
 * side, likely "IndexRelationGetNumberOfKeyAttributes(rel) + 1);" (that
 * macro appears in this block's References list). TODO confirm. */
404  /* No need to resort splits -- no change in fillfactormult/deltas */
405  state.interval = state.nsplits;
406  }
407  else if (strategy == SPLIT_SINGLE_VALUE)
408  {
409  Assert(state.is_leaf);
410  /* Split near the end of the page */
411  usemult = true;
412  fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
413  /* Resort split points with new delta */
414  _bt_deltasortsplits(&state, fillfactormult, usemult);
415  /* Appending a heap TID is unavoidable, so interval of 1 is fine */
416  state.interval = 1;
417  }
418 
419  /*
420  * Search among acceptable split points (using final split interval) for
421  * the entry that has the lowest penalty, and is therefore expected to
422  * maximize fan-out. Sets *newitemonleft for us.
423  */
424  firstrightoff = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
425  strategy);
426  pfree(state.splits);
427 
428  return firstrightoff;
429 }
bool is_rightmost
Definition: nbtsplitloc.c:49
Size minfirstrightsz
Definition: nbtsplitloc.c:54
static int _bt_defaultinterval(FindSplitData *state)
Definition: nbtsplitloc.c:882
IndexTuple newitem
Definition: nbtsplitloc.c:46
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:244
bool newitemonleft
Definition: nbtsplitloc.c:37
OffsetNumber firstrightoff
Definition: nbtsplitloc.c:36
#define SizeOfPageHeaderData
Definition: bufpage.h:216
static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage, SplitPoint *rightpage, FindSplitStrat *strategy)
Definition: nbtsplitloc.c:940
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
SplitPoint * splits
Definition: nbtsplitloc.c:59
static void _bt_recsplitloc(FindSplitData *state, OffsetNumber firstrightoff, bool newitemonleft, int olddataitemstoleft, Size firstrightofforigpagetuplesz)
Definition: nbtsplitloc.c:450
uint16 OffsetNumber
Definition: off.h:24
void pfree(void *pointer)
Definition: mcxt.c:1057
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
#define BTREE_NONLEAF_FILLFACTOR
Definition: nbtree.h:195
#define ERROR
Definition: elog.h:43
static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft, FindSplitStrat strategy)
Definition: nbtsplitloc.c:794
#define PageGetPageSize(page)
Definition: bufpage.h:268
static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, int leaffillfactor, bool *usemult)
Definition: nbtsplitloc.c:636
#define RelationGetRelationName(relation)
Definition: rel.h:490
struct ItemIdData ItemIdData
Relation rel
Definition: nbtsplitloc.c:44
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:475
#define BTREE_SINGLEVAL_FILLFACTOR
Definition: nbtree.h:196
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
#define BTGetFillFactor(relation)
Definition: nbtree.h:968
FindSplitStrat
Definition: nbtsplitloc.c:20
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:360
int olddataitemstotal
Definition: nbtsplitloc.c:53
#define Assert(condition)
Definition: c.h:745
Definition: regguts.h:298
OffsetNumber newitemoff
Definition: nbtsplitloc.c:50
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
size_t Size
Definition: c.h:473
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define MAXALIGN(LEN)
Definition: c.h:698
Size PageGetExactFreeSpace(Page page)
Definition: bufpage.c:833
#define P_HIKEY
Definition: nbtree.h:242
void * palloc(Size size)
Definition: mcxt.c:950
#define elog(elevel,...)
Definition: elog.h:214
int i
static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult, bool usemult)
Definition: nbtsplitloc.c:567
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:213
#define P_ISLEAF(opaque)
Definition: nbtree.h:214

◆ _bt_finish_split()

void _bt_finish_split ( Relation  rel,
Buffer  lbuf,
BTStack  stack 
)

Definition at line 2213 of file nbtinsert.c.

References _bt_getbuf(), _bt_insert_parent(), _bt_relbuf(), Assert, BT_WRITE, BTMetaPageData::btm_root, BTPageGetMeta, BTPageOpaqueData::btpo_next, BTREE_METAPAGE, BufferGetBlockNumber(), BufferGetPage, DEBUG1, elog, P_INCOMPLETE_SPLIT, P_LEFTMOST, P_RIGHTMOST, and PageGetSpecialPointer.

Referenced by _bt_getstackbuf(), _bt_moveright(), and _bt_stepright().

2214 {
2215  Page lpage = BufferGetPage(lbuf);
2216  BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
2217  Buffer rbuf;
2218  Page rpage;
2219  BTPageOpaque rpageop;
2220  bool was_root;
2221  bool was_only;
2222 
2223  Assert(P_INCOMPLETE_SPLIT(lpageop));
2224 
2225  /* Lock right sibling, the one missing the downlink */
2226  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2227  rpage = BufferGetPage(rbuf);
2228  rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
2229 
2230  /* Could this be a root split? */
2231  if (!stack)
2232  {
2233  Buffer metabuf;
2234  Page metapg;
2235  BTMetaPageData *metad;
2236 
2237  /* acquire lock on the metapage */
2238  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2239  metapg = BufferGetPage(metabuf);
2240  metad = BTPageGetMeta(metapg);
2241 
2242  was_root = (metad->btm_root == BufferGetBlockNumber(lbuf));
2243 
2244  _bt_relbuf(rel, metabuf);
2245  }
2246  else
2247  was_root = false;
2248 
2249  /* Was this the only page on the level before split? */
2250  was_only = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2251 
2252  elog(DEBUG1, "finishing incomplete split of %u/%u",
2254 
2255  _bt_insert_parent(rel, lbuf, rbuf, stack, was_root, was_only);
2256 }
BlockNumber btpo_next
Definition: nbtree.h:59
#define DEBUG1
Definition: elog.h:25
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:803
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only)
Definition: nbtinsert.c:2076
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:221
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
#define BTPageGetMeta(p)
Definition: nbtree.h:114
#define P_LEFTMOST(opaque)
Definition: nbtree.h:212
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define BTREE_METAPAGE
Definition: nbtree.h:141
BlockNumber btm_root
Definition: nbtree.h:102
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:959
#define Assert(condition)
Definition: c.h:745
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2661
#define elog(elevel,...)
Definition: elog.h:214
#define BT_WRITE
Definition: nbtree.h:588
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:213
Pointer Page
Definition: bufpage.h:78

◆ _bt_first()

bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 848 of file nbtsearch.c.

References _bt_binsrch(), _bt_drop_lock_and_maybe_pin(), _bt_endpoint(), _bt_freestack(), _bt_initialize_more_data(), _bt_metaversion(), _bt_parallel_done(), _bt_parallel_readpage(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readpage(), _bt_search(), _bt_steppage(), _bt_unlockbuf(), Assert, BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosInvalidate, BTScanPosIsValid, buf, BTScanPosData::buf, BufferGetBlockNumber(), BufferIsValid, cur, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, DatumGetPointer, elog, ERROR, get_opfamily_proc(), i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, InvalidBlockNumber, InvalidOid, InvalidStrategy, BTScanPosData::itemIndex, BTScanPosData::items, BTScanOpaqueData::keyData, BTScanOpaqueData::numberOfKeys, OffsetNumberPrev, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockPage(), PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, ScanKeyData::sk_subtype, status(), IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

849 {
850  Relation rel = scan->indexRelation;
851  BTScanOpaque so = (BTScanOpaque) scan->opaque;
852  Buffer buf;
853  BTStack stack;
854  OffsetNumber offnum;
855  StrategyNumber strat;
856  bool nextkey;
857  bool goback;
858  BTScanInsertData inskey;
859  ScanKey startKeys[INDEX_MAX_KEYS];
860  ScanKeyData notnullkeys[INDEX_MAX_KEYS];
861  int keysCount = 0;
862  int i;
863  bool status;
864  StrategyNumber strat_total;
865  BTScanPosItem *currItem;
866  BlockNumber blkno;
867 
869 
871 
872  /*
873  * Examine the scan keys and eliminate any redundant keys; also mark the
874  * keys that must be matched to continue the scan.
875  */
876  _bt_preprocess_keys(scan);
877 
878  /*
879  * Quit now if _bt_preprocess_keys() discovered that the scan keys can
880  * never be satisfied (eg, x == 1 AND x > 2).
881  */
882  if (!so->qual_ok)
883  {
884  /* Notify any other workers that we're done with this scan key. */
885  _bt_parallel_done(scan);
886  return false;
887  }
888 
889  /*
890  * For parallel scans, get the starting page from shared state. If the
891  * scan has not started, proceed to find out first leaf page in the usual
892  * way while keeping other participating processes waiting. If the scan
893  * has already begun, use the page number from the shared structure.
894  */
895  if (scan->parallel_scan != NULL)
896  {
897  status = _bt_parallel_seize(scan, &blkno);
898  if (!status)
899  return false;
900  else if (blkno == P_NONE)
901  {
902  _bt_parallel_done(scan);
903  return false;
904  }
905  else if (blkno != InvalidBlockNumber)
906  {
907  if (!_bt_parallel_readpage(scan, blkno, dir))
908  return false;
909  goto readcomplete;
910  }
911  }
912 
913  /*----------
914  * Examine the scan keys to discover where we need to start the scan.
915  *
916  * We want to identify the keys that can be used as starting boundaries;
917  * these are =, >, or >= keys for a forward scan or =, <, <= keys for
918  * a backwards scan. We can use keys for multiple attributes so long as
919  * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
920  * a > or < boundary or find an attribute with no boundary (which can be
921  * thought of as the same as "> -infinity"), we can't use keys for any
922  * attributes to its right, because it would break our simplistic notion
923  * of what initial positioning strategy to use.
924  *
925  * When the scan keys include cross-type operators, _bt_preprocess_keys
926  * may not be able to eliminate redundant keys; in such cases we will
927  * arbitrarily pick a usable one for each attribute. This is correct
928  * but possibly not optimal behavior. (For example, with keys like
929  * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
930  * x=5 would be more efficient.) Since the situation only arises given
931  * a poorly-worded query plus an incomplete opfamily, live with it.
932  *
933  * When both equality and inequality keys appear for a single attribute
934  * (again, only possible when cross-type operators appear), we *must*
935  * select one of the equality keys for the starting point, because
936  * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
937  * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
938  * start at x=4, we will fail and stop before reaching x=10. If multiple
939  * equality quals survive preprocessing, however, it doesn't matter which
940  * one we use --- by definition, they are either redundant or
941  * contradictory.
942  *
943  * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
944  * If the index stores nulls at the end of the index we'll be starting
945  * from, and we have no boundary key for the column (which means the key
946  * we deduced NOT NULL from is an inequality key that constrains the other
947  * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
948  * use as a boundary key. If we didn't do this, we might find ourselves
949  * traversing a lot of null entries at the start of the scan.
950  *
951  * In this loop, row-comparison keys are treated the same as keys on their
952  * first (leftmost) columns. We'll add on lower-order columns of the row
953  * comparison below, if possible.
954  *
955  * The selected scan keys (at most one per index column) are remembered by
956  * storing their addresses into the local startKeys[] array.
957  *----------
958  */
959  strat_total = BTEqualStrategyNumber;
960  if (so->numberOfKeys > 0)
961  {
962  AttrNumber curattr;
963  ScanKey chosen;
964  ScanKey impliesNN;
965  ScanKey cur;
966 
967  /*
968  * chosen is the so-far-chosen key for the current attribute, if any.
969  * We don't cast the decision in stone until we reach keys for the
970  * next attribute.
971  */
972  curattr = 1;
973  chosen = NULL;
974  /* Also remember any scankey that implies a NOT NULL constraint */
975  impliesNN = NULL;
976 
977  /*
978  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
979  * pass to handle after-last-key processing. Actual exit from the
980  * loop is at one of the "break" statements below.
981  */
982  for (cur = so->keyData, i = 0;; cur++, i++)
983  {
984  if (i >= so->numberOfKeys || cur->sk_attno != curattr)
985  {
986  /*
987  * Done looking at keys for curattr. If we didn't find a
988  * usable boundary key, see if we can deduce a NOT NULL key.
989  */
990  if (chosen == NULL && impliesNN != NULL &&
991  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
994  {
995  /* Yes, so build the key in notnullkeys[keysCount] */
996  chosen = &notnullkeys[keysCount];
997  ScanKeyEntryInitialize(chosen,
999  (impliesNN->sk_flags &
1001  curattr,
1002  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1005  InvalidOid,
1006  InvalidOid,
1007  InvalidOid,
1008  (Datum) 0);
1009  }
1010 
1011  /*
1012  * If we still didn't find a usable boundary key, quit; else
1013  * save the boundary key pointer in startKeys.
1014  */
1015  if (chosen == NULL)
1016  break;
1017  startKeys[keysCount++] = chosen;
1018 
1019  /*
1020  * Adjust strat_total, and quit if we have stored a > or <
1021  * key.
1022  */
1023  strat = chosen->sk_strategy;
1024  if (strat != BTEqualStrategyNumber)
1025  {
1026  strat_total = strat;
1027  if (strat == BTGreaterStrategyNumber ||
1028  strat == BTLessStrategyNumber)
1029  break;
1030  }
1031 
1032  /*
1033  * Done if that was the last attribute, or if next key is not
1034  * in sequence (implying no boundary key is available for the
1035  * next attribute).
1036  */
1037  if (i >= so->numberOfKeys ||
1038  cur->sk_attno != curattr + 1)
1039  break;
1040 
1041  /*
1042  * Reset for next attr.
1043  */
1044  curattr = cur->sk_attno;
1045  chosen = NULL;
1046  impliesNN = NULL;
1047  }
1048 
1049  /*
1050  * Can we use this key as a starting boundary for this attr?
1051  *
1052  * If not, does it imply a NOT NULL constraint? (Because
1053  * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1054  * *any* inequality key works for that; we need not test.)
1055  */
1056  switch (cur->sk_strategy)
1057  {
1058  case BTLessStrategyNumber:
1060  if (chosen == NULL)
1061  {
1062  if (ScanDirectionIsBackward(dir))
1063  chosen = cur;
1064  else
1065  impliesNN = cur;
1066  }
1067  break;
1068  case BTEqualStrategyNumber:
1069  /* override any non-equality choice */
1070  chosen = cur;
1071  break;
1074  if (chosen == NULL)
1075  {
1076  if (ScanDirectionIsForward(dir))
1077  chosen = cur;
1078  else
1079  impliesNN = cur;
1080  }
1081  break;
1082  }
1083  }
1084  }
1085 
1086  /*
1087  * If we found no usable boundary keys, we have to start from one end of
1088  * the tree. Walk down that edge to the first or last key, and scan from
1089  * there.
1090  */
1091  if (keysCount == 0)
1092  {
1093  bool match;
1094 
1095  match = _bt_endpoint(scan, dir);
1096 
1097  if (!match)
1098  {
1099  /* No match, so mark (parallel) scan finished */
1100  _bt_parallel_done(scan);
1101  }
1102 
1103  return match;
1104  }
1105 
1106  /*
1107  * We want to start the scan somewhere within the index. Set up an
1108  * insertion scankey we can use to search for the boundary point we
1109  * identified above. The insertion scankey is built using the keys
1110  * identified by startKeys[]. (Remaining insertion scankey fields are
1111  * initialized after initial-positioning strategy is finalized.)
1112  */
1113  Assert(keysCount <= INDEX_MAX_KEYS);
1114  for (i = 0; i < keysCount; i++)
1115  {
1116  ScanKey cur = startKeys[i];
1117 
1118  Assert(cur->sk_attno == i + 1);
1119 
1120  if (cur->sk_flags & SK_ROW_HEADER)
1121  {
1122  /*
1123  * Row comparison header: look to the first row member instead.
1124  *
1125  * The member scankeys are already in insertion format (ie, they
1126  * have sk_func = 3-way-comparison function), but we have to watch
1127  * out for nulls, which _bt_preprocess_keys didn't check. A null
1128  * in the first row member makes the condition unmatchable, just
1129  * like qual_ok = false.
1130  */
1131  ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1132 
1133  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1134  if (subkey->sk_flags & SK_ISNULL)
1135  {
1136  _bt_parallel_done(scan);
1137  return false;
1138  }
1139  memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1140 
1141  /*
1142  * If the row comparison is the last positioning key we accepted,
1143  * try to add additional keys from the lower-order row members.
1144  * (If we accepted independent conditions on additional index
1145  * columns, we use those instead --- doesn't seem worth trying to
1146  * determine which is more restrictive.) Note that this is OK
1147  * even if the row comparison is of ">" or "<" type, because the
1148  * condition applied to all but the last row member is effectively
1149  * ">=" or "<=", and so the extra keys don't break the positioning
1150  * scheme. But, by the same token, if we aren't able to use all
1151  * the row members, then the part of the row comparison that we
1152  * did use has to be treated as just a ">=" or "<=" condition, and
1153  * so we'd better adjust strat_total accordingly.
1154  */
1155  if (i == keysCount - 1)
1156  {
1157  bool used_all_subkeys = false;
1158 
1159  Assert(!(subkey->sk_flags & SK_ROW_END));
1160  for (;;)
1161  {
1162  subkey++;
1163  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1164  if (subkey->sk_attno != keysCount + 1)
1165  break; /* out-of-sequence, can't use it */
1166  if (subkey->sk_strategy != cur->sk_strategy)
1167  break; /* wrong direction, can't use it */
1168  if (subkey->sk_flags & SK_ISNULL)
1169  break; /* can't use null keys */
1170  Assert(keysCount < INDEX_MAX_KEYS);
1171  memcpy(inskey.scankeys + keysCount, subkey,
1172  sizeof(ScanKeyData));
1173  keysCount++;
1174  if (subkey->sk_flags & SK_ROW_END)
1175  {
1176  used_all_subkeys = true;
1177  break;
1178  }
1179  }
1180  if (!used_all_subkeys)
1181  {
1182  switch (strat_total)
1183  {
1184  case BTLessStrategyNumber:
1185  strat_total = BTLessEqualStrategyNumber;
1186  break;
1188  strat_total = BTGreaterEqualStrategyNumber;
1189  break;
1190  }
1191  }
1192  break; /* done with outer loop */
1193  }
1194  }
1195  else
1196  {
1197  /*
1198  * Ordinary comparison key. Transform the search-style scan key
1199  * to an insertion scan key by replacing the sk_func with the
1200  * appropriate btree comparison function.
1201  *
1202  * If scankey operator is not a cross-type comparison, we can use
1203  * the cached comparison function; otherwise gotta look it up in
1204  * the catalogs. (That can't lead to infinite recursion, since no
1205  * indexscan initiated by syscache lookup will use cross-data-type
1206  * operators.)
1207  *
1208  * We support the convention that sk_subtype == InvalidOid means
1209  * the opclass input type; this is a hack to simplify life for
1210  * ScanKeyInit().
1211  */
1212  if (cur->sk_subtype == rel->rd_opcintype[i] ||
1213  cur->sk_subtype == InvalidOid)
1214  {
1215  FmgrInfo *procinfo;
1216 
1217  procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1218  ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1219  cur->sk_flags,
1220  cur->sk_attno,
1222  cur->sk_subtype,
1223  cur->sk_collation,
1224  procinfo,
1225  cur->sk_argument);
1226  }
1227  else
1228  {
1229  RegProcedure cmp_proc;
1230 
1231  cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1232  rel->rd_opcintype[i],
1233  cur->sk_subtype,
1234  BTORDER_PROC);
1235  if (!RegProcedureIsValid(cmp_proc))
1236  elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1237  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1238  cur->sk_attno, RelationGetRelationName(rel));
1239  ScanKeyEntryInitialize(inskey.scankeys + i,
1240  cur->sk_flags,
1241  cur->sk_attno,
1243  cur->sk_subtype,
1244  cur->sk_collation,
1245  cmp_proc,
1246  cur->sk_argument);
1247  }
1248  }
1249  }
1250 
1251  /*----------
1252  * Examine the selected initial-positioning strategy to determine exactly
1253  * where we need to start the scan, and set flag variables to control the
1254  * code below.
1255  *
1256  * If nextkey = false, _bt_search and _bt_binsrch will locate the first
1257  * item >= scan key. If nextkey = true, they will locate the first
1258  * item > scan key.
1259  *
1260  * If goback = true, we will then step back one item, while if
1261  * goback = false, we will start the scan on the located item.
1262  *----------
1263  */
1264  switch (strat_total)
1265  {
1266  case BTLessStrategyNumber:
1267 
1268  /*
1269  * Find first item >= scankey, then back up one to arrive at last
1270  * item < scankey. (Note: this positioning strategy is only used
1271  * for a backward scan, so that is always the correct starting
1272  * position.)
1273  */
1274  nextkey = false;
1275  goback = true;
1276  break;
1277 
1279 
1280  /*
1281  * Find first item > scankey, then back up one to arrive at last
1282  * item <= scankey. (Note: this positioning strategy is only used
1283  * for a backward scan, so that is always the correct starting
1284  * position.)
1285  */
1286  nextkey = true;
1287  goback = true;
1288  break;
1289 
1290  case BTEqualStrategyNumber:
1291 
1292  /*
1293  * If a backward scan was specified, need to start with last equal
1294  * item not first one.
1295  */
1296  if (ScanDirectionIsBackward(dir))
1297  {
1298  /*
1299  * This is the same as the <= strategy. We will check at the
1300  * end whether the found item is actually =.
1301  */
1302  nextkey = true;
1303  goback = true;
1304  }
1305  else
1306  {
1307  /*
1308  * This is the same as the >= strategy. We will check at the
1309  * end whether the found item is actually =.
1310  */
1311  nextkey = false;
1312  goback = false;
1313  }
1314  break;
1315 
1317 
1318  /*
1319  * Find first item >= scankey. (This is only used for forward
1320  * scans.)
1321  */
1322  nextkey = false;
1323  goback = false;
1324  break;
1325 
1327 
1328  /*
1329  * Find first item > scankey. (This is only used for forward
1330  * scans.)
1331  */
1332  nextkey = true;
1333  goback = false;
1334  break;
1335 
1336  default:
1337  /* can't get here, but keep compiler quiet */
1338  elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1339  return false;
1340  }
1341 
1342  /* Initialize remaining insertion scan key fields */
1343  _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1344  inskey.anynullkeys = false; /* unused */
1345  inskey.nextkey = nextkey;
1346  inskey.pivotsearch = false;
1347  inskey.scantid = NULL;
1348  inskey.keysz = keysCount;
1349 
1350  /*
1351  * Use the manufactured insertion scan key to descend the tree and
1352  * position ourselves on the target leaf page.
1353  */
1354  stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);
1355 
1356  /* don't need to keep the stack around... */
1357  _bt_freestack(stack);
1358 
1359  if (!BufferIsValid(buf))
1360  {
1361  /*
1362  * We only get here if the index is completely empty. Lock relation
1363  * because nothing finer to lock exists.
1364  */
1365  PredicateLockRelation(rel, scan->xs_snapshot);
1366 
1367  /*
1368  * mark parallel scan as done, so that all the workers can finish
1369  * their scan
1370  */
1371  _bt_parallel_done(scan);
1373 
1374  return false;
1375  }
1376  else
1378  scan->xs_snapshot);
1379 
1380  _bt_initialize_more_data(so, dir);
1381 
1382  /* position to the precise item on the page */
1383  offnum = _bt_binsrch(rel, &inskey, buf);
1384 
1385  /*
1386  * If nextkey = false, we are positioned at the first item >= scan key, or
1387  * possibly at the end of a page on which all the existing items are less
1388  * than the scan key and we know that everything on later pages is greater
1389  * than or equal to scan key.
1390  *
1391  * If nextkey = true, we are positioned at the first item > scan key, or
1392  * possibly at the end of a page on which all the existing items are less
1393  * than or equal to the scan key and we know that everything on later
1394  * pages is greater than scan key.
1395  *
1396  * The actually desired starting point is either this item or the prior
1397  * one, or in the end-of-page case it's the first item on the next page or
1398  * the last item on this page. Adjust the starting offset if needed. (If
1399  * this results in an offset before the first item or after the last one,
1400  * _bt_readpage will report no items found, and then we'll step to the
1401  * next page as needed.)
1402  */
1403  if (goback)
1404  offnum = OffsetNumberPrev(offnum);
1405 
1406  /* remember which buffer we have pinned, if any */
1408  so->currPos.buf = buf;
1409 
1410  /*
1411  * Now load data from the first page of the scan.
1412  */
1413  if (!_bt_readpage(scan, dir, offnum))
1414  {
1415  /*
1416  * There's no actually-matching data on this page. Try to advance to
1417  * the next page. Return false if there's no matching data at all.
1418  */
1419  _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
1420  if (!_bt_steppage(scan, dir))
1421  return false;
1422  }
1423  else
1424  {
1425  /* Drop the lock, and maybe the pin, on the current page */
1427  }
1428 
1429 readcomplete:
1430  /* OK, itemIndex says what to return */
1431  currItem = &so->currPos.items[so->currPos.itemIndex];
1432  scan->xs_heaptid = currItem->heapTid;
1433  if (scan->xs_want_itup)
1434  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1435 
1436  return true;
1437 }
#define InvalidStrategy
Definition: stratnum.h:24
Oid sk_subtype
Definition: skey.h:69
Definition: fmgr.h:56
#define SK_ROW_MEMBER
Definition: skey.h:118
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:175
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2521
#define BTORDER_PROC
Definition: nbtree.h:575
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:801
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
IndexTuple xs_itup
Definition: relscan.h:139
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:728
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:2146
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:65
int itemIndex
Definition: nbtree.h:855
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2498
char * currTuples
Definition: nbtree.h:929
regproc RegProcedure
Definition: c.h:518
#define P_NONE
Definition: nbtree.h:206
struct cursor * cur
Definition: ecpg.c:28
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:879
struct SnapshotData * xs_snapshot
Definition: relscan.h:116
uint32 BlockNumber
Definition: block.h:31
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:343
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
Relation indexRelation
Definition: relscan.h:115
uint16 OffsetNumber
Definition: off.h:24
#define SK_ROW_END
Definition: skey.h:119
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
#define BT_READ
Definition: nbtree.h:587
#define ERROR
Definition: elog.h:43
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:946
StrategyNumber sk_strategy
Definition: skey.h:68
ItemPointerData xs_heaptid
Definition: relscan.h:144
static char * buf
Definition: pg_test_fsync.c:68
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
ScanKeyData * ScanKey
Definition: skey.h:75
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2379
#define RegProcedureIsValid(p)
Definition: c.h:653
IndexTupleData * IndexTuple
Definition: itup.h:53
#define RelationGetRelationName(relation)
Definition: rel.h:490
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:1417
#define SK_SEARCHNOTNULL
Definition: skey.h:122
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
Definition: nbtree.c:647
Oid * rd_opfamily
Definition: rel.h:189
#define SK_ISNULL
Definition: skey.h:115
#define SK_ROW_HEADER
Definition: skey.h:117
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition: nbtpage.c:668
void _bt_preprocess_keys(IndexScanDesc scan)
Definition: nbtutils.c:742
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:957
static void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
Definition: nbtsearch.c:2469
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:163
uintptr_t Datum
Definition: postgres.h:367
#define InvalidOid
Definition: postgres_ext.h:36
int sk_flags
Definition: skey.h:66
#define Assert(condition)
Definition: c.h:745
#define SK_BT_DESC
Definition: nbtree.h:956
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:794
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:885
#define INDEX_MAX_KEYS
#define InvalidBlockNumber
Definition: block.h:33
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
int numberOfKeys
Definition: nbtree.h:908
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:857
#define DatumGetPointer(X)
Definition: postgres.h:549
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1006
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2661
BTScanPosData currPos
Definition: nbtree.h:942
ScanKey keyData
Definition: nbtree.h:909
Oid sk_collation
Definition: skey.h:70
#define elog(elevel,...)
Definition: elog.h:214
int i
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1861
Buffer buf
Definition: nbtree.h:825
Oid * rd_opcintype
Definition: rel.h:190
static void static void status(const char *fmt,...) pg_attribute_printf(1
Definition: pg_regress.c:227
#define BTLessStrategyNumber
Definition: stratnum.h:29
int Buffer
Definition: buf.h:23
Datum sk_argument
Definition: skey.h:72
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Definition: nbtsearch.c:1509
int16 AttrNumber
Definition: attnum.h:21
BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:101
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
AttrNumber sk_attno
Definition: skey.h:67

◆ _bt_form_posting()

IndexTuple _bt_form_posting ( IndexTuple  base,
ItemPointer  htids,
int  nhtids 
)

Definition at line 616 of file nbtdedup.c.

References Assert, BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTreeTupleSetPosting(), INDEX_ALT_TID_MASK, INDEX_SIZE_MASK, IndexTupleSize, ItemPointerCopy, ItemPointerIsValid, MAXALIGN, palloc0(), PG_UINT16_MAX, IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_dedup_finish_pending(), _bt_sort_dedup_finish_pending(), and bt_posting_plain_tuple().

617 {
618  uint32 keysize,
619  newsize;
620  IndexTuple itup;
621 
622  if (BTreeTupleIsPosting(base))
623  keysize = BTreeTupleGetPostingOffset(base);
624  else
625  keysize = IndexTupleSize(base);
626 
627  Assert(!BTreeTupleIsPivot(base));
628  Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
629  Assert(keysize == MAXALIGN(keysize));
630 
631  /* Determine final size of new tuple */
632  if (nhtids > 1)
633  newsize = MAXALIGN(keysize +
634  nhtids * sizeof(ItemPointerData));
635  else
636  newsize = keysize;
637 
638  Assert(newsize <= INDEX_SIZE_MASK);
639  Assert(newsize == MAXALIGN(newsize));
640 
641  /* Allocate memory using palloc0() (matches index_form_tuple()) */
642  itup = palloc0(newsize);
643  memcpy(itup, base, keysize);
644  itup->t_info &= ~INDEX_SIZE_MASK;
645  itup->t_info |= newsize;
646  if (nhtids > 1)
647  {
648  /* Form posting list tuple */
649  BTreeTupleSetPosting(itup, nhtids, keysize);
650  memcpy(BTreeTupleGetPosting(itup), htids,
651  sizeof(ItemPointerData) * nhtids);
652  Assert(_bt_posting_valid(itup));
653  }
654  else
655  {
656  /* Form standard non-pivot tuple */
657  itup->t_info &= ~INDEX_ALT_TID_MASK;
658  ItemPointerCopy(htids, &itup->t_tid);
660  }
661 
662  return itup;
663 }
#define ItemPointerIsValid(pointer)
Definition: itemptr.h:82
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:348
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition: nbtree.h:405
ItemPointerData t_tid
Definition: itup.h:37
#define INDEX_SIZE_MASK
Definition: itup.h:65
#define INDEX_ALT_TID_MASK
Definition: nbtree.h:334
#define PG_UINT16_MAX
Definition: c.h:455
static void BTreeTupleSetPosting(IndexTuple itup, uint16 nhtids, int postingoffset)
Definition: nbtree.h:372
unsigned int uint32
Definition: c.h:374
void * palloc0(Size size)
Definition: mcxt.c:981
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:360
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:397
#define Assert(condition)
Definition: c.h:745
#define MAXALIGN(LEN)
Definition: c.h:698
unsigned short t_info
Definition: itup.h:49
#define IndexTupleSize(itup)
Definition: itup.h:71
#define ItemPointerCopy(fromPointer, toPointer)
Definition: itemptr.h:161

◆ _bt_freestack()

void _bt_freestack ( BTStack  stack)

Definition at line 175 of file nbtutils.c.

References BTStackData::bts_parent, and pfree().

Referenced by _bt_doinsert(), _bt_first(), and bt_rootdescend().

176 {
177  BTStack ostack;
178 
179  while (stack != NULL)
180  {
181  ostack = stack;
182  stack = stack->bts_parent;
183  pfree(ostack);
184  }
185 }
void pfree(void *pointer)
Definition: mcxt.c:1057
struct BTStackData * bts_parent
Definition: nbtree.h:604

◆ _bt_get_endpoint()

Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost,
Snapshot  snapshot 
)

Definition at line 2295 of file nbtsearch.c.

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), buf, BufferGetPage, BufferIsValid, elog, ereport, errcode(), errmsg_internal(), ERROR, InvalidBuffer, BTPageOpaqueData::level, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, RelationGetRelationName, and TestForOldSnapshot().

Referenced by _bt_endpoint(), and _bt_insert_parent().

2297 {
2298  Buffer buf;
2299  Page page;
2300  BTPageOpaque opaque;
2301  OffsetNumber offnum;
2302  BlockNumber blkno;
2303  IndexTuple itup;
2304 
2305  /*
2306  * If we are looking for a leaf page, okay to descend from fast root;
2307  * otherwise better descend from true root. (There is no point in being
2308  * smarter about intermediate levels.)
2309  */
2310  if (level == 0)
2311  buf = _bt_getroot(rel, BT_READ);
2312  else
2313  buf = _bt_gettrueroot(rel);
2314 
2315  if (!BufferIsValid(buf))
2316  return InvalidBuffer;
2317 
2318  page = BufferGetPage(buf);
2319  TestForOldSnapshot(snapshot, rel, page);
2320  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2321 
2322  for (;;)
2323  {
2324  /*
2325  * If we landed on a deleted page, step right to find a live page
2326  * (there must be one). Also, if we want the rightmost page, step
2327  * right if needed to get to it (this could happen if the page split
2328  * since we obtained a pointer to it).
2329  */
2330  while (P_IGNORE(opaque) ||
2331  (rightmost && !P_RIGHTMOST(opaque)))
2332  {
2333  blkno = opaque->btpo_next;
2334  if (blkno == P_NONE)
2335  elog(ERROR, "fell off the end of index \"%s\"",
2337  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2338  page = BufferGetPage(buf);
2339  TestForOldSnapshot(snapshot, rel, page);
2340  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2341  }
2342 
2343  /* Done? */
2344  if (opaque->btpo.level == level)
2345  break;
2346  if (opaque->btpo.level < level)
2347  ereport(ERROR,
2348  (errcode(ERRCODE_INDEX_CORRUPTED),
2349  errmsg_internal("btree level %u not found in index \"%s\"",
2350  level, RelationGetRelationName(rel))));
2351 
2352  /* Descend to leftmost or rightmost child page */
2353  if (rightmost)
2354  offnum = PageGetMaxOffsetNumber(page);
2355  else
2356  offnum = P_FIRSTDATAKEY(opaque);
2357 
2358  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2359  blkno = BTreeTupleGetDownLink(itup);
2360 
2361  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2362  page = BufferGetPage(buf);
2363  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2364  }
2365 
2366  return buf;
2367 }
BlockNumber btpo_next
Definition: nbtree.h:59
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:277
#define P_IGNORE(opaque)
Definition: nbtree.h:219
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:939
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:244
union BTPageOpaqueData::@45 btpo
#define P_NONE
Definition: nbtree.h:206
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:424
#define InvalidBuffer
Definition: buf.h:25
int errcode(int sqlerrcode)
Definition: elog.c:610
uint32 BlockNumber
Definition: block.h:31
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
uint16 OffsetNumber
Definition: off.h:24
#define BT_READ
Definition: nbtree.h:587
#define ERROR
Definition: elog.h:43
static char * buf
Definition: pg_test_fsync.c:68
IndexTupleData * IndexTuple
Definition: itup.h:53
#define RelationGetRelationName(relation)
Definition: rel.h:490
Buffer _bt_getroot(Relation rel, int access)
Definition: nbtpage.c:272
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
uint32 level
Definition: nbtree.h:62
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:508
#define ereport(elevel,...)
Definition: elog.h:144
int errmsg_internal(const char *fmt,...)
Definition: elog.c:911
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123
#define elog(elevel,...)
Definition: elog.h:214
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:213
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78

◆ _bt_getbuf()

Buffer _bt_getbuf ( Relation  rel,
BlockNumber  blkno,
int  access 
)

Definition at line 803 of file nbtpage.c.

References _bt_checkpage(), _bt_conditionallockbuf(), _bt_lockbuf(), _bt_log_reuse_page(), _bt_page_recyclable(), _bt_pageinit(), _bt_relbuf(), Assert, BT_WRITE, BTPageOpaqueData::btpo, buf, BufferGetPage, BufferGetPageSize, DEBUG2, elog, ExclusiveLock, GetFreeIndexPage(), InvalidBlockNumber, LockRelationForExtension(), P_NEW, PageGetSpecialPointer, PageIsNew, ReadBuffer(), RELATION_IS_LOCAL, RelationNeedsWAL, ReleaseBuffer(), UnlockRelationForExtension(), BTPageOpaqueData::xact, and XLogStandbyInfoActive.

Referenced by _bt_finish_split(), _bt_getroot(), _bt_getrootheight(), _bt_getstackbuf(), _bt_gettrueroot(), _bt_insertonpg(), _bt_killitems(), _bt_leftsib_splitflag(), _bt_metaversion(), _bt_moveright(), _bt_newroot(), _bt_pagedel(), _bt_readnextpage(), _bt_rightsib_halfdeadflag(), _bt_split(), _bt_unlink_halfdead_page(), _bt_update_meta_cleanup_info(), _bt_vacuum_needs_cleanup(), and _bt_walk_left().

804 {
805  Buffer buf;
806 
807  if (blkno != P_NEW)
808  {
809  /* Read an existing block of the relation */
810  buf = ReadBuffer(rel, blkno);
811  _bt_lockbuf(rel, buf, access);
812  _bt_checkpage(rel, buf);
813  }
814  else
815  {
816  bool needLock;
817  Page page;
818 
819  Assert(access == BT_WRITE);
820 
821  /*
822  * First see if the FSM knows of any free pages.
823  *
824  * We can't trust the FSM's report unreservedly; we have to check that
825  * the page is still free. (For example, an already-free page could
826  * have been re-used between the time the last VACUUM scanned it and
827  * the time the VACUUM made its FSM updates.)
828  *
829  * In fact, it's worse than that: we can't even assume that it's safe
830  * to take a lock on the reported page. If somebody else has a lock
831  * on it, or even worse our own caller does, we could deadlock. (The
832  * own-caller scenario is actually not improbable. Consider an index
833  * on a serial or timestamp column. Nearly all splits will be at the
834  * rightmost page, so it's entirely likely that _bt_split will call us
835  * while holding a lock on the page most recently acquired from FSM. A
836  * VACUUM running concurrently with the previous split could well have
837  * placed that page back in FSM.)
838  *
839  * To get around that, we ask for only a conditional lock on the
840  * reported page. If we fail, then someone else is using the page,
841  * and we may reasonably assume it's not free. (If we happen to be
842  * wrong, the worst consequence is the page will be lost to use till
843  * the next VACUUM, which is no big problem.)
844  */
845  for (;;)
846  {
847  blkno = GetFreeIndexPage(rel);
848  if (blkno == InvalidBlockNumber)
849  break;
850  buf = ReadBuffer(rel, blkno);
851  if (_bt_conditionallockbuf(rel, buf))
852  {
853  page = BufferGetPage(buf);
854  if (_bt_page_recyclable(page))
855  {
856  /*
857  * If we are generating WAL for Hot Standby then create a
858  * WAL record that will allow us to conflict with queries
859  * running on standby, in case they have snapshots older