PostgreSQL Source Code  git master
nbtree.h File Reference
#include "access/amapi.h"
#include "access/itup.h"
#include "access/sdir.h"
#include "access/tableam.h"
#include "access/xlogreader.h"
#include "catalog/pg_am_d.h"
#include "catalog/pg_index.h"
#include "lib/stringinfo.h"
#include "storage/bufmgr.h"
#include "storage/shm_toc.h"
Include dependency graph for nbtree.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures

struct  BTPageOpaqueData
 
struct  BTMetaPageData
 
struct  BTDeletedPageData
 
struct  BTPendingFSM
 
struct  BTVacState
 
struct  BTStackData
 
struct  BTScanInsertData
 
struct  BTInsertStateData
 
struct  BTDedupInterval
 
struct  BTDedupStateData
 
struct  BTVacuumPostingData
 
struct  BTScanPosItem
 
struct  BTScanPosData
 
struct  BTArrayKeyInfo
 
struct  BTScanOpaqueData
 
struct  BTOptions
 

Macros

#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */
 
#define BTP_ROOT   (1 << 1) /* root page (has no parent) */
 
#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */
 
#define BTP_META   (1 << 3) /* meta-page */
 
#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */
 
#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */
 
#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples (deprecated) */
 
#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */
 
#define BTP_HAS_FULLXID   (1 << 8) /* contains BTDeletedPageData */
 
#define MAX_BT_CYCLE_ID   0xFF7F
 
#define BTPageGetMeta(p)   ((BTMetaPageData *) PageGetContents(p))
 
#define BTREE_METAPAGE   0 /* first page is meta */
 
#define BTREE_MAGIC   0x053162 /* magic number in metapage */
 
#define BTREE_VERSION   4 /* current version number */
 
#define BTREE_MIN_VERSION   2 /* minimum supported version */
 
#define BTREE_NOVAC_VERSION   3 /* version with all meta fields set */
 
#define BTMaxItemSize(page)
 
#define BTMaxItemSizeNoHeapTid(page)
 
#define MaxTIDsPerBTreePage
 
#define BTREE_MIN_FILLFACTOR   10
 
#define BTREE_DEFAULT_FILLFACTOR   90
 
#define BTREE_NONLEAF_FILLFACTOR   70
 
#define BTREE_SINGLEVAL_FILLFACTOR   96
 
#define P_NONE   0
 
#define P_LEFTMOST(opaque)   ((opaque)->btpo_prev == P_NONE)
 
#define P_RIGHTMOST(opaque)   ((opaque)->btpo_next == P_NONE)
 
#define P_ISLEAF(opaque)   (((opaque)->btpo_flags & BTP_LEAF) != 0)
 
#define P_ISROOT(opaque)   (((opaque)->btpo_flags & BTP_ROOT) != 0)
 
#define P_ISDELETED(opaque)   (((opaque)->btpo_flags & BTP_DELETED) != 0)
 
#define P_ISMETA(opaque)   (((opaque)->btpo_flags & BTP_META) != 0)
 
#define P_ISHALFDEAD(opaque)   (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)
 
#define P_IGNORE(opaque)   (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
 
#define P_HAS_GARBAGE(opaque)   (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
 
#define P_INCOMPLETE_SPLIT(opaque)   (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
 
#define P_HAS_FULLXID(opaque)   (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
 
#define P_HIKEY   ((OffsetNumber) 1)
 
#define P_FIRSTKEY   ((OffsetNumber) 2)
 
#define P_FIRSTDATAKEY(opaque)   (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
 
#define INDEX_ALT_TID_MASK   INDEX_AM_RESERVED_BIT
 
#define BT_OFFSET_MASK   0x0FFF
 
#define BT_STATUS_OFFSET_MASK   0xF000
 
#define BT_PIVOT_HEAP_TID_ATTR   0x1000
 
#define BT_IS_POSTING   0x2000
 
#define BTreeTupleGetNAtts(itup, rel)
 
#define BTCommuteStrategyNumber(strat)   (BTMaxStrategyNumber + 1 - (strat))
 
#define BTORDER_PROC   1
 
#define BTSORTSUPPORT_PROC   2
 
#define BTINRANGE_PROC   3
 
#define BTEQUALIMAGE_PROC   4
 
#define BTOPTIONS_PROC   5
 
#define BTNProcs   5
 
#define BT_READ   BUFFER_LOCK_SHARE
 
#define BT_WRITE   BUFFER_LOCK_EXCLUSIVE
 
#define BTScanPosIsPinned(scanpos)
 
#define BTScanPosUnpin(scanpos)
 
#define BTScanPosUnpinIfPinned(scanpos)
 
#define BTScanPosIsValid(scanpos)
 
#define BTScanPosInvalidate(scanpos)
 
#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */
 
#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */
 
#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */
 
#define SK_BT_DESC   (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
 
#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
 
#define BTGetFillFactor(relation)
 
#define BTGetTargetPageFreeSpace(relation)   (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)
 
#define BTGetDeduplicateItems(relation)
 
#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4
 
#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5
 

Typedefs

typedef uint16 BTCycleId
 
typedef struct BTPageOpaqueData BTPageOpaqueData
 
typedef BTPageOpaqueData * BTPageOpaque
 
typedef struct BTMetaPageData BTMetaPageData
 
typedef struct BTDeletedPageData BTDeletedPageData
 
typedef struct BTPendingFSM BTPendingFSM
 
typedef struct BTVacState BTVacState
 
typedef struct BTStackData BTStackData
 
typedef BTStackData * BTStack
 
typedef struct BTScanInsertData BTScanInsertData
 
typedef BTScanInsertData * BTScanInsert
 
typedef struct BTInsertStateData BTInsertStateData
 
typedef BTInsertStateData * BTInsertState
 
typedef struct BTDedupInterval BTDedupInterval
 
typedef struct BTDedupStateData BTDedupStateData
 
typedef BTDedupStateData * BTDedupState
 
typedef struct BTVacuumPostingData BTVacuumPostingData
 
typedef BTVacuumPostingData * BTVacuumPosting
 
typedef struct BTScanPosItem BTScanPosItem
 
typedef struct BTScanPosData BTScanPosData
 
typedef BTScanPosData * BTScanPos
 
typedef struct BTArrayKeyInfo BTArrayKeyInfo
 
typedef struct BTScanOpaqueData BTScanOpaqueData
 
typedef BTScanOpaqueData * BTScanOpaque
 
typedef struct BTOptions BTOptions
 

Functions

static void BTPageSetDeleted (Page page, FullTransactionId safexid)
 
static FullTransactionId BTPageGetDeleteXid (Page page)
 
static bool BTPageIsRecyclable (Page page)
 
static bool BTreeTupleIsPivot (IndexTuple itup)
 
static bool BTreeTupleIsPosting (IndexTuple itup)
 
static void BTreeTupleSetPosting (IndexTuple itup, uint16 nhtids, int postingoffset)
 
static uint16 BTreeTupleGetNPosting (IndexTuple posting)
 
static uint32 BTreeTupleGetPostingOffset (IndexTuple posting)
 
static ItemPointer BTreeTupleGetPosting (IndexTuple posting)
 
static ItemPointer BTreeTupleGetPostingN (IndexTuple posting, int n)
 
static BlockNumber BTreeTupleGetDownLink (IndexTuple pivot)
 
static void BTreeTupleSetDownLink (IndexTuple pivot, BlockNumber blkno)
 
static void BTreeTupleSetNAtts (IndexTuple itup, uint16 nkeyatts, bool heaptid)
 
static BlockNumber BTreeTupleGetTopParent (IndexTuple leafhikey)
 
static void BTreeTupleSetTopParent (IndexTuple leafhikey, BlockNumber blkno)
 
static ItemPointer BTreeTupleGetHeapTID (IndexTuple itup)
 
static ItemPointer BTreeTupleGetMaxHeapTID (IndexTuple itup)
 
void btbuildempty (Relation index)
 
bool btinsert (Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo)
 
IndexScanDesc btbeginscan (Relation rel, int nkeys, int norderbys)
 
Size btestimateparallelscan (void)
 
void btinitparallelscan (void *target)
 
bool btgettuple (IndexScanDesc scan, ScanDirection dir)
 
int64 btgetbitmap (IndexScanDesc scan, TIDBitmap *tbm)
 
void btrescan (IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
 
void btparallelrescan (IndexScanDesc scan)
 
void btendscan (IndexScanDesc scan)
 
void btmarkpos (IndexScanDesc scan)
 
void btrestrpos (IndexScanDesc scan)
 
IndexBulkDeleteResult * btbulkdelete (IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
 
IndexBulkDeleteResult * btvacuumcleanup (IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
bool btcanreturn (Relation index, int attno)
 
bool _bt_parallel_seize (IndexScanDesc scan, BlockNumber *pageno)
 
void _bt_parallel_release (IndexScanDesc scan, BlockNumber scan_page)
 
void _bt_parallel_done (IndexScanDesc scan)
 
void _bt_parallel_advance_array_keys (IndexScanDesc scan)
 
void _bt_dedup_pass (Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem, Size newitemsz, bool bottomupdedup)
 
bool _bt_bottomupdel_pass (Relation rel, Buffer buf, Relation heapRel, Size newitemsz)
 
void _bt_dedup_start_pending (BTDedupState state, IndexTuple base, OffsetNumber baseoff)
 
bool _bt_dedup_save_htid (BTDedupState state, IndexTuple itup)
 
Size _bt_dedup_finish_pending (Page newpage, BTDedupState state)
 
IndexTuple _bt_form_posting (IndexTuple base, ItemPointer htids, int nhtids)
 
void _bt_update_posting (BTVacuumPosting vacposting)
 
IndexTuple _bt_swap_posting (IndexTuple newitem, IndexTuple oposting, int postingoff)
 
bool _bt_doinsert (Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
 
void _bt_finish_split (Relation rel, Buffer lbuf, BTStack stack)
 
Buffer _bt_getstackbuf (Relation rel, BTStack stack, BlockNumber child)
 
OffsetNumber _bt_findsplitloc (Relation rel, Page origpage, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
 
void _bt_initmetapage (Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
 
bool _bt_vacuum_needs_cleanup (Relation rel)
 
void _bt_set_cleanup_info (Relation rel, BlockNumber num_delpages)
 
void _bt_upgrademetapage (Page page)
 
Buffer _bt_getroot (Relation rel, int access)
 
Buffer _bt_gettrueroot (Relation rel)
 
int _bt_getrootheight (Relation rel)
 
void _bt_metaversion (Relation rel, bool *heapkeyspace, bool *allequalimage)
 
void _bt_checkpage (Relation rel, Buffer buf)
 
Buffer _bt_getbuf (Relation rel, BlockNumber blkno, int access)
 
Buffer _bt_relandgetbuf (Relation rel, Buffer obuf, BlockNumber blkno, int access)
 
void _bt_relbuf (Relation rel, Buffer buf)
 
void _bt_lockbuf (Relation rel, Buffer buf, int access)
 
void _bt_unlockbuf (Relation rel, Buffer buf)
 
bool _bt_conditionallockbuf (Relation rel, Buffer buf)
 
void _bt_upgradelockbufcleanup (Relation rel, Buffer buf)
 
void _bt_pageinit (Page page, Size size)
 
void _bt_delitems_vacuum (Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
 
void _bt_delitems_delete_check (Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
 
void _bt_pagedel (Relation rel, Buffer leafbuf, BTVacState *vstate)
 
void _bt_pendingfsm_init (Relation rel, BTVacState *vstate, bool cleanuponly)
 
void _bt_pendingfsm_finalize (Relation rel, BTVacState *vstate)
 
BTStack _bt_search (Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
 
Buffer _bt_moveright (Relation rel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access, Snapshot snapshot)
 
OffsetNumber _bt_binsrch_insert (Relation rel, BTInsertState insertstate)
 
int32 _bt_compare (Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost, Snapshot snapshot)
 
BTScanInsert _bt_mkscankey (Relation rel, IndexTuple itup)
 
void _bt_freestack (BTStack stack)
 
void _bt_preprocess_array_keys (IndexScanDesc scan)
 
void _bt_start_array_keys (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_advance_array_keys (IndexScanDesc scan, ScanDirection dir)
 
void _bt_mark_array_keys (IndexScanDesc scan)
 
void _bt_restore_array_keys (IndexScanDesc scan)
 
void _bt_preprocess_keys (IndexScanDesc scan)
 
bool _bt_checkkeys (IndexScanDesc scan, IndexTuple tuple, int tupnatts, ScanDirection dir, bool *continuescan)
 
void _bt_killitems (IndexScanDesc scan)
 
BTCycleId _bt_vacuum_cycleid (Relation rel)
 
BTCycleId _bt_start_vacuum (Relation rel)
 
void _bt_end_vacuum (Relation rel)
 
void _bt_end_vacuum_callback (int code, Datum arg)
 
Size BTreeShmemSize (void)
 
void BTreeShmemInit (void)
 
bytea * btoptions (Datum reloptions, bool validate)
 
bool btproperty (Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull)
 
char * btbuildphasename (int64 phasenum)
 
IndexTuple _bt_truncate (Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
 
int _bt_keep_natts_fast (Relation rel, IndexTuple lastleft, IndexTuple firstright)
 
bool _bt_check_natts (Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
 
void _bt_check_third_page (Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
 
bool _bt_allequalimage (Relation rel, bool debugmessage)
 
bool btvalidate (Oid opclassoid)
 
void btadjustmembers (Oid opfamilyoid, Oid opclassoid, List *operators, List *functions)
 
IndexBuildResult * btbuild (Relation heap, Relation index, struct IndexInfo *indexInfo)
 
void _bt_parallel_build_main (dsm_segment *seg, shm_toc *toc)
 

Macro Definition Documentation

◆ BT_IS_POSTING

#define BT_IS_POSTING   0x2000

Definition at line 466 of file nbtree.h.

Referenced by BTreeTupleIsPivot(), BTreeTupleIsPosting(), and BTreeTupleSetPosting().

◆ BT_OFFSET_MASK

#define BT_OFFSET_MASK   0x0FFF

Definition at line 462 of file nbtree.h.

Referenced by _bt_check_natts(), and BTreeTupleGetNPosting().

◆ BT_PIVOT_HEAP_TID_ATTR

#define BT_PIVOT_HEAP_TID_ATTR   0x1000

Definition at line 465 of file nbtree.h.

Referenced by _bt_check_natts(), BTreeTupleGetHeapTID(), and BTreeTupleSetNAtts().

◆ BT_READ

◆ BT_STATUS_OFFSET_MASK

#define BT_STATUS_OFFSET_MASK   0xF000

Definition at line 463 of file nbtree.h.

Referenced by BTreeTupleSetNAtts(), and BTreeTupleSetPosting().

◆ BT_WRITE

◆ BTCommuteStrategyNumber

#define BTCommuteStrategyNumber (   strat)    (BTMaxStrategyNumber + 1 - (strat))

Definition at line 678 of file nbtree.h.

Referenced by _bt_compare_scankey_args(), and _bt_fix_scankey_strategy().

◆ BTEQUALIMAGE_PROC

#define BTEQUALIMAGE_PROC   4

Definition at line 703 of file nbtree.h.

Referenced by _bt_allequalimage(), assignProcTypes(), and btvalidate().

◆ BTGetDeduplicateItems

#define BTGetDeduplicateItems (   relation)
Value:
(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
relation->rd_rel->relam == BTREE_AM_OID), \
((relation)->rd_options ? \
((BTOptions *) (relation)->rd_options)->deduplicate_items : true))
#define AssertMacro(condition)
Definition: c.h:805

Definition at line 1101 of file nbtree.h.

Referenced by _bt_delete_or_dedup_one_page(), and _bt_load().

◆ BTGetFillFactor

#define BTGetFillFactor (   relation)
Value:
(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
relation->rd_rel->relam == BTREE_AM_OID), \
(relation)->rd_options ? \
((BTOptions *) (relation)->rd_options)->fillfactor : \
BTREE_DEFAULT_FILLFACTOR)
#define AssertMacro(condition)
Definition: c.h:805

Definition at line 1093 of file nbtree.h.

Referenced by _bt_findsplitloc().

◆ BTGetTargetPageFreeSpace

#define BTGetTargetPageFreeSpace (   relation)    (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)

Definition at line 1099 of file nbtree.h.

Referenced by _bt_pagestate().

◆ BTINRANGE_PROC

#define BTINRANGE_PROC   3

Definition at line 702 of file nbtree.h.

Referenced by assignProcTypes(), btvalidate(), and transformFrameOffset().

◆ BTMaxItemSize

#define BTMaxItemSize (   page)
Value:
3*sizeof(ItemIdData) + \
3*sizeof(ItemPointerData)) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
#define SizeOfPageHeaderData
Definition: bufpage.h:216
#define PageGetPageSize(page)
Definition: bufpage.h:268
#define MAXALIGN(LEN)
Definition: c.h:757
#define MAXALIGN_DOWN(LEN)
Definition: c.h:769

Definition at line 162 of file nbtree.h.

Referenced by _bt_buildadd(), _bt_check_third_page(), _bt_dedup_pass(), _bt_findinsertloc(), _bt_load(), bt_target_page_check(), and btree_xlog_dedup().

◆ BTMaxItemSizeNoHeapTid

#define BTMaxItemSizeNoHeapTid (   page)
Value:
MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
#define SizeOfPageHeaderData
Definition: bufpage.h:216
#define PageGetPageSize(page)
Definition: bufpage.h:268
#define MAXALIGN(LEN)
Definition: c.h:757
#define MAXALIGN_DOWN(LEN)
Definition: c.h:769

Definition at line 168 of file nbtree.h.

Referenced by _bt_check_third_page(), and bt_target_page_check().

◆ BTNProcs

#define BTNProcs   5

Definition at line 705 of file nbtree.h.

Referenced by bthandler().

◆ BTOPTIONS_PROC

#define BTOPTIONS_PROC   5

Definition at line 704 of file nbtree.h.

Referenced by bthandler(), and btvalidate().

◆ BTORDER_PROC

◆ BTP_DELETED

#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */

Definition at line 76 of file nbtree.h.

Referenced by BTPageSetDeleted().

◆ BTP_HALF_DEAD

#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */

◆ BTP_HAS_FULLXID

#define BTP_HAS_FULLXID   (1 << 8) /* contains BTDeletedPageData */

Definition at line 82 of file nbtree.h.

Referenced by BTPageSetDeleted().

◆ BTP_HAS_GARBAGE

#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples (deprecated) */

◆ BTP_INCOMPLETE_SPLIT

#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */

◆ BTP_LEAF

#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */

◆ BTP_META

#define BTP_META   (1 << 3) /* meta-page */

Definition at line 77 of file nbtree.h.

Referenced by _bt_initmetapage(), _bt_restore_meta(), and _bt_upgrademetapage().

◆ BTP_ROOT

#define BTP_ROOT   (1 << 1) /* root page (has no parent) */

Definition at line 75 of file nbtree.h.

Referenced by _bt_getroot(), _bt_newroot(), _bt_split(), _bt_uppershutdown(), and btree_xlog_newroot().

◆ BTP_SPLIT_END

#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */

Definition at line 79 of file nbtree.h.

Referenced by _bt_split(), btree_mask(), and btvacuumpage().

◆ BTPageGetMeta

◆ BTREE_DEFAULT_FILLFACTOR

#define BTREE_DEFAULT_FILLFACTOR   90

Definition at line 199 of file nbtree.h.

◆ BTREE_MAGIC

#define BTREE_MAGIC   0x053162 /* magic number in metapage */

◆ BTREE_METAPAGE

◆ BTREE_MIN_FILLFACTOR

#define BTREE_MIN_FILLFACTOR   10

Definition at line 198 of file nbtree.h.

◆ BTREE_MIN_VERSION

#define BTREE_MIN_VERSION   2 /* minimum supported version */

◆ BTREE_NONLEAF_FILLFACTOR

#define BTREE_NONLEAF_FILLFACTOR   70

Definition at line 200 of file nbtree.h.

Referenced by _bt_findsplitloc(), and _bt_pagestate().

◆ BTREE_NOVAC_VERSION

◆ BTREE_SINGLEVAL_FILLFACTOR

#define BTREE_SINGLEVAL_FILLFACTOR   96

Definition at line 201 of file nbtree.h.

Referenced by _bt_findsplitloc(), and _bt_singleval_fillfactor().

◆ BTREE_VERSION

#define BTREE_VERSION   4 /* current version number */

◆ BTreeTupleGetNAtts

#define BTreeTupleGetNAtts (   itup,
  rel 
)
Value:
( \
(BTreeTupleIsPivot(itup)) ? \
( \
) \
: \
)
#define ItemPointerGetOffsetNumberNoCheck(pointer)
Definition: itemptr.h:108
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:473
#define BT_OFFSET_MASK
Definition: nbtree.h:462
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:489

Definition at line 570 of file nbtree.h.

Referenced by _bt_buildadd(), _bt_check_natts(), _bt_checkkeys(), _bt_compare(), _bt_insertonpg(), _bt_mkscankey(), _bt_newroot(), _bt_readpage(), _bt_split(), _bt_uppershutdown(), and bt_target_page_check().

◆ BTScanPosInvalidate

#define BTScanPosInvalidate (   scanpos)
Value:
do { \
(scanpos).currPage = InvalidBlockNumber; \
(scanpos).nextPage = InvalidBlockNumber; \
(scanpos).buf = InvalidBuffer; \
(scanpos).lsn = InvalidXLogRecPtr; \
(scanpos).nextTupleOffset = 0; \
} while (0)
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
#define InvalidBuffer
Definition: buf.h:25
static char * buf
Definition: pg_test_fsync.c:68
#define InvalidBlockNumber
Definition: block.h:33

Definition at line 1011 of file nbtree.h.

Referenced by _bt_endpoint(), _bt_first(), _bt_readnextpage(), _bt_steppage(), btbeginscan(), btmarkpos(), btrescan(), and btrestrpos().

◆ BTScanPosIsPinned

#define BTScanPosIsPinned (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BufferIsValid((scanpos).buf) \
)
static char * buf
Definition: pg_test_fsync.c:68
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123

Definition at line 988 of file nbtree.h.

Referenced by _bt_killitems(), _bt_readnextpage(), _bt_readpage(), _bt_steppage(), and btrestrpos().

◆ BTScanPosIsValid

#define BTScanPosIsValid (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BlockNumberIsValid((scanpos).currPage) \
)
static char * buf
Definition: pg_test_fsync.c:68
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123

Definition at line 1005 of file nbtree.h.

Referenced by _bt_first(), _bt_killitems(), _bt_steppage(), btendscan(), btgettuple(), btmarkpos(), btrescan(), and btrestrpos().

◆ BTScanPosUnpin

#define BTScanPosUnpin (   scanpos)
Value:
do { \
ReleaseBuffer((scanpos).buf); \
(scanpos).buf = InvalidBuffer; \
} while (0)
#define InvalidBuffer
Definition: buf.h:25
static char * buf
Definition: pg_test_fsync.c:68

Definition at line 994 of file nbtree.h.

◆ BTScanPosUnpinIfPinned

#define BTScanPosUnpinIfPinned (   scanpos)
Value:
do { \
if (BTScanPosIsPinned(scanpos)) \
BTScanPosUnpin(scanpos); \
} while (0)
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:988

Definition at line 999 of file nbtree.h.

Referenced by _bt_readnextpage(), _bt_steppage(), btendscan(), btmarkpos(), btrescan(), and btrestrpos().

◆ BTSORTSUPPORT_PROC

#define BTSORTSUPPORT_PROC   2

Definition at line 701 of file nbtree.h.

Referenced by assignProcTypes(), btvalidate(), FinishSortSupportFunction(), and MJExamineQuals().

◆ INDEX_ALT_TID_MASK

◆ MAX_BT_CYCLE_ID

#define MAX_BT_CYCLE_ID   0xFF7F

Definition at line 91 of file nbtree.h.

Referenced by _bt_start_vacuum().

◆ MaxTIDsPerBTreePage

#define MaxTIDsPerBTreePage
Value:
(int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \
sizeof(ItemPointerData))
#define SizeOfPageHeaderData
Definition: bufpage.h:216

Definition at line 184 of file nbtree.h.

Referenced by _bt_bottomupdel_pass(), _bt_readpage(), _bt_simpledel_pass(), bt_check_every_level(), and btgettuple().

◆ P_FIRSTDATAKEY

◆ P_FIRSTKEY

#define P_FIRSTKEY   ((OffsetNumber) 2)

Definition at line 368 of file nbtree.h.

Referenced by _bt_afternewitemoff(), _bt_buildadd(), _bt_newroot(), and _bt_slideleft().

◆ P_HAS_FULLXID

#define P_HAS_FULLXID (   opaque)    (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)

◆ P_HAS_GARBAGE

#define P_HAS_GARBAGE (   opaque)    (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)

Definition at line 225 of file nbtree.h.

Referenced by _bt_dedup_pass(), _bt_findinsertloc(), btree_xlog_dedup(), and palloc_btree_page().

◆ P_HIKEY

◆ P_IGNORE

◆ P_INCOMPLETE_SPLIT

◆ P_ISDELETED

◆ P_ISHALFDEAD

#define P_ISHALFDEAD (   opaque)    (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)

◆ P_ISLEAF

◆ P_ISMETA

#define P_ISMETA (   opaque)    (((opaque)->btpo_flags & BTP_META) != 0)

Definition at line 222 of file nbtree.h.

Referenced by _bt_getmeta(), _bt_gettrueroot(), bt_page_items_bytea(), and palloc_btree_page().

◆ P_ISROOT

#define P_ISROOT (   opaque)    (((opaque)->btpo_flags & BTP_ROOT) != 0)

◆ P_LEFTMOST

#define P_LEFTMOST (   opaque)    ((opaque)->btpo_prev == P_NONE)

◆ P_NONE

◆ P_RIGHTMOST

◆ PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN

#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2

Definition at line 1112 of file nbtree.h.

Referenced by _bt_spools_heapscan(), and btbuildphasename().

◆ PROGRESS_BTREE_PHASE_LEAF_LOAD

#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5

Definition at line 1115 of file nbtree.h.

Referenced by _bt_leafbuild(), and btbuildphasename().

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_1

#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3

Definition at line 1113 of file nbtree.h.

Referenced by _bt_leafbuild(), _bt_parallel_scan_and_sort(), and btbuildphasename().

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_2

#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4

Definition at line 1114 of file nbtree.h.

Referenced by _bt_leafbuild(), _bt_parallel_scan_and_sort(), and btbuildphasename().

◆ SK_BT_DESC

◆ SK_BT_INDOPTION_SHIFT

#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */

Definition at line 1081 of file nbtree.h.

Referenced by _bt_fix_scankey_strategy(), and _bt_mkscankey().

◆ SK_BT_NULLS_FIRST

◆ SK_BT_REQBKWD

#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */

Definition at line 1080 of file nbtree.h.

Referenced by _bt_check_rowcompare(), _bt_checkkeys(), and _bt_mark_scankey_required().

◆ SK_BT_REQFWD

#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */

Definition at line 1079 of file nbtree.h.

Referenced by _bt_check_rowcompare(), _bt_checkkeys(), and _bt_mark_scankey_required().

Typedef Documentation

◆ BTArrayKeyInfo

◆ BTCycleId

typedef uint16 BTCycleId

Definition at line 29 of file nbtree.h.

◆ BTDedupInterval

◆ BTDedupState

Definition at line 891 of file nbtree.h.

◆ BTDedupStateData

◆ BTDeletedPageData

◆ BTInsertState

Definition at line 833 of file nbtree.h.

◆ BTInsertStateData

◆ BTMetaPageData

◆ BTOptions

typedef struct BTOptions BTOptions

◆ BTPageOpaque

Definition at line 71 of file nbtree.h.

◆ BTPageOpaqueData

◆ BTPendingFSM

typedef struct BTPendingFSM BTPendingFSM

◆ BTScanInsert

Definition at line 794 of file nbtree.h.

◆ BTScanInsertData

◆ BTScanOpaque

Definition at line 1072 of file nbtree.h.

◆ BTScanOpaqueData

◆ BTScanPos

Definition at line 986 of file nbtree.h.

◆ BTScanPosData

typedef struct BTScanPosData BTScanPosData

◆ BTScanPosItem

typedef struct BTScanPosItem BTScanPosItem

◆ BTStack

typedef BTStackData* BTStack

Definition at line 732 of file nbtree.h.

◆ BTStackData

typedef struct BTStackData BTStackData

◆ BTVacState

typedef struct BTVacState BTVacState

◆ BTVacuumPosting

Definition at line 912 of file nbtree.h.

◆ BTVacuumPostingData

Function Documentation

◆ _bt_advance_array_keys()

bool _bt_advance_array_keys ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 544 of file nbtutils.c.

References _bt_parallel_advance_array_keys(), BTScanOpaqueData::arrayKeyData, BTScanOpaqueData::arrayKeys, BTArrayKeyInfo::cur_elem, BTArrayKeyInfo::elem_values, i, BTArrayKeyInfo::num_elems, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, BTArrayKeyInfo::scan_key, ScanDirectionIsBackward, and ScanKeyData::sk_argument.

Referenced by btgetbitmap(), and btgettuple().

545 {
546  BTScanOpaque so = (BTScanOpaque) scan->opaque;
547  bool found = false;
548  int i;
549 
550  /*
551  * We must advance the last array key most quickly, since it will
552  * correspond to the lowest-order index column among the available
553  * qualifications. This is necessary to ensure correct ordering of output
554  * when there are multiple array keys.
555  */
556  for (i = so->numArrayKeys - 1; i >= 0; i--)
557  {
558  BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
559  ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key];
560  int cur_elem = curArrayKey->cur_elem;
561  int num_elems = curArrayKey->num_elems;
562 
563  if (ScanDirectionIsBackward(dir))
564  {
565  if (--cur_elem < 0)
566  {
567  cur_elem = num_elems - 1;
568  found = false; /* need to advance next array key */
569  }
570  else
571  found = true;
572  }
573  else
574  {
575  if (++cur_elem >= num_elems)
576  {
577  cur_elem = 0;
578  found = false; /* need to advance next array key */
579  }
580  else
581  found = true;
582  }
583 
584  curArrayKey->cur_elem = cur_elem;
585  skey->sk_argument = curArrayKey->elem_values[cur_elem];
586  if (found)
587  break;
588  }
589 
590  /* advance parallel scan */
591  if (scan->parallel_scan != NULL)
593 
594  return found;
595 }
Datum * elem_values
Definition: nbtree.h:1027
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:1072
ScanKey arrayKeyData
Definition: nbtree.h:1038
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:166
void _bt_parallel_advance_array_keys(IndexScanDesc scan)
Definition: nbtree.c:756
BTArrayKeyInfo * arrayKeys
Definition: nbtree.h:1043
int i
Datum sk_argument
Definition: skey.h:72

◆ _bt_allequalimage()

bool _bt_allequalimage ( Relation  rel,
bool  debugmessage 
)

Definition at line 2692 of file nbtutils.c.

References BTEQUALIMAGE_PROC, BTSortArrayContext::collation, DatumGetBool, DEBUG1, elog, get_opfamily_proc(), i, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, ObjectIdGetDatum, OidFunctionCall1Coll(), OidIsValid, RelationData::rd_indcollation, RelationData::rd_opcintype, RelationData::rd_opfamily, and RelationGetRelationName.

Referenced by _bt_leafbuild(), bt_index_check_internal(), and btbuildempty().

2693 {
2694  bool allequalimage = true;
2695 
2696  /* INCLUDE indexes can never support deduplication */
2699  return false;
2700 
2701  for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++)
2702  {
2703  Oid opfamily = rel->rd_opfamily[i];
2704  Oid opcintype = rel->rd_opcintype[i];
2705  Oid collation = rel->rd_indcollation[i];
2706  Oid equalimageproc;
2707 
2708  equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype,
2710 
2711  /*
2712  * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to
2713  * be unsafe. Otherwise, actually call proc and see what it says.
2714  */
2715  if (!OidIsValid(equalimageproc) ||
2716  !DatumGetBool(OidFunctionCall1Coll(equalimageproc, collation,
2717  ObjectIdGetDatum(opcintype))))
2718  {
2719  allequalimage = false;
2720  break;
2721  }
2722  }
2723 
2724  if (debugmessage)
2725  {
2726  if (allequalimage)
2727  elog(DEBUG1, "index \"%s\" can safely use deduplication",
2729  else
2730  elog(DEBUG1, "index \"%s\" cannot use deduplication",
2732  }
2733 
2734  return allequalimage;
2735 }
#define DEBUG1
Definition: elog.h:25
#define BTEQUALIMAGE_PROC
Definition: nbtree.h:703
unsigned int Oid
Definition: postgres_ext.h:31
#define OidIsValid(objectId)
Definition: c.h:710
Oid * rd_indcollation
Definition: rel.h:212
#define ObjectIdGetDatum(X)
Definition: postgres.h:551
#define DatumGetBool(X)
Definition: postgres.h:437
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:489
#define RelationGetRelationName(relation)
Definition: rel.h:511
Oid * rd_opfamily
Definition: rel.h:202
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:496
Datum OidFunctionCall1Coll(Oid functionId, Oid collation, Datum arg1)
Definition: fmgr.c:1410
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:794
#define elog(elevel,...)
Definition: elog.h:232
int i
Oid * rd_opcintype
Definition: rel.h:203

◆ _bt_binsrch_insert()

OffsetNumber _bt_binsrch_insert ( Relation  rel,
BTInsertState  insertstate 
)

Definition at line 447 of file nbtsearch.c.

References _bt_binsrch_posting(), _bt_compare(), Assert, BTInsertStateData::bounds_valid, BTInsertStateData::buf, BufferGetPage, InvalidOffsetNumber, BTInsertStateData::itup_key, sort-test::key, BTInsertStateData::low, BTScanInsertData::nextkey, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber, PageGetSpecialPointer, BTInsertStateData::postingoff, BTScanInsertData::scantid, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_check_unique(), _bt_findinsertloc(), and bt_rootdescend().

/*
 * _bt_binsrch_insert -- binary search a leaf page for an insertion location.
 *
 * Returns the offset of the first key on the page >= the insertion scan key
 * (which may be one past the last slot).  Caches the resulting [low,
 * stricthigh) bounds in *insertstate so a repeat call against the same page
 * can resume the search (bounds_valid).  As a side effect, sets
 * insertstate->postingoff when the located tuple is a posting list whose TID
 * range overlaps the caller's scantid, telling the caller it must split the
 * posting list.
 *
 * Leaf-page-only, !nextkey-only: both are asserted below.
 *
 * NOTE(review): this text is extracted from generated documentation; the
 * embedded "448 "-style prefixes are Doxygen source line numbers, not code.
 */
448 {
449  BTScanInsert key = insertstate->itup_key;
450  Page page;
451  BTPageOpaque opaque;
452  OffsetNumber low,
453  high,
454  stricthigh;
455  int32 result,
456  cmpval;
457 
458  page = BufferGetPage(insertstate->buf);
459  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
460 
     /* Caller's contract: leaf page, forward (!nextkey) search, no posting
      * split decided yet */
461  Assert(P_ISLEAF(opaque));
462  Assert(!key->nextkey);
463  Assert(insertstate->postingoff == 0);
464 
465  if (!insertstate->bounds_valid)
466  {
467  /* Start new binary search */
468  low = P_FIRSTDATAKEY(opaque);
469  high = PageGetMaxOffsetNumber(page);
470  }
471  else
472  {
473  /* Restore result of previous binary search against same page */
474  low = insertstate->low;
475  high = insertstate->stricthigh;
476  }
477 
478  /* If there are no keys on the page, return the first available slot */
479  if (unlikely(high < low))
480  {
481  /* Caller can't reuse bounds */
482  insertstate->low = InvalidOffsetNumber;
483  insertstate->stricthigh = InvalidOffsetNumber;
484  insertstate->bounds_valid = false;
485  return low;
486  }
487 
488  /*
489  * Binary search to find the first key on the page >= scan key. (nextkey
490  * is always false when inserting).
491  *
492  * The loop invariant is: all slots before 'low' are < scan key, all slots
493  * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
494  * maintained to save additional search effort for caller.
495  *
496  * We can fall out when high == low.
497  */
498  if (!insertstate->bounds_valid)
499  high++; /* establish the loop invariant for high */
500  stricthigh = high; /* high initially strictly higher */
501 
502  cmpval = 1; /* !nextkey comparison value */
503 
504  while (high > low)
505  {
     /* Overflow-safe midpoint: low + (high-low)/2, not (low+high)/2 */
506  OffsetNumber mid = low + ((high - low) / 2);
507 
508  /* We have low <= mid < high, so mid points at a real slot */
509 
510  result = _bt_compare(rel, key, page, mid);
511 
512  if (result >= cmpval)
513  low = mid + 1;
514  else
515  {
516  high = mid;
517  if (result != 0)
518  stricthigh = high;
519  }
520 
521  /*
522  * If tuple at offset located by binary search is a posting list whose
523  * TID range overlaps with caller's scantid, perform posting list
524  * binary search to set postingoff for caller. Caller must split the
525  * posting list when postingoff is set. This should happen
526  * infrequently.
527  */
528  if (unlikely(result == 0 && key->scantid != NULL))
529  insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
530  }
531 
532  /*
533  * On a leaf page, a binary search always returns the first key >= scan
534  * key (at least in !nextkey case), which could be the last slot + 1. This
535  * is also the lower bound of cached search.
536  *
537  * stricthigh may also be the last slot + 1, which prevents caller from
538  * using bounds directly, but is still useful to us if we're called a
539  * second time with cached bounds (cached low will be < stricthigh when
540  * that happens).
541  */
542  insertstate->low = low;
543  insertstate->stricthigh = stricthigh;
544  insertstate->bounds_valid = true;
545 
546  return low;
547 }
bool bounds_valid
Definition: nbtree.h:821
ItemPointer scantid
Definition: nbtree.h:789
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:369
OffsetNumber stricthigh
Definition: nbtree.h:823
OffsetNumber low
Definition: nbtree.h:822
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:71
signed int int32
Definition: c.h:429
uint16 OffsetNumber
Definition: off.h:24
static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:558
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:644
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define InvalidOffsetNumber
Definition: off.h:26
BTScanInsert itup_key
Definition: nbtree.h:811
#define Assert(condition)
Definition: c.h:804
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define unlikely(x)
Definition: c.h:273
Pointer Page
Definition: bufpage.h:78
#define P_ISLEAF(opaque)
Definition: nbtree.h:219

◆ _bt_bottomupdel_pass()

bool _bt_bottomupdel_pass ( Relation  rel,
Buffer  buf,
Relation  heapRel,
Size  newitemsz 
)

Definition at line 305 of file nbtdedup.c.

References _bt_bottomupdel_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_delitems_delete_check(), _bt_keep_natts_fast(), Assert, BTDedupStateData::base, BTDedupStateData::baseoff, BTDedupStateData::basetupsize, TM_IndexDeleteOp::bottomup, TM_IndexDeleteOp::bottomupfreespace, BufferGetPage, BTDedupStateData::deduplicate, TM_IndexDeleteOp::deltids, BTDedupStateData::htids, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, ItemIdIsDead, Max, BTDedupStateData::maxpostingsize, MaxTIDsPerBTreePage, TM_IndexDeleteOp::ndeltids, BTDedupStateData::nhtids, BTDedupStateData::nintervals, BTDedupStateData::nitems, BTDedupStateData::nmaxitems, OffsetNumberNext, P_FIRSTDATAKEY, PageGetExactFreeSpace(), PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, palloc(), pfree(), BTDedupStateData::phystupsize, and TM_IndexDeleteOp::status.

Referenced by _bt_delete_or_dedup_one_page().

307 {
308  OffsetNumber offnum,
309  minoff,
310  maxoff;
311  Page page = BufferGetPage(buf);
314  TM_IndexDeleteOp delstate;
315  bool neverdedup;
316  int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
317 
318  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
319  newitemsz += sizeof(ItemIdData);
320 
321  /* Initialize deduplication state */
322  state = (BTDedupState) palloc(sizeof(BTDedupStateData));
323  state->deduplicate = true;
324  state->nmaxitems = 0;
325  state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
326  state->base = NULL;
327  state->baseoff = InvalidOffsetNumber;
328  state->basetupsize = 0;
329  state->htids = palloc(state->maxpostingsize);
330  state->nhtids = 0;
331  state->nitems = 0;
332  state->phystupsize = 0;
333  state->nintervals = 0;
334 
335  /*
336  * Initialize tableam state that describes bottom-up index deletion
337  * operation.
338  *
339  * We'll go on to ask the tableam to search for TIDs whose index tuples we
340  * can safely delete. The tableam will search until our leaf page space
341  * target is satisfied, or until the cost of continuing with the tableam
342  * operation seems too high. It focuses its efforts on TIDs associated
343  * with duplicate index tuples that we mark "promising".
344  *
345  * This space target is a little arbitrary. The tableam must be able to
346  * keep the costs and benefits in balance. We provide the tableam with
347  * exhaustive information about what might work, without directly
348  * concerning ourselves with avoiding work during the tableam call. Our
349  * role in costing the bottom-up deletion process is strictly advisory.
350  */
351  delstate.bottomup = true;
352  delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
353  delstate.ndeltids = 0;
354  delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
355  delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
356 
357  minoff = P_FIRSTDATAKEY(opaque);
358  maxoff = PageGetMaxOffsetNumber(page);
359  for (offnum = minoff;
360  offnum <= maxoff;
361  offnum = OffsetNumberNext(offnum))
362  {
363  ItemId itemid = PageGetItemId(page, offnum);
364  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
365 
366  Assert(!ItemIdIsDead(itemid));
367 
368  if (offnum == minoff)
369  {
370  /* itup starts first pending interval */
371  _bt_dedup_start_pending(state, itup, offnum);
372  }
373  else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
374  _bt_dedup_save_htid(state, itup))
375  {
376  /* Tuple is equal; just added its TIDs to pending interval */
377  }
378  else
379  {
380  /* Finalize interval -- move its TIDs to delete state */
381  _bt_bottomupdel_finish_pending(page, state, &delstate);
382 
383  /* itup starts new pending interval */
384  _bt_dedup_start_pending(state, itup, offnum);
385  }
386  }
387  /* Finalize final interval -- move its TIDs to delete state */
388  _bt_bottomupdel_finish_pending(page, state, &delstate);
389 
390  /*
391  * We don't give up now in the event of having few (or even zero)
392  * promising tuples for the tableam because it's not up to us as the index
393  * AM to manage costs (note that the tableam might have heuristics of its
394  * own that work out what to do). We should at least avoid having our
395  * caller do a useless deduplication pass after we return in the event of
396  * zero promising tuples, though.
397  */
398  neverdedup = false;
399  if (state->nintervals == 0)
400  neverdedup = true;
401 
402  pfree(state->htids);
403  pfree(state);
404 
405  /* Ask tableam which TIDs are deletable, then physically delete them */
406  _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
407 
408  pfree(delstate.deltids);
409  pfree(delstate.status);
410 
411  /* Report "success" to caller unconditionally to avoid deduplication */
412  if (neverdedup)
413  return true;
414 
415  /* Don't dedup when we won't end up back here any time soon anyway */
416  return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
417 }
TM_IndexDelete * deltids
Definition: tableam.h:228
IndexTuple base
Definition: nbtree.h:871
void _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
Definition: nbtpage.c:1529
OffsetNumber baseoff
Definition: nbtree.h:872
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:369
void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, OffsetNumber baseoff)
Definition: nbtdedup.c:429
#define MaxTIDsPerBTreePage
Definition: nbtree.h:184
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:71
uint16 OffsetNumber
Definition: off.h:24
ItemPointer htids
Definition: nbtree.h:876
void pfree(void *pointer)
Definition: mcxt.c:1169
Size phystupsize
Definition: nbtree.h:879
static char * buf
Definition: pg_test_fsync.c:68
bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
Definition: nbtdedup.c:480
IndexTupleData * IndexTuple
Definition: itup.h:53
struct ItemIdData ItemIdData
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:496
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
TM_IndexStatus * status
Definition: tableam.h:229
static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state, TM_IndexDeleteOp *delstate)
Definition: nbtdedup.c:641
#define InvalidOffsetNumber
Definition: off.h:26
#define Max(x, y)
Definition: c.h:980
#define Assert(condition)
Definition: c.h:804
Definition: regguts.h:317
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
Size basetupsize
Definition: nbtree.h:873
Size maxpostingsize
Definition: nbtree.h:868
Size PageGetExactFreeSpace(Page page)
Definition: bufpage.c:951
int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
Definition: nbtutils.c:2419
void * palloc(Size size)
Definition: mcxt.c:1062
bool deduplicate
Definition: nbtree.h:866
int bottomupfreespace
Definition: tableam.h:224
BTDedupStateData * BTDedupState
Definition: nbtree.h:891
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78

◆ _bt_check_natts()

bool _bt_check_natts ( Relation  rel,
bool  heapkeyspace,
Page  page,
OffsetNumber  offnum 
)

Definition at line 2466 of file nbtutils.c.

References Assert, BT_OFFSET_MASK, BT_PIVOT_HEAP_TID_ATTR, BTreeTupleGetHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPivot(), BTreeTupleIsPosting(), FirstOffsetNumber, INDEX_MAX_KEYS, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, ItemPointerGetOffsetNumber, ItemPointerGetOffsetNumberNoCheck, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_ISLEAF, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, StaticAssertStmt, and IndexTupleData::t_tid.

Referenced by _bt_compare(), and bt_target_page_check().

2467 {
2471  IndexTuple itup;
2472  int tupnatts;
2473 
2474  /*
2475  * We cannot reliably test a deleted or half-dead page, since they have
2476  * dummy high keys
2477  */
2478  if (P_IGNORE(opaque))
2479  return true;
2480 
2481  Assert(offnum >= FirstOffsetNumber &&
2482  offnum <= PageGetMaxOffsetNumber(page));
2483 
2484  /*
2485  * Mask allocated for number of keys in index tuple must be able to fit
2486  * maximum possible number of index attributes
2487  */
2489  "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS");
2490 
2491  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2492  tupnatts = BTreeTupleGetNAtts(itup, rel);
2493 
2494  /* !heapkeyspace indexes do not support deduplication */
2495  if (!heapkeyspace && BTreeTupleIsPosting(itup))
2496  return false;
2497 
2498  /* Posting list tuples should never have "pivot heap TID" bit set */
2499  if (BTreeTupleIsPosting(itup) &&
2501  BT_PIVOT_HEAP_TID_ATTR) != 0)
2502  return false;
2503 
2504  /* INCLUDE indexes do not support deduplication */
2505  if (natts != nkeyatts && BTreeTupleIsPosting(itup))
2506  return false;
2507 
2508  if (P_ISLEAF(opaque))
2509  {
2510  if (offnum >= P_FIRSTDATAKEY(opaque))
2511  {
2512  /*
2513  * Non-pivot tuple should never be explicitly marked as a pivot
2514  * tuple
2515  */
2516  if (BTreeTupleIsPivot(itup))
2517  return false;
2518 
2519  /*
2520  * Leaf tuples that are not the page high key (non-pivot tuples)
2521  * should never be truncated. (Note that tupnatts must have been
2522  * inferred, even with a posting list tuple, because only pivot
2523  * tuples store tupnatts directly.)
2524  */
2525  return tupnatts == natts;
2526  }
2527  else
2528  {
2529  /*
2530  * Rightmost page doesn't contain a page high key, so tuple was
2531  * checked above as ordinary leaf tuple
2532  */
2533  Assert(!P_RIGHTMOST(opaque));
2534 
2535  /*
2536  * !heapkeyspace high key tuple contains only key attributes. Note
2537  * that tupnatts will only have been explicitly represented in
2538  * !heapkeyspace indexes that happen to have non-key attributes.
2539  */
2540  if (!heapkeyspace)
2541  return tupnatts == nkeyatts;
2542 
2543  /* Use generic heapkeyspace pivot tuple handling */
2544  }
2545  }
2546  else /* !P_ISLEAF(opaque) */
2547  {
2548  if (offnum == P_FIRSTDATAKEY(opaque))
2549  {
2550  /*
2551  * The first tuple on any internal page (possibly the first after
2552  * its high key) is its negative infinity tuple. Negative
2553  * infinity tuples are always truncated to zero attributes. They
2554  * are a particular kind of pivot tuple.
2555  */
2556  if (heapkeyspace)
2557  return tupnatts == 0;
2558 
2559  /*
2560  * The number of attributes won't be explicitly represented if the
2561  * negative infinity tuple was generated during a page split that
2562  * occurred with a version of Postgres before v11. There must be
2563  * a problem when there is an explicit representation that is
2564  * non-zero, or when there is no explicit representation and the
2565  * tuple is evidently not a pre-pg_upgrade tuple.
2566  *
2567  * Prior to v11, downlinks always had P_HIKEY as their offset.
2568  * Accept that as an alternative indication of a valid
2569  * !heapkeyspace negative infinity tuple.
2570  */
2571  return tupnatts == 0 ||
2573  }
2574  else
2575  {
2576  /*
2577  * !heapkeyspace downlink tuple with separator key contains only
2578  * key attributes. Note that tupnatts will only have been
2579  * explicitly represented in !heapkeyspace indexes that happen to
2580  * have non-key attributes.
2581  */
2582  if (!heapkeyspace)
2583  return tupnatts == nkeyatts;
2584 
2585  /* Use generic heapkeyspace pivot tuple handling */
2586  }
2587 
2588  }
2589 
2590  /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
2591  Assert(heapkeyspace);
2592 
2593  /*
2594  * Explicit representation of the number of attributes is mandatory with
2595  * heapkeyspace index pivot tuples, regardless of whether or not there are
2596  * non-key attributes.
2597  */
2598  if (!BTreeTupleIsPivot(itup))
2599  return false;
2600 
2601  /* Pivot tuple should not use posting list representation (redundant) */
2602  if (BTreeTupleIsPosting(itup))
2603  return false;
2604 
2605  /*
2606  * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
2607  * when any other key attribute is truncated
2608  */
2609  if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
2610  return false;
2611 
2612  /*
2613  * Pivot tuple must have at least one untruncated key attribute (minus
2614  * infinity pivot tuples are the only exception). Pivot tuples can never
2615  * represent that there is a value present for a key attribute that
2616  * exceeds pg_index.indnkeyatts for the index.
2617  */
2618  return tupnatts > 0 && tupnatts <= nkeyatts;
2619 }
signed short int16
Definition: c.h:428
#define ItemPointerGetOffsetNumberNoCheck(pointer)
Definition: itemptr.h:108
#define P_IGNORE(opaque)
Definition: nbtree.h:224
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:473
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:631
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:369
ItemPointerData t_tid
Definition: itup.h:37
#define BT_OFFSET_MASK
Definition: nbtree.h:462
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:570
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:71
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:918
#define FirstOffsetNumber
Definition: off.h:27
IndexTupleData * IndexTuple
Definition: itup.h:53
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:489
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:496
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:485
#define Assert(condition)
Definition: c.h:804
#define INDEX_MAX_KEYS
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define ItemPointerGetOffsetNumber(pointer)
Definition: itemptr.h:117
#define P_HIKEY
Definition: nbtree.h:367
#define BT_PIVOT_HEAP_TID_ATTR
Definition: nbtree.h:465
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:218
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
#define P_ISLEAF(opaque)
Definition: nbtree.h:219

◆ _bt_check_third_page()

void _bt_check_third_page ( Relation  rel,
Relation  heap,
bool  needheaptidspace,
Page  page,
IndexTuple  newtup 
)

Definition at line 2634 of file nbtutils.c.

References BTMaxItemSize, BTMaxItemSizeNoHeapTid, BTREE_NOVAC_VERSION, BTREE_VERSION, BTreeTupleGetHeapTID(), elog, ereport, errcode(), errdetail(), errhint(), errmsg(), ERROR, errtableconstraint(), IndexTupleSize, ItemPointerGetBlockNumber, ItemPointerGetOffsetNumber, MAXALIGN, P_ISLEAF, PageGetSpecialPointer, and RelationGetRelationName.

Referenced by _bt_buildadd(), and _bt_findinsertloc().

2636 {
2637  Size itemsz;
2638  BTPageOpaque opaque;
2639 
2640  itemsz = MAXALIGN(IndexTupleSize(newtup));
2641 
2642  /* Double check item size against limit */
2643  if (itemsz <= BTMaxItemSize(page))
2644  return;
2645 
2646  /*
2647  * Tuple is probably too large to fit on page, but it's possible that the
2648  * index uses version 2 or version 3, or that page is an internal page, in
2649  * which case a slightly higher limit applies.
2650  */
2651  if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page))
2652  return;
2653 
2654  /*
2655  * Internal page insertions cannot fail here, because that would mean that
2656  * an earlier leaf level insertion that should have failed didn't
2657  */
2658  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
2659  if (!P_ISLEAF(opaque))
2660  elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
2661  itemsz, RelationGetRelationName(rel));
2662 
2663  ereport(ERROR,
2664  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2665  errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
2666  itemsz,
2667  needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
2668  needheaptidspace ? BTMaxItemSize(page) :
2669  BTMaxItemSizeNoHeapTid(page),
2671  errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
2674  RelationGetRelationName(heap)),
2675  errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
2676  "Consider a function index of an MD5 hash of the value, "
2677  "or use full text indexing."),
2679 }
int errhint(const char *fmt,...)
Definition: elog.c:1156
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:631
#define BTREE_VERSION
Definition: nbtree.h:148
#define BTMaxItemSizeNoHeapTid(page)
Definition: nbtree.h:168
int errcode(int sqlerrcode)
Definition: elog.c:698
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:71
int errtableconstraint(Relation rel, const char *conname)
Definition: relcache.c:5793
#define ERROR
Definition: elog.h:46
int errdetail(const char *fmt,...)
Definition: elog.c:1042
#define RelationGetRelationName(relation)
Definition: rel.h:511
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:150
#define ereport(elevel,...)
Definition: elog.h:157
size_t Size
Definition: c.h:540
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define MAXALIGN(LEN)
Definition: c.h:757
#define ItemPointerGetOffsetNumber(pointer)
Definition: itemptr.h:117
#define BTMaxItemSize(page)
Definition: nbtree.h:162
int errmsg(const char *fmt,...)
Definition: elog.c:909
#define elog(elevel,...)
Definition: elog.h:232
#define ItemPointerGetBlockNumber(pointer)
Definition: itemptr.h:98
#define IndexTupleSize(itup)
Definition: itup.h:71
#define P_ISLEAF(opaque)
Definition: nbtree.h:219

◆ _bt_checkkeys()

bool _bt_checkkeys ( IndexScanDesc  scan,
IndexTuple  tuple,
int  tupnatts,
ScanDirection  dir,
bool continuescan 
)

Definition at line 1355 of file nbtutils.c.

References _bt_check_rowcompare(), Assert, BTreeTupleGetNAtts, BTreeTupleIsPivot(), DatumGetBool, FunctionCall2Coll(), index_getattr, IndexScanDescData::indexRelation, sort-test::key, BTScanOpaqueData::keyData, BTScanOpaqueData::numberOfKeys, IndexScanDescData::opaque, RelationGetDescr, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_NULLS_FIRST, SK_BT_REQBKWD, SK_BT_REQFWD, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, SK_ISNULL, SK_ROW_HEADER, SK_SEARCHNOTNULL, SK_SEARCHNULL, and test().

Referenced by _bt_readpage().

1357 {
1358  TupleDesc tupdesc;
1359  BTScanOpaque so;
1360  int keysz;
1361  int ikey;
1362  ScanKey key;
1363 
1364  Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
1365 
1366  *continuescan = true; /* default assumption */
1367 
1368  tupdesc = RelationGetDescr(scan->indexRelation);
1369  so = (BTScanOpaque) scan->opaque;
1370  keysz = so->numberOfKeys;
1371 
1372  for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
1373  {
1374  Datum datum;
1375  bool isNull;
1376  Datum test;
1377 
1378  if (key->sk_attno > tupnatts)
1379  {
1380  /*
1381  * This attribute is truncated (must be high key). The value for
1382  * this attribute in the first non-pivot tuple on the page to the
1383  * right could be any possible value. Assume that truncated
1384  * attribute passes the qual.
1385  */
1387  Assert(BTreeTupleIsPivot(tuple));
1388  continue;
1389  }
1390 
1391  /* row-comparison keys need special processing */
1392  if (key->sk_flags & SK_ROW_HEADER)
1393  {
1394  if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir,
1395  continuescan))
1396  continue;
1397  return false;
1398  }
1399 
1400  datum = index_getattr(tuple,
1401  key->sk_attno,
1402  tupdesc,
1403  &isNull);
1404 
1405  if (key->sk_flags & SK_ISNULL)
1406  {
1407  /* Handle IS NULL/NOT NULL tests */
1408  if (key->sk_flags & SK_SEARCHNULL)
1409  {
1410  if (isNull)
1411  continue; /* tuple satisfies this qual */
1412  }
1413  else
1414  {
1416  if (!isNull)
1417  continue; /* tuple satisfies this qual */
1418  }
1419 
1420  /*
1421  * Tuple fails this qual. If it's a required qual for the current
1422  * scan direction, then we can conclude no further tuples will
1423  * pass, either.
1424  */
1425  if ((key->sk_flags & SK_BT_REQFWD) &&
1427  *continuescan = false;
1428  else if ((key->sk_flags & SK_BT_REQBKWD) &&
1430  *continuescan = false;
1431 
1432  /*
1433  * In any case, this indextuple doesn't match the qual.
1434  */
1435  return false;
1436  }
1437 
1438  if (isNull)
1439  {
1440  if (key->sk_flags & SK_BT_NULLS_FIRST)
1441  {
1442  /*
1443  * Since NULLs are sorted before non-NULLs, we know we have
1444  * reached the lower limit of the range of values for this
1445  * index attr. On a backward scan, we can stop if this qual
1446  * is one of the "must match" subset. We can stop regardless
1447  * of whether the qual is > or <, so long as it's required,
1448  * because it's not possible for any future tuples to pass. On
1449  * a forward scan, however, we must keep going, because we may
1450  * have initially positioned to the start of the index.
1451  */
1452  if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
1454  *continuescan = false;
1455  }
1456  else
1457  {
1458  /*
1459  * Since NULLs are sorted after non-NULLs, we know we have
1460  * reached the upper limit of the range of values for this
1461  * index attr. On a forward scan, we can stop if this qual is
1462  * one of the "must match" subset. We can stop regardless of
1463  * whether the qual is > or <, so long as it's required,
1464  * because it's not possible for any future tuples to pass. On
1465  * a backward scan, however, we must keep going, because we
1466  * may have initially positioned to the end of the index.
1467  */
1468  if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
1470  *continuescan = false;
1471  }
1472 
1473  /*
1474  * In any case, this indextuple doesn't match the qual.
1475  */
1476  return false;
1477  }
1478 
1479  test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
1480  datum, key->sk_argument);
1481 
1482  if (!DatumGetBool(test))
1483  {
1484  /*
1485  * Tuple fails this qual. If it's a required qual for the current
1486  * scan direction, then we can conclude no further tuples will
1487  * pass, either.
1488  *
1489  * Note: because we stop the scan as soon as any required equality
1490  * qual fails, it is critical that equality quals be used for the
1491  * initial positioning in _bt_first() when they are available. See
1492  * comments in _bt_first().
1493  */
1494  if ((key->sk_flags & SK_BT_REQFWD) &&
1496  *continuescan = false;
1497  else if ((key->sk_flags & SK_BT_REQBKWD) &&
1499  *continuescan = false;
1500 
1501  /*
1502  * In any case, this indextuple doesn't match the qual.
1503  */
1504  return false;
1505  }
1506  }
1507 
1508  /* If we get here, the tuple passes all index quals. */
1509  return true;
1510 }
static void test(void)
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:473
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
#define RelationGetDescr(relation)
Definition: rel.h:503
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1148
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:570
Relation indexRelation
Definition: relscan.h:118
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:1072
FmgrInfo sk_func
Definition: skey.h:71
#define DatumGetBool(X)
Definition: postgres.h:437
#define SK_SEARCHNOTNULL
Definition: skey.h:122
#define SK_ISNULL
Definition: skey.h:115
#define SK_ROW_HEADER
Definition: skey.h:117
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:1083
uintptr_t Datum
Definition: postgres.h:411
static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, ScanDirection dir, bool *continuescan)
Definition: nbtutils.c:1522
#define SK_BT_REQFWD
Definition: nbtree.h:1079
int sk_flags
Definition: skey.h:66
#define Assert(condition)
Definition: c.h:804
#define index_getattr(tup, attnum, tupleDesc, isnull)
Definition: itup.h:100
ScanKey keyData
Definition: nbtree.h:1035
Oid sk_collation
Definition: skey.h:70
#define SK_BT_REQBKWD
Definition: nbtree.h:1080
Datum sk_argument
Definition: skey.h:72
#define SK_SEARCHNULL
Definition: skey.h:121
AttrNumber sk_attno
Definition: skey.h:67

◆ _bt_checkpage()

void _bt_checkpage ( Relation  rel,
Buffer  buf 
)

Definition at line 794 of file nbtpage.c.

References BufferGetBlockNumber(), BufferGetPage, ereport, errcode(), errhint(), errmsg(), ERROR, MAXALIGN, PageGetSpecialSize, PageIsNew, and RelationGetRelationName.

Referenced by _bt_getbuf(), _bt_relandgetbuf(), _bt_search_insert(), bt_recheck_sibling_links(), btvacuumpage(), and palloc_btree_page().

795 {
796  Page page = BufferGetPage(buf);
797 
798  /*
799  * ReadBuffer verifies that every newly-read page passes
800  * PageHeaderIsValid, which means it either contains a reasonably sane
801  * page header or is all-zero. We have to defend against the all-zero
802  * case, however.
803  */
804  if (PageIsNew(page))
805  ereport(ERROR,
806  (errcode(ERRCODE_INDEX_CORRUPTED),
807  errmsg("index \"%s\" contains unexpected zero page at block %u",
810  errhint("Please REINDEX it.")));
811 
812  /*
813  * Additionally check that the special area looks sane.
814  */
815  if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
816  ereport(ERROR,
817  (errcode(ERRCODE_INDEX_CORRUPTED),
818  errmsg("index \"%s\" contains corrupted page at block %u",
821  errhint("Please REINDEX it.")));
822 }
int errhint(const char *fmt,...)
Definition: elog.c:1156
int errcode(int sqlerrcode)
Definition: elog.c:698
#define ERROR
Definition: elog.h:46
static char * buf
Definition: pg_test_fsync.c:68
#define RelationGetRelationName(relation)
Definition: rel.h:511
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define ereport(elevel,...)
Definition: elog.h:157
#define MAXALIGN(LEN)
Definition: c.h:757
#define PageGetSpecialSize(page)
Definition: bufpage.h:300
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2748
#define PageIsNew(page)
Definition: bufpage.h:229
int errmsg(const char *fmt,...)
Definition: elog.c:909
Pointer Page
Definition: bufpage.h:78

◆ _bt_compare()

int32 _bt_compare ( Relation  rel,
BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)

Definition at line 644 of file nbtsearch.c.

References _bt_check_natts(), BTScanInsertData::allequalimage, Assert, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPosting(), DatumGetInt32, FunctionCall2Coll(), BTScanInsertData::heapkeyspace, i, index_getattr, IndexRelationGetNumberOfKeyAttributes, INVERT_COMPARE_RESULT, ItemPointerCompare(), BTScanInsertData::keysz, Min, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, BTScanInsertData::pivotsearch, RelationGetDescr, BTScanInsertData::scankeys, BTScanInsertData::scantid, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_binsrch_insert(), _bt_check_unique(), _bt_findinsertloc(), _bt_moveright(), _bt_search_insert(), bt_rootdescend(), invariant_g_offset(), invariant_l_nontarget_offset(), invariant_l_offset(), and invariant_leq_offset().

648 {
649  TupleDesc itupdesc = RelationGetDescr(rel);
650  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
651  IndexTuple itup;
652  ItemPointer heapTid;
653  ScanKey scankey;
654  int ncmpkey;
655  int ntupatts;
656  int32 result;
657 
658  Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
659  Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
660  Assert(key->heapkeyspace || key->scantid == NULL);
661 
662  /*
663  * Force result ">" if target item is first data item on an internal page
664  * --- see NOTE above.
665  */
666  if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
667  return 1;
668 
669  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
670  ntupatts = BTreeTupleGetNAtts(itup, rel);
671 
672  /*
673  * The scan key is set up with the attribute number associated with each
674  * term in the key. It is important that, if the index is multi-key, the
675  * scan contain the first k key attributes, and that they be in order. If
676  * you think about how multi-key ordering works, you'll understand why
677  * this is.
678  *
679  * We don't test for violation of this condition here, however. The
680  * initial setup for the index scan had better have gotten it right (see
681  * _bt_first).
682  */
683 
684  ncmpkey = Min(ntupatts, key->keysz);
685  Assert(key->heapkeyspace || ncmpkey == key->keysz);
686  Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
687  scankey = key->scankeys;
688  for (int i = 1; i <= ncmpkey; i++)
689  {
690  Datum datum;
691  bool isNull;
692 
693  datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
694 
695  if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
696  {
697  if (isNull)
698  result = 0; /* NULL "=" NULL */
699  else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
700  result = -1; /* NULL "<" NOT_NULL */
701  else
702  result = 1; /* NULL ">" NOT_NULL */
703  }
704  else if (isNull) /* key is NOT_NULL and item is NULL */
705  {
706  if (scankey->sk_flags & SK_BT_NULLS_FIRST)
707  result = 1; /* NOT_NULL ">" NULL */
708  else
709  result = -1; /* NOT_NULL "<" NULL */
710  }
711  else
712  {
713  /*
714  * The sk_func needs to be passed the index value as left arg and
715  * the sk_argument as right arg (they might be of different
716  * types). Since it is convenient for callers to think of
717  * _bt_compare as comparing the scankey to the index item, we have
718  * to flip the sign of the comparison result. (Unless it's a DESC
719  * column, in which case we *don't* flip the sign.)
720  */
721  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
722  scankey->sk_collation,
723  datum,
724  scankey->sk_argument));
725 
726  if (!(scankey->sk_flags & SK_BT_DESC))
727  INVERT_COMPARE_RESULT(result);
728  }
729 
730  /* if the keys are unequal, return the difference */
731  if (result != 0)
732  return result;
733 
734  scankey++;
735  }
736 
737  /*
738  * All non-truncated attributes (other than heap TID) were found to be
739  * equal. Treat truncated attributes as minus infinity when scankey has a
740  * key attribute value that would otherwise be compared directly.
741  *
742  * Note: it doesn't matter if ntupatts includes non-key attributes;
743  * scankey won't, so explicitly excluding non-key attributes isn't
744  * necessary.
745  */
746  if (key->keysz > ntupatts)
747  return 1;
748 
749  /*
750  * Use the heap TID attribute and scantid to try to break the tie. The
751  * rules are the same as any other key attribute -- only the
752  * representation differs.
753  */
754  heapTid = BTreeTupleGetHeapTID(itup);
755  if (key->scantid == NULL)
756  {
757  /*
758  * Most searches have a scankey that is considered greater than a
759  * truncated pivot tuple if and when the scankey has equal values for
760  * attributes up to and including the least significant untruncated
761  * attribute in tuple.
762  *
763  * For example, if an index has the minimum two attributes (single
764  * user key attribute, plus heap TID attribute), and a page's high key
765  * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
766  * will not descend to the page to the left. The search will descend
767  * right instead. The truncated attribute in pivot tuple means that
768  * all non-pivot tuples on the page to the left are strictly < 'foo',
769  * so it isn't necessary to descend left. In other words, search
770  * doesn't have to descend left because it isn't interested in a match
771  * that has a heap TID value of -inf.
772  *
773  * However, some searches (pivotsearch searches) actually require that
774  * we descend left when this happens. -inf is treated as a possible
775  * match for omitted scankey attribute(s). This is needed by page
776  * deletion, which must re-find leaf pages that are targets for
777  * deletion using their high keys.
778  *
779  * Note: the heap TID part of the test ensures that scankey is being
780  * compared to a pivot tuple with one or more truncated key
781  * attributes.
782  *
783  * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
784  * left here, since they have no heap TID attribute (and cannot have
785  * any -inf key values in any case, since truncation can only remove
786  * non-key attributes). !heapkeyspace searches must always be
787  * prepared to deal with matches on both sides of the pivot once the
788  * leaf level is reached.
789  */
790  if (key->heapkeyspace && !key->pivotsearch &&
791  key->keysz == ntupatts && heapTid == NULL)
792  return 1;
793 
794  /* All provided scankey arguments found to be equal */
795  return 0;
796  }
797 
798  /*
799  * Treat truncated heap TID as minus infinity, since scankey has a key
800  * attribute value (scantid) that would otherwise be compared directly
801  */
802  Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
803  if (heapTid == NULL)
804  return 1;
805 
806  /*
807  * Scankey must be treated as equal to a posting list tuple if its scantid
808  * value falls within the range of the posting list. In all other cases
809  * there can only be a single heap TID value, which is compared directly
810  * with scantid.
811  */
812  Assert(!BTreeTupleIsPivot(itup) || BTreeTupleIsPosting(itup));
813  result = ItemPointerCompare(key->scantid, heapTid);
814  if (result <= 0 || !BTreeTupleIsPosting(itup))
815  return result;
816  else
817  {
818  result = ItemPointerCompare(key->scantid,
819  BTreeTupleGetMaxHeapTID(itup));
820  if (result > 0)
821  return 1;
822  }
823 
824  return 0;
825 }
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:52
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:631
#define DatumGetInt32(X)
Definition: postgres.h:516
#define RelationGetDescr(relation)
Definition: rel.h:503
ItemPointer scantid
Definition: nbtree.h:789
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:369
#define Min(x, y)
Definition: c.h:986
bool allequalimage
Definition: nbtree.h:785
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1148
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:570
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:71
signed int int32
Definition: c.h:429
IndexTupleData * IndexTuple
Definition: itup.h:53
FmgrInfo sk_func
Definition: skey.h:71
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:496
#define SK_ISNULL
Definition: skey.h:115
bool pivotsearch
Definition: nbtree.h:788
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:1083
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:657
uintptr_t Datum
Definition: postgres.h:411
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:485
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:2466
int sk_flags
Definition: skey.h:66
#define Assert(condition)
Definition: c.h:804
#define SK_BT_DESC
Definition: nbtree.h:1082
bool heapkeyspace
Definition: nbtree.h:784
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define index_getattr(tup, attnum, tupleDesc, isnull)
Definition: itup.h:100
ScanKeyData scankeys[INDEX_MAX_KEYS]
Definition: nbtree.h:791
Oid sk_collation
Definition: skey.h:70
int i
Datum sk_argument
Definition: skey.h:72
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1125
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
AttrNumber sk_attno
Definition: skey.h:67
#define P_ISLEAF(opaque)
Definition: nbtree.h:219

◆ _bt_conditionallockbuf()

bool _bt_conditionallockbuf ( Relation  rel,
Buffer  buf 
)

Definition at line 1105 of file nbtpage.c.

References BufferGetPage, ConditionalLockBuffer(), RelationUsesLocalBuffers, and VALGRIND_MAKE_MEM_DEFINED.

Referenced by _bt_getbuf(), and _bt_search_insert().

1106 {
1107  /* ConditionalLockBuffer() asserts that pin is held by this backend */
1108  if (!ConditionalLockBuffer(buf))
1109  return false;
1110 
1111  if (!RelationUsesLocalBuffers(rel))
1112  VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1113 
1114  return true;
1115 }
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26
static char * buf
Definition: pg_test_fsync.c:68
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:4033
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:610

◆ _bt_dedup_finish_pending()

Size _bt_dedup_finish_pending ( Page  newpage,
BTDedupState  state 
)

Definition at line 551 of file nbtdedup.c.

References _bt_form_posting(), Assert, BTDedupStateData::base, BTDedupInterval::baseoff, BTDedupStateData::baseoff, elog, ERROR, BTDedupStateData::htids, IndexTupleSize, BTDedupStateData::intervals, InvalidOffsetNumber, MAXALIGN, BTDedupStateData::nhtids, BTDedupStateData::nintervals, BTDedupInterval::nitems, BTDedupStateData::nitems, OffsetNumberNext, PageAddItem, PageGetMaxOffsetNumber, pfree(), and BTDedupStateData::phystupsize.

Referenced by _bt_dedup_pass(), and btree_xlog_dedup().

552 {
553  OffsetNumber tupoff;
554  Size tuplesz;
555  Size spacesaving;
556 
557  Assert(state->nitems > 0);
558  Assert(state->nitems <= state->nhtids);
559  Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
560 
561  tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
562  if (state->nitems == 1)
563  {
564  /* Use original, unchanged base tuple */
565  tuplesz = IndexTupleSize(state->base);
566  if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
567  false, false) == InvalidOffsetNumber)
568  elog(ERROR, "deduplication failed to add tuple to page");
569 
570  spacesaving = 0;
571  }
572  else
573  {
574  IndexTuple final;
575 
576  /* Form a tuple with a posting list */
577  final = _bt_form_posting(state->base, state->htids, state->nhtids);
578  tuplesz = IndexTupleSize(final);
579  Assert(tuplesz <= state->maxpostingsize);
580 
581  /* Save final number of items for posting list */
582  state->intervals[state->nintervals].nitems = state->nitems;
583 
584  Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
585  if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
586  false) == InvalidOffsetNumber)
587  elog(ERROR, "deduplication failed to add tuple to page");
588 
589  pfree(final);
590  spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
591  /* Increment nintervals, since we wrote a new posting list tuple */
592  state->nintervals++;
593  Assert(spacesaving > 0 && spacesaving < BLCKSZ);
594  }
595 
596  /* Reset state for next pending posting list */
597  state->nhtids = 0;
598  state->nitems = 0;
599  state->phystupsize = 0;
600 
601  return spacesaving;
602 }
IndexTuple base
Definition: nbtree.h:871
IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
Definition: nbtdedup.c:857
OffsetNumber baseoff
Definition: nbtree.h:872
Pointer Item
Definition: item.h:17
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:416
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
uint16 OffsetNumber
Definition: off.h:24
ItemPointer htids
Definition: nbtree.h:876
void pfree(void *pointer)
Definition: mcxt.c:1169
Size phystupsize
Definition: nbtree.h:879
#define ERROR
Definition: elog.h:46
BTDedupInterval intervals[MaxIndexTuplesPerPage]
Definition: nbtree.h:888
struct ItemIdData ItemIdData
#define InvalidOffsetNumber
Definition: off.h:26
uint16 nitems
Definition: nbtree.h:842
#define Assert(condition)
Definition: c.h:804
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
size_t Size
Definition: c.h:540
#define MAXALIGN(LEN)
Definition: c.h:757
#define elog(elevel,...)
Definition: elog.h:232
OffsetNumber baseoff
Definition: nbtree.h:841
#define IndexTupleSize(itup)
Definition: itup.h:71

◆ _bt_dedup_pass()

void _bt_dedup_pass ( Relation  rel,
Buffer  buf,
Relation  heapRel,
IndexTuple  newitem,
Size  newitemsz,
bool  bottomupdedup 
)

Definition at line 57 of file nbtdedup.c.

References _bt_dedup_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_do_singleval(), _bt_keep_natts_fast(), _bt_singleval_fillfactor(), Assert, BTDedupStateData::base, BTDedupStateData::baseoff, BTDedupStateData::basetupsize, BTMaxItemSize, BTP_HAS_GARBAGE, BTPageOpaqueData::btpo_flags, BufferGetPage, BTDedupStateData::deduplicate, elog, END_CRIT_SECTION, ERROR, BTDedupStateData::htids, INDEX_SIZE_MASK, IndexRelationGetNumberOfKeyAttributes, BTDedupStateData::intervals, InvalidOffsetNumber, ItemIdGetLength, ItemIdIsDead, MarkBufferDirty(), BTDedupStateData::maxpostingsize, Min, BTDedupStateData::nhtids, xl_btree_dedup::nintervals, BTDedupStateData::nintervals, BTDedupStateData::nitems, BTDedupStateData::nmaxitems, OffsetNumberNext, P_FIRSTDATAKEY, P_HAS_GARBAGE, P_HIKEY, P_RIGHTMOST, PageAddItem, PageGetExactFreeSpace(), PageGetItem, PageGetItemId, PageGetLSN, PageGetMaxOffsetNumber, PageGetSpecialPointer, PageGetTempPageCopySpecial(), PageRestoreTempPage(), PageSetLSN, palloc(), pfree(), BTDedupStateData::phystupsize, REGBUF_STANDARD, RelationNeedsWAL, SizeOfBtreeDedup, START_CRIT_SECTION, XLOG_BTREE_DEDUP, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_delete_or_dedup_one_page().

59 {
60  OffsetNumber offnum,
61  minoff,
62  maxoff;
63  Page page = BufferGetPage(buf);
64  BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
65  Page newpage;
66  BTDedupState state;
67  Size pagesaving = 0;
68  bool singlevalstrat = false;
69  int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
70 
71  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
72  newitemsz += sizeof(ItemIdData);
73 
74  /*
75  * Initialize deduplication state.
76  *
77  * It would be possible for maxpostingsize (limit on posting list tuple
78  * size) to be set to one third of the page. However, it seems like a
79  * good idea to limit the size of posting lists to one sixth of a page.
80  * That ought to leave us with a good split point when pages full of
81  * duplicates can be split several times.
82  */
83  state = (BTDedupState) palloc(sizeof(BTDedupStateData));
84  state->deduplicate = true;
85  state->nmaxitems = 0;
86  state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
87  /* Metadata about base tuple of current pending posting list */
88  state->base = NULL;
89  state->baseoff = InvalidOffsetNumber;
90  state->basetupsize = 0;
91  /* Metadata about current pending posting list TIDs */
92  state->htids = palloc(state->maxpostingsize);
93  state->nhtids = 0;
94  state->nitems = 0;
95  /* Size of all physical tuples to be replaced by pending posting list */
96  state->phystupsize = 0;
97  /* nintervals should be initialized to zero */
98  state->nintervals = 0;
99 
100  minoff = P_FIRSTDATAKEY(opaque);
101  maxoff = PageGetMaxOffsetNumber(page);
102 
103  /*
104  * Consider applying "single value" strategy, though only if the page
105  * seems likely to be split in the near future
106  */
107  if (!bottomupdedup)
108  singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
109 
110  /*
111  * Deduplicate items from page, and write them to newpage.
112  *
113  * Copy the original page's LSN into newpage copy. This will become the
114  * updated version of the page. We need this because XLogInsert will
115  * examine the LSN and possibly dump it in a page image.
116  */
117  newpage = PageGetTempPageCopySpecial(page);
118  PageSetLSN(newpage, PageGetLSN(page));
119 
120  /* Copy high key, if any */
121  if (!P_RIGHTMOST(opaque))
122  {
123  ItemId hitemid = PageGetItemId(page, P_HIKEY);
124  Size hitemsz = ItemIdGetLength(hitemid);
125  IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid);
126 
127  if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
128  false, false) == InvalidOffsetNumber)
129  elog(ERROR, "deduplication failed to add highkey");
130  }
131 
132  for (offnum = minoff;
133  offnum <= maxoff;
134  offnum = OffsetNumberNext(offnum))
135  {
136  ItemId itemid = PageGetItemId(page, offnum);
137  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
138 
139  Assert(!ItemIdIsDead(itemid));
140 
141  if (offnum == minoff)
142  {
143  /*
144  * No previous/base tuple for the data item -- use the data item
145  * as base tuple of pending posting list
146  */
147  _bt_dedup_start_pending(state, itup, offnum);
148  }
149  else if (state->deduplicate &&
150  _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
151  _bt_dedup_save_htid(state, itup))
152  {
153  /*
154  * Tuple is equal to base tuple of pending posting list. Heap
155  * TID(s) for itup have been saved in state.
156  */
157  }
158  else
159  {
160  /*
161  * Tuple is not equal to pending posting list tuple, or
162  * _bt_dedup_save_htid() opted to not merge current item into
163  * pending posting list for some other reason (e.g., adding more
164  * TIDs would have caused posting list to exceed current
165  * maxpostingsize).
166  *
167  * If state contains pending posting list with more than one item,
168  * form new posting tuple, and actually update the page. Else
169  * reset the state and move on without modifying the page.
170  */
171  pagesaving += _bt_dedup_finish_pending(newpage, state);
172 
173  if (singlevalstrat)
174  {
175  /*
176  * Single value strategy's extra steps.
177  *
178  * Lower maxpostingsize for sixth and final large posting list
179  * tuple at the point where 5 maxpostingsize-capped tuples
180  * have either been formed or observed.
181  *
182  * When a sixth maxpostingsize-capped item is formed/observed,
183  * stop merging together tuples altogether. The few tuples
184  * that remain at the end of the page won't be merged together
185  * at all (at least not until after a future page split takes
186  * place).
187  */
188  if (state->nmaxitems == 5)
189  _bt_singleval_fillfactor(page, state, newitemsz);
190  else if (state->nmaxitems == 6)
191  {
192  state->deduplicate = false;
193  singlevalstrat = false; /* won't be back here */
194  }
195  }
196 
197  /* itup starts new pending posting list */
198  _bt_dedup_start_pending(state, itup, offnum);
199  }
200  }
201 
202  /* Handle the last item */
203  pagesaving += _bt_dedup_finish_pending(newpage, state);
204 
205  /*
206  * If no items suitable for deduplication were found, newpage must be
207  * exactly the same as the original page, so just return from function.
208  *
209  * We could determine whether or not to proceed on the basis the space
210  * savings being sufficient to avoid an immediate page split instead. We
211  * don't do that because there is some small value in nbtsplitloc.c always
212  * operating against a page that is fully deduplicated (apart from
213  * newitem). Besides, most of the cost has already been paid.
214  */
215  if (state->nintervals == 0)
216  {
217  /* cannot leak memory here */
218  pfree(newpage);
219  pfree(state->htids);
220  pfree(state);
221  return;
222  }
223 
224  /*
225  * By here, it's clear that deduplication will definitely go ahead.
226  *
227  * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace
228  * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
229  * But keep things tidy.
230  */
231  if (P_HAS_GARBAGE(opaque))
232  {
233  BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(newpage);
234 
235  nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
236  }
237 
238  START_CRIT_SECTION();
239 
240  PageRestoreTempPage(newpage, page);
241  MarkBufferDirty(buf);
242 
243  /* XLOG stuff */
244  if (RelationNeedsWAL(rel))
245  {
246  XLogRecPtr recptr;
247  xl_btree_dedup xlrec_dedup;
248 
249  xlrec_dedup.nintervals = state->nintervals;
250 
251  XLogBeginInsert();
252  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
253  XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);
254 
255  /*
256  * The intervals array is not in the buffer, but pretend that it is.
257  * When XLogInsert stores the whole buffer, the array need not be
258  * stored too.
259  */
260  XLogRegisterBufData(0, (char *) state->intervals,
261  state->nintervals * sizeof(BTDedupInterval));
262 
263  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
264 
265  PageSetLSN(page, recptr);
266  }
267 
268  END_CRIT_SECTION();
269 
270  /* Local space accounting should agree with page accounting */
271  Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
272 
273  /* cannot leak memory here */
274  pfree(state->htids);
275  pfree(state);
276 }
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:378
IndexTuple base
Definition: nbtree.h:871
void PageRestoreTempPage(Page tempPage, Page oldPage)
Definition: bufpage.c:424
uint16 nintervals
Definition: nbtxlog.h:172
OffsetNumber baseoff
Definition: nbtree.h:872
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1565
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:232
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:369
#define Min(x, y)
Definition: c.h:986
#define END_CRIT_SECTION()
Definition: miscadmin.h:149
Pointer Item
Definition: item.h:17
#define INDEX_SIZE_MASK
Definition: itup.h:65
#define P_HAS_GARBAGE(opaque)
Definition: nbtree.h:225
#define START_CRIT_SECTION()
Definition: miscadmin.h:147
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:416
void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, OffsetNumber baseoff)
Definition: nbtdedup.c:429
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
Size _bt_dedup_finish_pending(Page newpage, BTDedupState state)
Definition: nbtdedup.c:551
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:71
uint16 OffsetNumber
Definition: off.h:24
Page PageGetTempPageCopySpecial(Page page)
Definition: bufpage.c:402
ItemPointer htids
Definition: nbtree.h:876
void pfree(void *pointer)
Definition: mcxt.c:1169
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
Size phystupsize
Definition: nbtree.h:879
#define ERROR
Definition: elog.h:46
BTDedupInterval intervals[MaxIndexTuplesPerPage]
Definition: nbtree.h:888
static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state, OffsetNumber minoff, IndexTuple newitem)
Definition: nbtdedup.c:775
static char * buf
Definition: pg_test_fsync.c:68
bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
Definition: nbtdedup.c:480
IndexTupleData * IndexTuple
Definition: itup.h:53
#define REGBUF_STANDARD
Definition: xloginsert.h:35
struct ItemIdData ItemIdData
#define SizeOfBtreeDedup
Definition: nbtxlog.h:177
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:496
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define XLOG_BTREE_DEDUP
Definition: nbtxlog.h:33
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:340
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:434
#define InvalidOffsetNumber
Definition: off.h:26
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:804
Definition: regguts.h:317
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
size_t Size
Definition: c.h:540
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
Size basetupsize
Definition: nbtree.h:873
#define RelationNeedsWAL(relation)
Definition: rel.h:601
Size maxpostingsize
Definition: nbtree.h:868
#define PageGetLSN(page)
Definition: bufpage.h:366
Size PageGetExactFreeSpace(Page page)
Definition: bufpage.c:951
#define BTMaxItemSize(page)
Definition: nbtree.h:162
#define P_HIKEY
Definition: nbtree.h:367
int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
Definition: nbtutils.c:2419
void * palloc(Size size)
Definition: mcxt.c:1062
bool deduplicate
Definition: nbtree.h:866
#define elog(elevel,...)
Definition: elog.h:232
BTDedupStateData * BTDedupState
Definition: nbtree.h:891
void XLogBeginInsert(void)
Definition: xloginsert.c:135
uint16 btpo_flags
Definition: nbtree.h:67
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:218
#define BTP_HAS_GARBAGE
Definition: nbtree.h:80
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78
static void _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
Definition: nbtdedup.c:815

◆ _bt_dedup_save_htid()

bool _bt_dedup_save_htid ( BTDedupState  state,
IndexTuple  itup 
)

Definition at line 480 of file nbtdedup.c.

References Assert, BTDedupStateData::basetupsize, BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTDedupStateData::htids, IndexTupleSize, MAXALIGN, BTDedupStateData::maxpostingsize, BTDedupStateData::nhtids, BTDedupStateData::nitems, BTDedupStateData::nmaxitems, BTDedupStateData::phystupsize, and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_load(), and btree_xlog_dedup().

481 {
482  int nhtids;
483  ItemPointer htids;
484  Size mergedtupsz;
485 
486  Assert(!BTreeTupleIsPivot(itup));
487 
488  if (!BTreeTupleIsPosting(itup))
489  {
490  nhtids = 1;
491  htids = &itup->t_tid;
492  }
493  else
494  {
495  nhtids = BTreeTupleGetNPosting(itup);
496  htids = BTreeTupleGetPosting(itup);
497  }
498 
499  /*
500  * Don't append (have caller finish pending posting list as-is) if
501  * appending heap TID(s) from itup would put us over maxpostingsize limit.
502  *
503  * This calculation needs to match the code used within _bt_form_posting()
504  * for new posting list tuples.
505  */
506  mergedtupsz = MAXALIGN(state->basetupsize +
507  (state->nhtids + nhtids) * sizeof(ItemPointerData));
508 
509  if (mergedtupsz > state->maxpostingsize)
510  {
511  /*
512  * Count this as an oversized item for single value strategy, though
513  * only when there are 50 TIDs in the final posting list tuple. This
514  * limit (which is fairly arbitrary) avoids confusion about how many
515  * 1/6 of a page tuples have been encountered/created by the current
516  * deduplication pass.
517  *
518  * Note: We deliberately don't consider which deduplication pass
519  * merged together tuples to create this item (could be a previous
520  * deduplication pass, or current pass). See _bt_do_singleval()
521  * comments.
522  */
523  if (state->nhtids > 50)
524  state->nmaxitems++;
525 
526  return false;
527  }
528 
529  /*
530  * Save heap TIDs to pending posting list tuple -- itup can be merged into
531  * pending posting list
532  */
533  state->nitems++;
534  memcpy(state->htids + state->nhtids, htids,
535  sizeof(ItemPointerData) * nhtids);
536  state->nhtids += nhtids;
537  state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
538 
539  return true;
540 }
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:473
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition: nbtree.h:530
ItemPointerData t_tid
Definition: itup.h:37
ItemPointer htids
Definition: nbtree.h:876
Size phystupsize
Definition: nbtree.h:879
struct ItemIdData ItemIdData
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:485
#define Assert(condition)
Definition: c.h:804
size_t Size
Definition: c.h:540
#define MAXALIGN(LEN)
Definition: c.h:757
Size basetupsize
Definition: nbtree.h:873
Size maxpostingsize
Definition: nbtree.h:868
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:511
#define IndexTupleSize(itup)
Definition: itup.h:71

◆ _bt_dedup_start_pending()

void _bt_dedup_start_pending ( BTDedupState  state,
IndexTuple  base,
OffsetNumber  baseoff 
)

Definition at line 429 of file nbtdedup.c.

References Assert, BTDedupStateData::base, BTDedupInterval::baseoff, BTDedupStateData::baseoff, BTDedupStateData::basetupsize, BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTDedupStateData::htids, IndexTupleSize, BTDedupStateData::intervals, MAXALIGN, BTDedupStateData::nhtids, BTDedupStateData::nintervals, BTDedupStateData::nitems, BTDedupStateData::phystupsize, and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_load(), and btree_xlog_dedup().

431 {
432  Assert(state->nhtids == 0);
433  Assert(state->nitems == 0);
434  Assert(!BTreeTupleIsPivot(base));
435 
436  /*
437  * Copy heap TID(s) from new base tuple for new candidate posting list
438  * into working state's array
439  */
440  if (!BTreeTupleIsPosting(base))
441  {
442  memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
443  state->nhtids = 1;
444  state->basetupsize = IndexTupleSize(base);
445  }
446  else
447  {
448  int nposting;
449 
450  nposting = BTreeTupleGetNPosting(base);
451  memcpy(state->htids, BTreeTupleGetPosting(base),
452  sizeof(ItemPointerData) * nposting);
453  state->nhtids = nposting;
454  /* basetupsize should not include existing posting list */
455  state->basetupsize = BTreeTupleGetPostingOffset(base);
456  }
457 
458  /*
459  * Save new base tuple itself -- it'll be needed if we actually create a
460  * new posting list from new pending posting list.
461  *
462  * Must maintain physical size of all existing tuples (including line
463  * pointer overhead) so that we can calculate space savings on page.
464  */
465  state->nitems = 1;
466  state->base = base;
467  state->baseoff = baseoff;
468  state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
469  /* Also save baseoff in pending state for interval */
470  state->intervals[state->nintervals].baseoff = state->baseoff;
471 }
IndexTuple base
Definition: nbtree.h:871
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:473
OffsetNumber baseoff
Definition: nbtree.h:872
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition: nbtree.h:530
ItemPointerData t_tid
Definition: itup.h:37
ItemPointer htids
Definition: nbtree.h:876
Size phystupsize
Definition: nbtree.h:879
BTDedupInterval intervals[MaxIndexTuplesPerPage]
Definition: nbtree.h:888
struct ItemIdData ItemIdData
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:485
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:522
#define Assert(condition)
Definition: c.h:804
#define MAXALIGN(LEN)
Definition: c.h:757
Size basetupsize
Definition: nbtree.h:873
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:511
OffsetNumber baseoff
Definition: nbtree.h:841
#define IndexTupleSize(itup)
Definition: itup.h:71

◆ _bt_delitems_delete_check()

void _bt_delitems_delete_check ( Relation  rel,
Buffer  buf,
Relation  heapRel,
TM_IndexDeleteOp delstate 
)

Definition at line 1529 of file nbtpage.c.

References _bt_delitems_cmp(), _bt_delitems_delete(), Assert, TM_IndexDeleteOp::bottomup, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), BufferGetPage, BTVacuumPostingData::deletetids, TM_IndexDeleteOp::deltids, i, TM_IndexDelete::id, TM_IndexStatus::idxoffnum, InvalidOffsetNumber, InvalidTransactionId, ItemPointerCompare(), ItemPointerEquals(), BTVacuumPostingData::itup, TM_IndexStatus::knowndeletable, MaxIndexTuplesPerPage, BTVacuumPostingData::ndeletedtids, TM_IndexDeleteOp::ndeltids, OffsetNumberIsValid, offsetof, PageGetItem, PageGetItemId, palloc(), pfree(), qsort, RelationNeedsWAL, TM_IndexDeleteOp::status, IndexTupleData::t_tid, table_index_delete_tuples(), TM_IndexDelete::tid, BTVacuumPostingData::updatedoffset, and XLogStandbyInfoActive.

Referenced by _bt_bottomupdel_pass(), and _bt_simpledel_pass().

1531 {
1532  Page page = BufferGetPage(buf);
1533  TransactionId latestRemovedXid;
1534  OffsetNumber postingidxoffnum = InvalidOffsetNumber;
1535  int ndeletable = 0,
1536  nupdatable = 0;
1539 
1540  /* Use tableam interface to determine which tuples to delete first */
1541  latestRemovedXid = table_index_delete_tuples(heapRel, delstate);
1542 
1543  /* Should not WAL-log latestRemovedXid unless it's required */
1544  if (!XLogStandbyInfoActive() || !RelationNeedsWAL(rel))
1545  latestRemovedXid = InvalidTransactionId;
1546 
1547  /*
1548  * Construct a leaf-page-wise description of what _bt_delitems_delete()
1549  * needs to do to physically delete index tuples from the page.
1550  *
1551  * Must sort deltids array to restore leaf-page-wise order (original order
1552  * before call to tableam). This is the order that the loop expects.
1553  *
1554  * Note that deltids array might be a lot smaller now. It might even have
1555  * no entries at all (with bottom-up deletion caller), in which case there
1556  * is nothing left to do.
1557  */
1558  qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
1560  if (delstate->ndeltids == 0)
1561  {
1562  Assert(delstate->bottomup);
1563  return;
1564  }
1565 
1566  /* We definitely have to delete at least one index tuple (or one TID) */
1567  for (int i = 0; i < delstate->ndeltids; i++)
1568  {
1569  TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
1570  OffsetNumber idxoffnum = dstatus->idxoffnum;
1571  ItemId itemid = PageGetItemId(page, idxoffnum);
1572  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
1573  int nestedi,
1574  nitem;
1575  BTVacuumPosting vacposting;
1576 
1577  Assert(OffsetNumberIsValid(idxoffnum));
1578 
1579  if (idxoffnum == postingidxoffnum)
1580  {
1581  /*
1582  * This deltid entry is a TID from a posting list tuple that has
1583  * already been completely processed
1584  */
1585  Assert(BTreeTupleIsPosting(itup));
1587  &delstate->deltids[i].tid) < 0);
1589  &delstate->deltids[i].tid) >= 0);
1590  continue;
1591  }
1592 
1593  if (!BTreeTupleIsPosting(itup))
1594  {
1595  /* Plain non-pivot tuple */
1596  Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
1597  if (dstatus->knowndeletable)
1598  deletable[ndeletable++] = idxoffnum;
1599  continue;
1600  }
1601 
1602  /*
1603  * itup is a posting list tuple whose lowest deltids entry (which may
1604  * or may not be for the first TID from itup) is considered here now.
1605  * We should process all of the deltids entries for the posting list
1606  * together now, though (not just the lowest). Remember to skip over
1607  * later itup-related entries during later iterations of outermost
1608  * loop.
1609  */
1610  postingidxoffnum = idxoffnum; /* Remember work in outermost loop */
1611  nestedi = i; /* Initialize for first itup deltids entry */
1612  vacposting = NULL; /* Describes final action for itup */
1613  nitem = BTreeTupleGetNPosting(itup);
1614  for (int p = 0; p < nitem; p++)
1615  {
1616  ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
1617  int ptidcmp = -1;
1618 
1619  /*
1620  * This nested loop reuses work across ptid TIDs taken from itup.
1621  * We take advantage of the fact that both itup's TIDs and deltids
1622  * entries (within a single itup/posting list grouping) must both
1623  * be in ascending TID order.
1624  */
1625  for (; nestedi < delstate->ndeltids; nestedi++)
1626  {
1627  TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
1628  TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
1629 
1630  /* Stop once we get past all itup related deltids entries */
1631  Assert(tdstatus->idxoffnum >= idxoffnum);
1632  if (tdstatus->idxoffnum != idxoffnum)
1633  break;
1634 
1635  /* Skip past non-deletable itup related entries up front */
1636  if (!tdstatus->knowndeletable)
1637  continue;
1638 
1639  /* Entry is first partial ptid match (or an exact match)? */
1640  ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
1641  if (ptidcmp >= 0)
1642  {
1643  /* Greater than or equal (partial or exact) match... */
1644  break;
1645  }
1646  }
1647 
1648  /* ...exact ptid match to a deletable deltids entry? */
1649  if (ptidcmp != 0)
1650  continue;
1651 
1652  /* Exact match for deletable deltids entry -- ptid gets deleted */
1653  if (vacposting == NULL)
1654  {
1655  vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1656  nitem * sizeof(uint16));
1657  vacposting->itup = itup;
1658  vacposting->updatedoffset = idxoffnum;
1659  vacposting->ndeletedtids = 0;
1660  }
1661  vacposting->deletetids[vacposting->ndeletedtids++] = p;
1662  }
1663 
1664  /* Final decision on itup, a posting list tuple */
1665 
1666  if (vacposting == NULL)
1667  {
1668  /* No TIDs to delete from itup -- do nothing */
1669  }
1670  else if (vacposting->ndeletedtids == nitem)
1671  {
1672  /* Straight delete of itup (to delete all TIDs) */
1673  deletable[ndeletable++] = idxoffnum;
1674  /* Turns out we won't need granular information */
1675  pfree(vacposting);
1676  }
1677  else
1678  {
1679  /* Delete some (but not all) TIDs from itup */
1680  Assert(vacposting->ndeletedtids > 0 &&
1681  vacposting->ndeletedtids < nitem);
1682  updatable[nupdatable++] = vacposting;
1683  }
1684  }
1685 
1686  /* Physically delete tuples (or TIDs) using deletable (or updatable) */
1687  _bt_delitems_delete(rel, buf, latestRemovedXid, deletable, ndeletable,
1688  updatable, nupdatable);
1689 
1690  /* be tidy */
1691  for (int i = 0; i < nupdatable; i++)
1692  pfree(updatable[i]);
1693 }
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:52
static TransactionId table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
Definition: tableam.h:1325
TM_IndexDelete * deltids
Definition: tableam.h:228
uint16 ndeletedtids
Definition: nbtree.h:908
uint32 TransactionId
Definition: c.h:587
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:631
OffsetNumber updatedoffset
Definition: nbtree.h:905
ItemPointerData t_tid
Definition: itup.h:37
IndexTuple itup
Definition: nbtree.h:904
bool knowndeletable
Definition: tableam.h:196
OffsetNumber idxoffnum
Definition: tableam.h:195
uint16 OffsetNumber
Definition: off.h:24
unsigned short uint16
Definition: c.h:440
void pfree(void *pointer)
Definition: mcxt.c:1169
static char * buf
Definition: pg_test_fsync.c:68
IndexTupleData * IndexTuple
Definition: itup.h:53
#define InvalidTransactionId
Definition: transam.h:31
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
static int _bt_delitems_cmp(const void *a, const void *b)
Definition: nbtpage.c:1475
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:657
uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.h:909
TM_IndexStatus * status
Definition: tableam.h:229
#define XLogStandbyInfoActive()
Definition: xlog.h:180
static void _bt_delitems_delete(Relation rel, Buffer buf, TransactionId latestRemovedXid, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
Definition: nbtpage.c:1297
ItemPointerData tid
Definition: tableam.h:189
#define InvalidOffsetNumber
Definition: off.h:26
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:537
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:485
#define Assert(condition)
Definition: c.h:804
#define RelationNeedsWAL(relation)
Definition: rel.h:601
bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2)
Definition: itemptr.c:29
#define MaxIndexTuplesPerPage
Definition: itup.h:145
void * palloc(Size size)
Definition: mcxt.c:1062
int i
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:511
#define qsort(a, b, c, d)
Definition: port.h:505
#define offsetof(type, field)
Definition: c.h:727
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78

◆ _bt_delitems_vacuum()

void _bt_delitems_vacuum ( Relation  rel,
Buffer  buf,
OffsetNumber deletable,
int  ndeletable,
BTVacuumPosting updatable,
int  nupdatable 
)

Definition at line 1167 of file nbtpage.c.

References _bt_delitems_update(), Assert, BTP_HAS_GARBAGE, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BufferGetBlockNumber(), BufferGetPage, elog, END_CRIT_SECTION, i, IndexTupleSize, BTVacuumPostingData::itup, MarkBufferDirty(), MAXALIGN, MaxIndexTuplesPerPage, xl_btree_vacuum::ndeleted, xl_btree_vacuum::nupdated, PageGetSpecialPointer, PageIndexMultiDelete(), PageIndexTupleOverwrite(), PageSetLSN, PANIC, pfree(), REGBUF_STANDARD, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeVacuum, START_CRIT_SECTION, XLOG_BTREE_VACUUM, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by btvacuumpage().

1170 {
1171  Page page = BufferGetPage(buf);
1172  BTPageOpaque opaque;
1173  bool needswal = RelationNeedsWAL(rel);
1174  char *updatedbuf = NULL;
1175  Size updatedbuflen = 0;
1176  OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1177 
1178  /* Shouldn't be called unless there's something to do */
1179  Assert(ndeletable > 0 || nupdatable > 0);
1180 
1181  /* Generate new version of posting lists without deleted TIDs */
1182  if (nupdatable > 0)
1183  updatedbuf = _bt_delitems_update(updatable, nupdatable,
1184  updatedoffsets, &updatedbuflen,
1185  needswal);
1186 
1187  /* No ereport(ERROR) until changes are logged */
1189 
1190  /*
1191  * Handle posting tuple updates.
1192  *
1193  * Deliberately do this before handling simple deletes. If we did it the
1194  * other way around (i.e. WAL record order -- simple deletes before
1195  * updates) then we'd have to make compensating changes to the 'updatable'
1196  * array of offset numbers.
1197  *
1198  * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
1199  * happens to already be set. It's important that we not interfere with
1200  * any future simple index tuple deletion operations.
1201  */
1202  for (int i = 0; i < nupdatable; i++)
1203  {
1204  OffsetNumber updatedoffset = updatedoffsets[i];
1205  IndexTuple itup;
1206  Size itemsz;
1207 
1208  itup = updatable[i]->itup;
1209  itemsz = MAXALIGN(IndexTupleSize(itup));
1210  if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
1211  itemsz))
1212  elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1214  }
1215 
1216  /* Now handle simple deletes of entire tuples */
1217  if (ndeletable > 0)
1218  PageIndexMultiDelete(page, deletable, ndeletable);
1219 
1220  /*
1221  * We can clear the vacuum cycle ID since this page has certainly been
1222  * processed by the current vacuum scan.
1223  */
1224  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1225  opaque->btpo_cycleid = 0;
1226 
1227  /*
1228  * Clear the BTP_HAS_GARBAGE page flag.
1229  *
1230  * This flag indicates the presence of LP_DEAD items on the page (though
1231  * not reliably). Note that we only rely on it with pg_upgrade'd
1232  * !heapkeyspace indexes. That's why clearing it here won't usually
1233  * interfere with simple index tuple deletion.
1234  */
1235  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1236 
1238 
1239  /* XLOG stuff */
1240  if (needswal)
1241  {
1242  XLogRecPtr recptr;
1243  xl_btree_vacuum xlrec_vacuum;
1244 
1245  xlrec_vacuum.ndeleted = ndeletable;
1246  xlrec_vacuum.nupdated = nupdatable;
1247 
1248  XLogBeginInsert();
1250  XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
1251 
1252  if (ndeletable > 0)
1253  XLogRegisterBufData(0, (char *) deletable,
1254  ndeletable * sizeof(OffsetNumber));
1255 
1256  if (nupdatable > 0)
1257  {
1258  XLogRegisterBufData(0, (char *) updatedoffsets,
1259  nupdatable * sizeof(OffsetNumber));
1260  XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1261  }
1262 
1263  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
1264 
1265  PageSetLSN(page, recptr);
1266  }
1267 
1268  END_CRIT_SECTION();
1269 
1270  /* can't leak memory here */
1271  if (updatedbuf != NULL)
1272  pfree(updatedbuf);
1273  /* free tuples allocated within _bt_delitems_update() */
1274  for (int i = 0; i < nupdatable; i++)
1275  pfree(updatable[i]->itup);
1276 }
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:378
static char * _bt_delitems_update(BTVacuumPosting *updatable, int nupdatable, OffsetNumber *updatedoffsets, Size *updatedbuflen, bool needswal)
Definition: nbtpage.c:1416
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1565
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:232
#define END_CRIT_SECTION()
Definition: miscadmin.h:149
IndexTuple itup
Definition: nbtree.h:904
Pointer Item
Definition: item.h:17
#define START_CRIT_SECTION()
Definition: miscadmin.h:147
uint16 nupdated
Definition: nbtxlog.h:224
#define PANIC
Definition: elog.h:50
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:71
uint16 OffsetNumber
Definition: off.h:24
void pfree(void *pointer)
Definition: mcxt.c:1169
#define SizeOfBtreeVacuum
Definition: nbtxlog.h:231
BTCycleId btpo_cycleid
Definition: nbtree.h:68
bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, Item newtup, Size newsize)
Definition: bufpage.c:1398
static char * buf
Definition: pg_test_fsync.c:68
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define XLOG_BTREE_VACUUM
Definition: nbtxlog.h:39
#define RelationGetRelationName(relation)
Definition: rel.h:511
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
uint16 ndeleted
Definition: nbtxlog.h:223
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:340
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:434
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:804
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:1154
size_t Size
Definition: c.h:540
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define MAXALIGN(LEN)
Definition: c.h:757
#define RelationNeedsWAL(relation)
Definition: rel.h:601
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2748
#define MaxIndexTuplesPerPage
Definition: itup.h:145
#define elog(elevel,...)
Definition: elog.h:232
int i
void XLogBeginInsert(void)
Definition: xloginsert.c:135
uint16 btpo_flags
Definition: nbtree.h:67
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
#define BTP_HAS_GARBAGE
Definition: nbtree.h:80
Pointer Page
Definition: bufpage.h:78
#define IndexTupleSize(itup)
Definition: itup.h:71

◆ _bt_doinsert()

bool _bt_doinsert ( Relation  rel,
IndexTuple  itup,
IndexUniqueCheck  checkUnique,
bool  indexUnchanged,
Relation  heapRel 
)

Definition at line 99 of file nbtinsert.c.

References _bt_check_unique(), _bt_findinsertloc(), _bt_freestack(), _bt_insertonpg(), _bt_mkscankey(), _bt_relbuf(), _bt_search_insert(), BTScanInsertData::anynullkeys, Assert, BTInsertStateData::bounds_valid, BTInsertStateData::buf, BufferGetBlockNumber(), CheckForSerializableConflictIn(), BTScanInsertData::heapkeyspace, IndexTupleSize, InvalidBuffer, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, MAXALIGN, pfree(), BTInsertStateData::postingoff, BTScanInsertData::scantid, SpeculativeInsertionWait(), IndexTupleData::t_tid, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_NO, unlikely, XactLockTableWait(), and XLTW_InsertIndex.

Referenced by btinsert().

102 {
103  bool is_unique = false;
104  BTInsertStateData insertstate;
105  BTScanInsert itup_key;
106  BTStack stack;
107  bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
108 
109  /* we need an insertion scan key to do our search, so build one */
110  itup_key = _bt_mkscankey(rel, itup);
111 
112  if (checkingunique)
113  {
114  if (!itup_key->anynullkeys)
115  {
116  /* No (heapkeyspace) scantid until uniqueness established */
117  itup_key->scantid = NULL;
118  }
119  else
120  {
121  /*
122  * Scan key for new tuple contains NULL key values. Bypass
123  * checkingunique steps. They are unnecessary because core code
124  * considers NULL unequal to every value, including NULL.
125  *
126  * This optimization avoids O(N^2) behavior within the
127  * _bt_findinsertloc() heapkeyspace path when a unique index has a
128  * large number of "duplicates" with NULL key values.
129  */
130  checkingunique = false;
131  /* Tuple is unique in the sense that core code cares about */
132  Assert(checkUnique != UNIQUE_CHECK_EXISTING);
133  is_unique = true;
134  }
135  }
136 
137  /*
138  * Fill in the BTInsertState working area, to track the current page and
139  * position within the page to insert on.
140  *
141  * Note that itemsz is passed down to lower level code that deals with
142  * inserting the item. It must be MAXALIGN()'d. This ensures that space
143  * accounting code consistently considers the alignment overhead that we
144  * expect PageAddItem() will add later. (Actually, index_form_tuple() is
145  * already conservative about alignment, but we don't rely on that from
146  * this distance. Besides, preserving the "true" tuple size in index
147  * tuple headers for the benefit of nbtsplitloc.c might happen someday.
148  * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
149  */
150  insertstate.itup = itup;
151  insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
152  insertstate.itup_key = itup_key;
153  insertstate.bounds_valid = false;
154  insertstate.buf = InvalidBuffer;
155  insertstate.postingoff = 0;
156 
157 search:
158 
159  /*
160  * Find and lock the leaf page that the tuple should be added to by
161  * searching from the root page. insertstate.buf will hold a buffer that
162  * is locked in exclusive mode afterwards.
163  */
164  stack = _bt_search_insert(rel, &insertstate);
165 
166  /*
167  * checkingunique inserts are not allowed to go ahead when two tuples with
168  * equal key attribute values would be visible to new MVCC snapshots once
169  * the xact commits. Check for conflicts in the locked page/buffer (if
170  * needed) here.
171  *
172  * It might be necessary to check a page to the right in _bt_check_unique,
173  * though that should be very rare. In practice the first page the value
174  * could be on (with scantid omitted) is almost always also the only page
175  * that a matching tuple might be found on. This is due to the behavior
176  * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
177  * only be allowed to cross a page boundary when there is no candidate
178  * leaf page split point that avoids it. Also, _bt_check_unique can use
179  * the leaf page high key to determine that there will be no duplicates on
180  * the right sibling without actually visiting it (it uses the high key in
181  * cases where the new item happens to belong at the far right of the leaf
182  * page).
183  *
184  * NOTE: obviously, _bt_check_unique can only detect keys that are already
185  * in the index; so it cannot defend against concurrent insertions of the
186  * same key. We protect against that by means of holding a write lock on
187  * the first page the value could be on, with omitted/-inf value for the
188  * implicit heap TID tiebreaker attribute. Any other would-be inserter of
189  * the same key must acquire a write lock on the same page, so only one
190  * would-be inserter can be making the check at one time. Furthermore,
191  * once we are past the check we hold write locks continuously until we
192  * have performed our insertion, so no later inserter can fail to see our
193  * insertion. (This requires some care in _bt_findinsertloc.)
194  *
195  * If we must wait for another xact, we release the lock while waiting,
196  * and then must perform a new search.
197  *
198  * For a partial uniqueness check, we don't wait for the other xact. Just
199  * let the tuple in and return false for possibly non-unique, or true for
200  * definitely unique.
201  */
202  if (checkingunique)
203  {
204  TransactionId xwait;
205  uint32 speculativeToken;
206 
207  xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
208  &is_unique, &speculativeToken);
209 
210  if (unlikely(TransactionIdIsValid(xwait)))
211  {
212  /* Have to wait for the other guy ... */
213  _bt_relbuf(rel, insertstate.buf);
214  insertstate.buf = InvalidBuffer;
215 
216  /*
217  * If it's a speculative insertion, wait for it to finish (ie. to
218  * go ahead with the insertion, or kill the tuple). Otherwise
219  * wait for the transaction to finish as usual.
220  */
221  if (speculativeToken)
222  SpeculativeInsertionWait(xwait, speculativeToken);
223  else
224  XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
225 
226  /* start over... */
227  if (stack)
228  _bt_freestack(stack);
229  goto search;
230  }
231 
232  /* Uniqueness is established -- restore heap tid as scantid */
233  if (itup_key->heapkeyspace)
234  itup_key->scantid = &itup->t_tid;
235  }
236 
237  if (checkUnique != UNIQUE_CHECK_EXISTING)
238  {
239  OffsetNumber newitemoff;
240 
241  /*
242  * The only conflict predicate locking cares about for indexes is when
243  * an index tuple insert conflicts with an existing lock. We don't
244  * know the actual page we're going to insert on for sure just yet in
245  * checkingunique and !heapkeyspace cases, but it's okay to use the
246  * first page the value could be on (with scantid omitted) instead.
247  */
249 
250  /*
251  * Do the insertion. Note that insertstate contains cached binary
252  * search bounds established within _bt_check_unique when insertion is
253  * checkingunique.
254  */
255  newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
256  indexUnchanged, stack, heapRel);
257  _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
258  itup, insertstate.itemsz, newitemoff,
259  insertstate.postingoff, false);
260  }
261  else
262  {
263  /* just release the buffer */
264  _bt_relbuf(rel, insertstate.buf);
265  }
266 
267  /* be tidy */
268  if (stack)
269  _bt_freestack(stack);
270  pfree(itup_key);
271 
272  return is_unique;
273 }
static void _bt_insertonpg(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, Size itemsz, OffsetNumber newitemoff, int postingoff, bool split_only_page)
Definition: nbtinsert.c:1103
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:175
bool bounds_valid
Definition: nbtree.h:821
uint32 TransactionId
Definition: c.h:587
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:90
ItemPointer scantid
Definition: nbtree.h:789
ItemPointerData t_tid
Definition: itup.h:37
#define InvalidBuffer
Definition: buf.h:25
uint16 OffsetNumber
Definition: off.h:24
void pfree(void *pointer)
Definition: mcxt.c:1169
void SpeculativeInsertionWait(TransactionId xid, uint32 token)
Definition: lmgr.c:797
unsigned int uint32
Definition: c.h:441
IndexTuple itup
Definition: nbtree.h:809
static BTStack _bt_search_insert(Relation rel, BTInsertState insertstate)
Definition: nbtinsert.c:314
static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, bool indexUnchanged, BTStack stack, Relation heapRel)
Definition: nbtinsert.c:815
bool anynullkeys
Definition: nbtree.h:786
void CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno)
Definition: predicate.c:4446
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1035
void XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, XLTW_Oper oper)
Definition: lmgr.c:640
BTScanInsert itup_key
Definition: nbtree.h:811
#define Assert(condition)
Definition: c.h:804
bool heapkeyspace
Definition: nbtree.h:784
#define MAXALIGN(LEN)
Definition: c.h:757
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2748
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition: nbtinsert.c:405
#define unlikely(x)
Definition: c.h:273
#define TransactionIdIsValid(xid)
Definition: transam.h:41
#define IndexTupleSize(itup)
Definition: itup.h:71

◆ _bt_end_vacuum()

void _bt_end_vacuum ( Relation  rel)

Definition at line 2026 of file nbtutils.c.

References LockRelId::dbId, i, LockInfoData::lockRelId, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTVacInfo::num_vacuums, RelationData::rd_lockInfo, LockRelId::relId, BTOneVacInfo::relid, and BTVacInfo::vacuums.

Referenced by _bt_end_vacuum_callback(), and btbulkdelete().

2027 {
2028  int i;
2029 
2030  LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
2031 
2032  /* Find the array entry */
2033  for (i = 0; i < btvacinfo->num_vacuums; i++)
2034  {
2035  BTOneVacInfo *vac = &btvacinfo->vacuums[i];
2036 
2037  if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
2038  vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
2039  {
2040  /* Remove it by shifting down the last entry */
2041  *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
2043  break;
2044  }
2045  }
2046 
2047  LWLockRelease(BtreeVacuumLock);
2048 }
LockRelId lockRelId
Definition: rel.h:45
static BTVacInfo * btvacinfo
Definition: nbtutils.c:1922
LockRelId relid
Definition: nbtutils.c:1910
Oid dbId
Definition: rel.h:40
BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtutils.c:1919
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1803
LockInfoData rd_lockInfo
Definition: rel.h:112
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1199
int num_vacuums
Definition: nbtutils.c:1917
int i
Oid relId
Definition: rel.h:39

◆ _bt_end_vacuum_callback()

void _bt_end_vacuum_callback ( int  code,
Datum  arg 
)

Definition at line 2054 of file nbtutils.c.

References _bt_end_vacuum(), and DatumGetPointer.

Referenced by btbulkdelete().

2055 {
2057 }
void _bt_end_vacuum(Relation rel)
Definition: nbtutils.c:2026
#define DatumGetPointer(X)
Definition: postgres.h:593
void * arg

◆ _bt_findsplitloc()

OffsetNumber _bt_findsplitloc ( Relation  rel,
Page  origpage,
OffsetNumber  newitemoff,
Size  newitemsz,
IndexTuple  newitem,
bool newitemonleft 
)

Definition at line 130 of file nbtsplitloc.c.

References _bt_afternewitemoff(), _bt_bestsplitloc(), _bt_defaultinterval(), _bt_deltasortsplits(), _bt_recsplitloc(), _bt_strategy(), Assert, BTGetFillFactor, BTREE_NONLEAF_FILLFACTOR, BTREE_SINGLEVAL_FILLFACTOR, BTreeTupleIsPosting(), elog, ERROR, SplitPoint::firstrightoff, i, IndexRelationGetNumberOfKeyAttributes, FindSplitData::interval, FindSplitData::is_leaf, FindSplitData::is_rightmost, ItemIdGetLength, FindSplitData::leftspace, MAXALIGN, FindSplitData::maxsplits, FindSplitData::minfirstrightsz, FindSplitData::newitem, FindSplitData::newitemoff, SplitPoint::newitemonleft, FindSplitData::newitemsz, FindSplitData::nsplits, OffsetNumberNext, FindSplitData::olddataitemstotal, FindSplitData::origpage, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_RIGHTMOST, PageGetExactFreeSpace(), PageGetItemId, PageGetMaxOffsetNumber, PageGetPageSize, PageGetSpecialPointer, palloc(), pfree(), FindSplitData::rel, RelationGetRelationName, FindSplitData::rightspace, SizeOfPageHeaderData, SPLIT_DEFAULT, SPLIT_MANY_DUPLICATES, SPLIT_SINGLE_VALUE, and FindSplitData::splits.

Referenced by _bt_split().

136 {
137  BTPageOpaque opaque;
138  int leftspace,
139  rightspace,
140  olddataitemstotal,
141  olddataitemstoleft,
142  perfectpenalty,
143  leaffillfactor;
144  FindSplitData state;
145  FindSplitStrat strategy;
146  ItemId itemid;
147  OffsetNumber offnum,
148  maxoff,
149  firstrightoff;
150  double fillfactormult;
151  bool usemult;
152  SplitPoint leftpage,
153  rightpage;
154 
155  opaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
156  maxoff = PageGetMaxOffsetNumber(origpage);
157 
158  /* Total free space available on a btree page, after fixed overhead */
159  leftspace = rightspace =
160  PageGetPageSize(origpage) - SizeOfPageHeaderData -
161  MAXALIGN(sizeof(BTPageOpaqueData));
162 
163  /* The right page will have the same high key as the old page */
164  if (!P_RIGHTMOST(opaque))
165  {
166  itemid = PageGetItemId(origpage, P_HIKEY);
167  rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
168  sizeof(ItemIdData));
169  }
170 
171  /* Count up total space in data items before actually scanning 'em */
172  olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(origpage);
173  leaffillfactor = BTGetFillFactor(rel);
174 
175  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
176  newitemsz += sizeof(ItemIdData);
177  state.rel = rel;
178  state.origpage = origpage;
179  state.newitem = newitem;
180  state.newitemsz = newitemsz;
181  state.is_leaf = P_ISLEAF(opaque);
182  state.is_rightmost = P_RIGHTMOST(opaque);
183  state.leftspace = leftspace;
184  state.rightspace = rightspace;
185  state.olddataitemstotal = olddataitemstotal;
186  state.minfirstrightsz = SIZE_MAX;
187  state.newitemoff = newitemoff;
188 
189  /* newitem cannot be a posting list item */
190  Assert(!BTreeTupleIsPosting(newitem));
191 
192  /*
193  * nsplits should never exceed maxoff because there will be at most as
194  * many candidate split points as there are points _between_ tuples, once
195  * you imagine that the new item is already on the original page (the
196  * final number of splits may be slightly lower because not all points
197  * between tuples will be legal).
198  */
199  state.maxsplits = maxoff;
200  state.splits = palloc(sizeof(SplitPoint) * state.maxsplits);
201  state.nsplits = 0;
202 
203  /*
204  * Scan through the data items and calculate space usage for a split at
205  * each possible position
206  */
207  olddataitemstoleft = 0;
208 
209  for (offnum = P_FIRSTDATAKEY(opaque);
210  offnum <= maxoff;
211  offnum = OffsetNumberNext(offnum))
212  {
213  Size itemsz;
214 
215  itemid = PageGetItemId(origpage, offnum);
216  itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
217 
218  /*
219  * When item offset number is not newitemoff, neither side of the
220  * split can be newitem. Record a split after the previous data item
221  * from original page, but before the current data item from original
222  * page. (_bt_recsplitloc() will reject the split when there are no
223  * previous items, which we rely on.)
224  */
225  if (offnum < newitemoff)
226  _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
227  else if (offnum > newitemoff)
228  _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
229  else
230  {
231  /*
232  * Record a split after all "offnum < newitemoff" original page
233  * data items, but before newitem
234  */
235  _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
236 
237  /*
238  * Record a split after newitem, but before data item from
239  * original page at offset newitemoff/current offset
240  */
241  _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
242  }
243 
244  olddataitemstoleft += itemsz;
245  }
246 
247  /*
248  * Record a split after all original page data items, but before newitem.
249  * (Though only when it's possible that newitem will end up alone on new
250  * right page.)
251  */
252  Assert(olddataitemstoleft == olddataitemstotal);
253  if (newitemoff > maxoff)
254  _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0);
255 
256  /*
257  * I believe it is not possible to fail to find a feasible split, but just
258  * in case ...
259  */
260  if (state.nsplits == 0)
261  elog(ERROR, "could not find a feasible split point for index \"%s\"",
262  RelationGetRelationName(rel));
263 
264  /*
265  * Start search for a split point among list of legal split points. Give
266  * primary consideration to equalizing available free space in each half
267  * of the split initially (start with default strategy), while applying
268  * rightmost and split-after-new-item optimizations where appropriate.
269  * Either of the two other fallback strategies may be required for cases
270  * with a large number of duplicates around the original/space-optimal
271  * split point.
272  *
273  * Default strategy gives some weight to suffix truncation in deciding a
274  * split point on leaf pages. It attempts to select a split point where a
275  * distinguishing attribute appears earlier in the new high key for the
276  * left side of the split, in order to maximize the number of trailing
277  * attributes that can be truncated away. Only candidate split points
278  * that imply an acceptable balance of free space on each side are
279  * considered. See _bt_defaultinterval().
280  */
281  if (!state.is_leaf)
282  {
283  /* fillfactormult only used on rightmost page */
284  usemult = state.is_rightmost;
285  fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0;
286  }
287  else if (state.is_rightmost)
288  {
289  /* Rightmost leaf page -- fillfactormult always used */
290  usemult = true;
291  fillfactormult = leaffillfactor / 100.0;
292  }
293  else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
294  {
295  /*
296  * New item inserted at rightmost point among a localized grouping on
297  * a leaf page -- apply "split after new item" optimization, either by
298  * applying leaf fillfactor multiplier, or by choosing the exact split
299  * point that leaves newitem as lastleft. (usemult is set for us.)
300  */
301  if (usemult)
302  {
303  /* fillfactormult should be set based on leaf fillfactor */
304  fillfactormult = leaffillfactor / 100.0;
305  }
306  else
307  {
308  /* find precise split point after newitemoff */
309  for (int i = 0; i < state.nsplits; i++)
310  {
311  SplitPoint *split = state.splits + i;
312 
313  if (split->newitemonleft &&
314  newitemoff == split->firstrightoff)
315  {
316  pfree(state.splits);
317  *newitemonleft = true;
318  return newitemoff;
319  }
320  }
321 
322  /*
323  * Cannot legally split after newitemoff; proceed with split
324  * without using fillfactor multiplier. This is defensive, and
325  * should never be needed in practice.
326  */
327  fillfactormult = 0.50;
328  }
329  }
330  else
331  {
332  /* Other leaf page. 50:50 page split. */
333  usemult = false;
334  /* fillfactormult not used, but be tidy */
335  fillfactormult = 0.50;
336  }
337 
338  /*
339  * Save leftmost and rightmost splits for page before original ordinal
340  * sort order is lost by delta/fillfactormult sort
341  */
342  leftpage = state.splits[0];
343  rightpage = state.splits[state.nsplits - 1];
344 
345  /* Give split points a fillfactormult-wise delta, and sort on deltas */
346  _bt_deltasortsplits(&state, fillfactormult, usemult);
347 
348  /* Determine split interval for default strategy */
349  state.interval = _bt_defaultinterval(&state);
350 
351  /*
352  * Determine if default strategy/split interval will produce a
353  * sufficiently distinguishing split, or if we should change strategies.
354  * Alternative strategies change the range of split points that are
355  * considered acceptable (split interval), and possibly change
356  * fillfactormult, in order to deal with pages with a large number of
357  * duplicates gracefully.
358  *
359  * Pass low and high splits for the entire page (actually, they're for an
360  * imaginary version of the page that includes newitem). These are used
361  * when the initial split interval encloses split points that are full of
362  * duplicates, and we need to consider if it's even possible to avoid
363  * appending a heap TID.
364  */
365  perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);
366 
367  if (strategy == SPLIT_DEFAULT)
368  {
369  /*
370  * Default strategy worked out (always works out with internal page).
371  * Original split interval still stands.
372  */
373  }
374 
375  /*
376  * Many duplicates strategy is used when a heap TID would otherwise be
377  * appended, but the page isn't completely full of logical duplicates.
378  *
379  * The split interval is widened to include all legal candidate split
380  * points. There might be as few as two distinct values in the whole-page
381  * split interval, though it's also possible that most of the values on
382  * the page are unique. The final split point will either be to the
383  * immediate left or to the immediate right of the group of duplicate
384  * tuples that enclose the first/delta-optimal split point (perfect
385  * penalty was set so that the lowest delta split point that avoids
386  * appending a heap TID will be chosen). Maximizing the number of
387  * attributes that can be truncated away is not a goal of the many
388  * duplicates strategy.
389  *
390  * Single value strategy is used when it is impossible to avoid appending
391  * a heap TID. It arranges to leave the left page very full. This
392  * maximizes space utilization in cases where tuples with the same
393  * attribute values span many pages. Newly inserted duplicates will tend
394  * to have higher heap TID values, so we'll end up splitting to the right
395  * consistently. (Single value strategy is harmless though not
396  * particularly useful with !heapkeyspace indexes.)
397  */
398  else if (strategy == SPLIT_MANY_DUPLICATES)
399  {
400  Assert(state.is_leaf);
401  /* Shouldn't try to truncate away extra user attributes */
402  Assert(perfectpenalty ==
403  IndexRelationGetNumberOfKeyAttributes(rel));
404  /* No need to resort splits -- no change in fillfactormult/deltas */
405  state.interval = state.nsplits;
406  }
407  else if (strategy == SPLIT_SINGLE_VALUE)
408  {
409  Assert(state.is_leaf);
410  /* Split near the end of the page */
411  usemult = true;
412  fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
413  /* Resort split points with new delta */
414  _bt_deltasortsplits(&state, fillfactormult, usemult);
415  /* Appending a heap TID is unavoidable, so interval of 1 is fine */
416  state.interval = 1;
417  }
418 
419  /*
420  * Search among acceptable split points (using final split interval) for
421  * the entry that has the lowest penalty, and is therefore expected to
422  * maximize fan-out. Sets *newitemonleft for us.
423  */
424  firstrightoff = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
425  strategy);
426  pfree(state.splits);
427 
428  return firstrightoff;
429 }
bool is_rightmost
Definition: nbtsplitloc.c:49
Size minfirstrightsz
Definition: nbtsplitloc.c:54
static int _bt_defaultinterval(FindSplitData *state)
Definition: nbtsplitloc.c:882
IndexTuple newitem
Definition: nbtsplitloc.c:46
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:369
bool newitemonleft
Definition: nbtsplitloc.c:37
OffsetNumber firstrightoff
Definition: nbtsplitloc.c:36
#define SizeOfPageHeaderData
Definition: bufpage.h:216
static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage, SplitPoint *rightpage, FindSplitStrat *strategy)
Definition: nbtsplitloc.c:940
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:357
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:71
SplitPoint * splits
Definition: nbtsplitloc.c:59
static void _bt_recsplitloc(FindSplitData *state, OffsetNumber firstrightoff, bool newitemonleft, int olddataitemstoleft, Size firstrightofforigpagetuplesz)
Definition: nbtsplitloc.c:450
uint16 OffsetNumber
Definition: off.h:24
void pfree(void *pointer)
Definition: mcxt.c:1169
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
#define BTREE_NONLEAF_FILLFACTOR
Definition: nbtree.h:200
#define ERROR
Definition: elog.h:46
static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft, FindSplitStrat strategy)
Definition: nbtsplitloc.c:794
#define PageGetPageSize(page)
Definition: bufpage.h:268
static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, int leaffillfactor, bool *usemult)
Definition: nbtsplitloc.c:636
#define RelationGetRelationName(relation)
Definition: rel.h:511
struct ItemIdData ItemIdData
Relation rel
Definition: nbtsplitloc.c:44
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:496
#define BTREE_SINGLEVAL_FILLFACTOR
Definition: nbtree.h:201
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
#define BTGetFillFactor(relation)
Definition: nbtree.h:1093
FindSplitStrat
Definition: nbtsplitloc.c:20
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:485
int olddataitemstotal
Definition: nbtsplitloc.c:53
#define Assert(condition)
Definition: c.h:804
Definition: regguts.h:317
OffsetNumber newitemoff
Definition: nbtsplitloc.c:50
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
size_t Size
Definition: c.h:540
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
#define MAXALIGN(LEN)
Definition: c.h:757
Size PageGetExactFreeSpace(Page page)
Definition: bufpage.c:951
#define P_HIKEY
Definition: nbtree.h:367
void * palloc(Size size)
Definition: mcxt.c:1062
#define elog(elevel,...)
Definition: elog.h:232
int i
static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult, bool usemult)
Definition: nbtsplitloc.c:567
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:218
#define P_ISLEAF(opaque)
Definition: nbtree.h:219

◆ _bt_finish_split()

void _bt_finish_split ( Relation  rel,
Buffer  lbuf,
BTStack  stack 
)

Definition at line 2213 of file nbtinsert.c.

References _bt_getbuf(), _bt_insert_parent(), _bt_relbuf(), Assert, BT_WRITE, BTMetaPageData::btm_root, BTPageGetMeta, BTPageOpaqueData::btpo_next, BTREE_METAPAGE, BufferGetBlockNumber(), BufferGetPage, DEBUG1, elog, P_INCOMPLETE_SPLIT, P_LEFTMOST, P_RIGHTMOST, and PageGetSpecialPointer.

Referenced by _bt_getstackbuf(), _bt_moveright(), and _bt_stepright().

2214 {
2215  Page lpage = BufferGetPage(lbuf);
2216  BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
2217  Buffer rbuf;
2218  Page rpage;
2219  BTPageOpaque rpageop;
2220  bool wasroot;
2221  bool wasonly;
2222 
2223  Assert(P_INCOMPLETE_SPLIT(lpageop));
2224 
2225  /* Lock right sibling, the one missing the downlink */
2226  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2227  rpage = BufferGetPage(rbuf);
2228  rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
2229 
2230  /* Could this be a root split? */
2231  if (!stack)
2232  {
2233  Buffer metabuf;
2234  Page metapg;
2235  BTMetaPageData *metad;
2236 
2237  /* acquire lock on the metapage */
2238  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2239  metapg = BufferGetPage(metabuf);
2240  metad = BTPageGetMeta(metapg);
2241 
2242  wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
2243 
2244  _bt_relbuf(rel, metabuf);
2245  }
2246  else
2247  wasroot = false;
2248 
2249  /* Was this the only page on the level before split? */
2250  wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2251 
2252  elog(DEBUG1, "finishing incomplete split of %u/%u",
2253  BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2254 
2255  _bt_insert_parent(rel, lbuf, rbuf, stack, wasroot, wasonly);
2256 }
BlockNumber btpo_next
Definition: nbtree.h:65
#define DEBUG1
Definition: elog.h:25
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:871
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool isroot, bool isonly)
Definition: nbtinsert.c:2077
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:226
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:71
#define BTPageGetMeta(p)
Definition: nbtree.h:119
#define P_LEFTMOST(opaque)
Definition: nbtree.h:217
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define BTREE_METAPAGE
Definition: nbtree.h:146
BlockNumber btm_root
Definition: nbtree.h:105
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1035
#define Assert(condition)
Definition: c.h:804
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2748
#define elog(elevel,...)
Definition: elog.h:232
#define BT_WRITE
Definition: nbtree.h:713
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:218
Pointer Page
Definition: bufpage.h:78

◆ _bt_first()

bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 848 of file nbtsearch.c.

References _bt_binsrch(), _bt_drop_lock_and_maybe_pin(), _bt_endpoint(), _bt_freestack(), _bt_initialize_more_data(), _bt_metaversion(), _bt_parallel_done(), _bt_parallel_readpage(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readpage(), _bt_search(), _bt_steppage(), _bt_unlockbuf(), Assert, BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosInvalidate, BTScanPosIsValid, buf, BTScanPosData::buf, BufferGetBlockNumber(), BufferIsValid, cur, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, DatumGetPointer, elog, ERROR, get_opfamily_proc(), i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, InvalidBlockNumber, InvalidOid, InvalidStrategy, BTScanPosData::itemIndex, BTScanPosData::items, BTScanOpaqueData::keyData, BTScanOpaqueData::numberOfKeys, OffsetNumberPrev, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockPage(), PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, ScanKeyData::sk_subtype, status(), IndexScanDescData::xs_heaptid, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

849 {
850  Relation rel = scan->indexRelation;
851  BTScanOpaque so = (BTScanOpaque) scan->opaque;
852  Buffer buf;
853  BTStack stack;
854  OffsetNumber offnum;
855  StrategyNumber strat;
856  bool nextkey;
857  bool goback;
858  BTScanInsertData inskey;
859  ScanKey startKeys[INDEX_MAX_KEYS];
860  ScanKeyData notnullkeys[INDEX_MAX_KEYS];
861  int keysCount = 0;
862  int i;
863  bool status;
864  StrategyNumber strat_total;
865  BTScanPosItem *currItem;
866  BlockNumber blkno;
867 
869 
871 
872  /*
873  * Examine the scan keys and eliminate any redundant keys; also mark the
874  * keys that must be matched to continue the scan.
875  */
876  _bt_preprocess_keys(scan);
877 
878  /*
879  * Quit now if _bt_preprocess_keys() discovered that the scan keys can
880  * never be satisfied (eg, x == 1 AND x > 2).
881  */
882  if (!so->qual_ok)
883  {
884  /* Notify any other workers that we're done with this scan key. */
885  _bt_parallel_done(scan);
886  return false;
887  }
888 
889  /*
890  * For parallel scans, get the starting page from shared state. If the
891  * scan has not started, proceed to find out first leaf page in the usual
892  * way while keeping other participating processes waiting. If the scan
893  * has already begun, use the page number from the shared structure.
894  */
895  if (scan->parallel_scan != NULL)
896  {
897  status = _bt_parallel_seize(scan, &blkno);
898  if (!status)
899  return false;
900  else if (blkno == P_NONE)
901  {
902  _bt_parallel_done(scan);
903  return false;
904  }
905  else if (blkno != InvalidBlockNumber)
906  {
907  if (!_bt_parallel_readpage(scan, blkno, dir))
908  return false;
909  goto readcomplete;
910  }
911  }
912 
913  /*----------
914  * Examine the scan keys to discover where we need to start the scan.
915  *
916  * We want to identify the keys that can be used as starting boundaries;
917  * these are =, >, or >= keys for a forward scan or =, <, <= keys for
918  * a backwards scan. We can use keys for multiple attributes so long as
919  * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
920  * a > or < boundary or find an attribute with no boundary (which can be
921  * thought of as the same as "> -infinity"), we can't use keys for any
922  * attributes to its right, because it would break our simplistic notion
923  * of what initial positioning strategy to use.
924  *
925  * When the scan keys include cross-type operators, _bt_preprocess_keys
926  * may not be able to eliminate redundant keys; in such cases we will
927  * arbitrarily pick a usable one for each attribute. This is correct
928  * but possibly not optimal behavior. (For example, with keys like
929  * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
930  * x=5 would be more efficient.) Since the situation only arises given
931  * a poorly-worded query plus an incomplete opfamily, live with it.
932  *
933  * When both equality and inequality keys appear for a single attribute
934  * (again, only possible when cross-type operators appear), we *must*
935  * select one of the equality keys for the starting point, because
936  * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
937  * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
938  * start at x=4, we will fail and stop before reaching x=10. If multiple
939  * equality quals survive preprocessing, however, it doesn't matter which
940  * one we use --- by definition, they are either redundant or
941  * contradictory.
942  *
943  * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
944  * If the index stores nulls at the end of the index we'll be starting
945  * from, and we have no boundary key for the column (which means the key
946  * we deduced NOT NULL from is an inequality key that constrains the other
947  * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
948  * use as a boundary key. If we didn't do this, we might find ourselves
949  * traversing a lot of null entries at the start of the scan.
950  *
951  * In this loop, row-comparison keys are treated the same as keys on their
952  * first (leftmost) columns. We'll add on lower-order columns of the row
953  * comparison below, if possible.
954  *
955  * The selected scan keys (at most one per index column) are remembered by
956  * storing their addresses into the local startKeys[] array.
957  *----------
958  */
959  strat_total = BTEqualStrategyNumber;
960  if (so->numberOfKeys > 0)
961  {
962  AttrNumber curattr;
963  ScanKey chosen;
964  ScanKey impliesNN;
965  ScanKey cur;
966 
967  /*
968  * chosen is the so-far-chosen key for the current attribute, if any.
969  * We don't cast the decision in stone until we reach keys for the
970  * next attribute.
971  */
972  curattr = 1;
973  chosen = NULL;
974  /* Also remember any scankey that implies a NOT NULL constraint */
975  impliesNN = NULL;
976 
977  /*
978  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
979  * pass to handle after-last-key processing. Actual exit from the
980  * loop is at one of the "break" statements below.
981  */
982  for (cur = so->keyData, i = 0;; cur++, i++)
983  {
984  if (i >= so->numberOfKeys || cur->sk_attno != curattr)
985  {
986  /*
987  * Done looking at keys for curattr. If we didn't find a
988  * usable boundary key, see if we can deduce a NOT NULL key.
989  */
990  if (chosen == NULL && impliesNN != NULL &&
991  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
994  {
995  /* Yes, so build the key in notnullkeys[keysCount] */
996  chosen = &notnullkeys[keysCount];
997  ScanKeyEntryInitialize(chosen,
999  (impliesNN->sk_flags &
1001  curattr,
1002  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1005  InvalidOid,
1006  InvalidOid,
1007  InvalidOid,
1008  (Datum) 0);
1009  }
1010 
1011  /*
1012  * If we still didn't find a usable boundary key, quit; else
1013  * save the boundary key pointer in startKeys.
1014  */
1015  if (chosen == NULL)
1016  break;
1017  startKeys[keysCount++] = chosen;
1018 
1019  /*
1020  * Adjust strat_total, and quit if we have stored a > or <
1021  * key.
1022  */
1023  strat = chosen->sk_strategy;
1024  if (strat != BTEqualStrategyNumber)
1025  {
1026  strat_total = strat;
1027  if (strat == BTGreaterStrategyNumber ||
1028  strat == BTLessStrategyNumber)
1029  break;
1030  }
1031 
1032  /*
1033  * Done if that was the last attribute, or if next key is not
1034  * in sequence (implying no boundary key is available for the
1035  * next attribute).
1036  */
1037  if (i >= so->numberOfKeys ||
1038  cur->sk_attno != curattr + 1)
1039  break;
1040 
1041  /*
1042  * Reset for next attr.
1043  */
1044  curattr = cur->sk_attno;
1045  chosen = NULL;
1046  impliesNN = NULL;
1047  }
1048 
1049  /*
1050  * Can we use this key as a starting boundary for this attr?
1051  *
1052  * If not, does it imply a NOT NULL constraint? (Because
1053  * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1054  * *any* inequality key works for that; we need not test.)
1055  */
1056  switch (cur->sk_strategy)
1057  {
1058  case BTLessStrategyNumber:
1060  if (chosen == NULL)
1061  {
1062  if (ScanDirectionIsBackward(dir))
1063  chosen = cur;
1064  else
1065  impliesNN = cur;
1066  }
1067  break;
1068  case BTEqualStrategyNumber:
1069  /* override any non-equality choice */
1070  chosen = cur;
1071  break;
1074  if (chosen == NULL)
1075  {
1076  if (ScanDirectionIsForward(dir))
1077  chosen = cur;
1078  else
1079  impliesNN = cur;
1080  }
1081  break;
1082  }
1083  }
1084  }
1085 
1086  /*
1087  * If we found no usable boundary keys, we have to start from one end of
1088  * the tree. Walk down that edge to the first or last key, and scan from
1089  * there.
1090  */
1091  if (keysCount == 0)
1092  {
1093  bool match;
1094 
1095  match = _bt_endpoint(scan, dir);
1096 
1097  if (!match)
1098  {
1099  /* No match, so mark (parallel) scan finished */
1100  _bt_parallel_done(scan);
1101  }
1102 
1103  return match;
1104  }
1105 
1106  /*
1107  * We want to start the scan somewhere within the index. Set up an
1108  * insertion scankey we can use to search for the boundary point we
1109  * identified above. The insertion scankey is built using the keys
1110  * identified by startKeys[]. (Remaining insertion scankey fields are
1111  * initialized after initial-positioning strategy is finalized.)
1112  */
1113  Assert(keysCount <= INDEX_MAX_KEYS);
1114  for (i = 0; i < keysCount; i++)
1115  {
1116  ScanKey cur = startKeys[i];
1117 
1118  Assert(cur->sk_attno == i + 1);
1119 
1120  if (cur->sk_flags & SK_ROW_HEADER)
1121  {
1122  /*
1123  * Row comparison header: look to the first row member instead.
1124  *
1125  * The member scankeys are already in insertion format (ie, they
1126  * have sk_func = 3-way-comparison function), but we have to watch
1127  * out for nulls, which _bt_preprocess_keys didn't check. A null
1128  * in the first row member makes the condition unmatchable, just
1129  * like qual_ok = false.
1130  */
1131  ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1132 
1133  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1134  if (subkey->sk_flags & SK_ISNULL)
1135  {
1136  _bt_parallel_done(scan);
1137  return false;
1138  }
1139  memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1140 
1141  /*
1142  * If the row comparison is the last positioning key we accepted,
1143  * try to add additional keys from the lower-order row members.
1144  * (If we accepted independent conditions on additional index
1145  * columns, we use those instead --- doesn't seem worth trying to
1146  * determine which is more restrictive.) Note that this is OK
1147  * even if the row comparison is of ">" or "<" type, because the
1148  * condition applied to all but the last row member is effectively
1149  * ">=" or "<=", and so the extra keys don't break the positioning
1150  * scheme. But, by the same token, if we aren't able to use all
1151  * the row members, then the part of the row comparison that we
1152  * did use has to be treated as just a ">=" or "<=" condition, and
1153  * so we'd better adjust strat_total accordingly.
1154  */
1155  if (i == keysCount - 1)
1156  {
1157  bool used_all_subkeys = false;
1158 
1159  Assert(!(subkey->sk_flags & SK_ROW_END));
1160  for (;;)
1161  {
1162  subkey++;
1163  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1164  if (subkey->sk_attno != keysCount + 1)
1165  break; /* out-of-sequence, can't use it */
1166  if (subkey->sk_strategy != cur->sk_strategy)
1167  break; /* wrong direction, can't use it */
1168  if (subkey->sk_flags & SK_ISNULL)
1169  break; /* can't use null keys */
1170  Assert(keysCount < INDEX_MAX_KEYS);
1171  memcpy(inskey.scankeys + keysCount, subkey,
1172  sizeof(ScanKeyData));
1173  keysCount++;
1174  if (subkey->sk_flags & SK_ROW_END)
1175  {
1176  used_all_subkeys = true;
1177  break;
1178  }
1179  }
1180  if (!used_all_subkeys)
1181  {
1182  switch (strat_total)
1183  {
1184  case BTLessStrategyNumber:
1185  strat_total = BTLessEqualStrategyNumber;
1186  break;
1188  strat_total = BTGreaterEqualStrategyNumber;
1189  break;
1190  }
1191  }
1192  break; /* done with outer loop */
1193  }
1194  }
1195  else
1196  {
1197  /*
1198  * Ordinary comparison key. Transform the search-style scan key
1199  * to an insertion scan key by replacing the sk_func with the
1200  * appropriate btree comparison function.
1201  *
1202  * If scankey operator is not a cross-type comparison, we can use
1203  * the cached comparison function; otherwise gotta look it up in
1204  * the catalogs. (That can't lead to infinite recursion, since no
1205  * indexscan initiated by syscache lookup will use cross-data-type
1206  * operators.)
1207  *
1208  * We support the convention that sk_subtype == InvalidOid means
1209  * the opclass input type; this is a hack to simplify life for
1210  * ScanKeyInit().
1211  */
1212  if (cur->sk_subtype == rel->rd_opcintype[i] ||
1213  cur->sk_subtype == InvalidOid)
1214  {
1215  FmgrInfo *procinfo;
1216 
1217  procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1218  ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1219  cur->sk_flags,
1220  cur->sk_attno,
1222  cur->sk_subtype,
1223  cur->sk_collation,
1224  procinfo,
1225  cur->sk_argument);
1226  }
1227  else
1228  {
1229  RegProcedure cmp_proc;
1230 
1231  cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1232  rel->rd_opcintype[i],
1233  cur->sk_subtype,
1234  BTORDER_PROC);
1235  if (!RegProcedureIsValid(cmp_proc))
1236  elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1237  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1238  cur->sk_attno, RelationGetRelationName(rel));
1239  ScanKeyEntryInitialize(inskey.scankeys + i,
1240  cur->sk_flags,
1241  cur->sk_attno,
1243  cur->sk_subtype,
1244  cur->sk_collation,
1245  cmp_proc,
1246  cur->sk_argument);
1247  }
1248  }
1249  }
1250 
1251  /*----------
1252  * Examine the selected initial-positioning strategy to determine exactly
1253  * where we need to start the scan, and set flag variables to control the
1254  * code below.
1255  *
1256  * If nextkey = false, _bt_search and _bt_binsrch will locate the first
1257  * item >= scan key. If nextkey = true, they will locate the first
1258  * item > scan key.
1259  *
1260  * If goback = true, we will then step back one item, while if
1261  * goback = false, we will start the scan on the located item.
1262  *----------
1263  */
1264  switch (strat_total)
1265  {
1266  case BTLessStrategyNumber:
1267 
1268  /*
1269  * Find first item >= scankey, then back up one to arrive at last
1270  * item < scankey. (Note: this positioning strategy is only used
1271  * for a backward scan, so that is always the correct starting
1272  * position.)
1273  */
1274  nextkey = false;
1275  goback = true;
1276  break;
1277 
1279 
1280  /*
1281  * Find first item > scankey, then back up one to arrive at last
1282  * item <= scankey. (Note: this positioning strategy is only used
1283  * for a backward scan, so that is always the correct starting
1284  * position.)
1285  */
1286  nextkey = true;
1287  goback = true;
1288  break;
1289 
1290  case BTEqualStrategyNumber:
1291 
1292  /*
1293  * If a backward scan was specified, need to start with last equal
1294  * item not first one.
1295  */
1296  if (ScanDirectionIsBackward(dir))
1297  {
1298  /*
1299  * This is the same as the <= strategy. We will check at the
1300  * end whether the found item is actually =.
1301  */
1302  nextkey = true;
1303  goback = true;
1304  }
1305  else
1306  {
1307  /*
1308  * This is the same as the >= strategy. We will check at the
1309  * end whether the found item is actually =.
1310  */
1311  nextkey = false;
1312  goback = false;
1313  }
1314  break;
1315 
1317 
1318  /*
1319  * Find first item >= scankey. (This is only used for forward
1320  * scans.)
1321  */
1322  nextkey = false;
1323  goback = false;
1324  break;
1325 
1327 
1328  /*
1329  * Find first item > scankey. (This is only used for forward
1330  * scans.)
1331  */
1332  nextkey = true;
1333  goback = false;
1334  break;
1335 
1336  default:
1337  /* can't get here, but keep compiler quiet */
1338  elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1339  return false;
1340  }
1341 
1342  /* Initialize remaining insertion scan key fields */
1343  _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1344  inskey.anynullkeys = false; /* unused */
1345  inskey.nextkey = nextkey;
1346  inskey.pivotsearch = false;
1347  inskey.scantid = NULL;
1348  inskey.keysz = keysCount;
1349 
1350  /*
1351  * Use the manufactured insertion scan key to descend the tree and
1352  * position ourselves on the target leaf page.
1353  */
1354  stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);
1355 
1356  /* don't need to keep the stack around... */
1357  _bt_freestack(stack);
1358 
1359  if (!BufferIsValid(buf))
1360  {
1361  /*
1362  * We only get here if the index is completely empty. Lock relation
1363  * because nothing finer to lock exists.
1364  */
1365  PredicateLockRelation(rel, scan->xs_snapshot);
1366 
1367  /*
1368  * mark parallel scan as done, so that all the workers can finish
1369  * their scan
1370  */
1371  _bt_parallel_done(scan);
1373 
1374  return false;
1375  }
1376  else
1378  scan->xs_snapshot);
1379 
1380  _bt_initialize_more_data(so, dir);
1381 
1382  /* position to the precise item on the page */
1383  offnum = _bt_binsrch(rel, &inskey, buf);
1384 
1385  /*
1386  * If nextkey = false, we are positioned at the first item >= scan key, or
1387  * possibly at the end of a page on which all the existing items are less
1388  * than the scan key and we know that everything on later pages is greater
1389  * than or equal to scan key.
1390  *
1391  * If nextkey = true, we are positioned at the first item > scan key, or
1392  * possibly at the end of a page on which all the existing items are less
1393  * than or equal to the scan key and we know that everything on later
1394  * pages is greater than scan key.
1395  *
1396  * The actually desired starting point is either this item or the prior
1397  * one, or in the end-of-page case it's the first item on the next page or
1398  * the last item on this page. Adjust the starting offset if needed. (If
1399  * this results in an offset before the first item or after the last one,
1400  * _bt_readpage will report no items found, and then we'll step to the
1401  * next page as needed.)
1402  */
1403  if (goback)
1404  offnum = OffsetNumberPrev(offnum);
1405 
1406  /* remember which buffer we have pinned, if any */
1408  so->currPos.buf = buf;
1409 
1410  /*
1411  * Now load data from the first page of the scan.
1412  */
1413  if (!_bt_readpage(scan, dir, offnum))
1414  {
1415  /*
1416  * There's no actually-matching data on this page. Try to advance to
1417  * the next page. Return false if there's no matching data at all.
1418  */
1419  _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
1420  if (!_bt_steppage(scan, dir))
1421  return false;
1422  }
1423  else
1424  {
1425  /* Drop the lock, and maybe the pin, on the current page */
1427  }
1428 
1429 readcomplete:
1430  /* OK, itemIndex says what to return */
1431  currItem = &so->currPos.items[so->currPos.itemIndex];
1432  scan->xs_heaptid = currItem->heapTid;
1433  if (scan->xs_want_itup)
1434  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1435 
1436  return true;
1437 }
#define InvalidStrategy
Definition: stratnum.h:24
Oid sk_subtype
Definition: skey.h:69
Definition: fmgr.h:56
#define SK_ROW_MEMBER
Definition: skey.h:118
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:175
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2592
#define BTORDER_PROC
Definition: nbtree.h:700
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:803
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
IndexTuple xs_itup
Definition: relscan.h:142
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:715
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:2146
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:65
int itemIndex
Definition: nbtree.h:981
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2569
char * currTuples
Definition: nbtree.h:1055
regproc RegProcedure
Definition: c.h:585
#define P_NONE
Definition: nbtree.h:211
struct cursor * cur
Definition: ecpg.c:28
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:1005
struct SnapshotData * xs_snapshot
Definition: relscan.h:119
uint32 BlockNumber
Definition: block.h:31
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:343
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
Relation indexRelation
Definition: relscan.h:118
uint16 OffsetNumber
Definition: off.h:24
#define SK_ROW_END
Definition: skey.h:119
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
#define BT_READ
Definition: nbtree.h:712
#define ERROR
Definition: elog.h:46
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:1072
StrategyNumber sk_strategy
Definition: skey.h:68
ItemPointerData xs_heaptid
Definition: relscan.h:147
static char * buf
Definition: pg_test_fsync.c:68
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
ScanKeyData * ScanKey
Definition: skey.h:75
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2379
#define RegProcedureIsValid(p)
Definition: c.h:712
IndexTupleData * IndexTuple
Definition: itup.h:53
#define RelationGetRelationName(relation)
Definition: rel.h:511
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:1067
#define SK_SEARCHNOTNULL
Definition: skey.h:122
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
Definition: nbtree.c:634
Oid * rd_opfamily
Definition: rel.h:202
#define SK_ISNULL
Definition: skey.h:115
#define SK_ROW_HEADER
Definition: skey.h:117
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition: nbtpage.c:736
void _bt_preprocess_keys(IndexScanDesc scan)
Definition: nbtutils.c:742
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:1083
static void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
Definition: nbtsearch.c:2469
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:166
uintptr_t Datum
Definition: postgres.h:411
#define InvalidOid
Definition: postgres_ext.h:36
int sk_flags
Definition: skey.h:66
#define Assert(condition)
Definition: c.h:804
#define SK_BT_DESC
Definition: nbtree.h:1082
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:794
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:1011
#define INDEX_MAX_KEYS
#define InvalidBlockNumber
Definition: block.h:33
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:983
#define DatumGetPointer(X)
Definition: postgres.h:593
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1082
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2748
BTScanPosData currPos
Definition: nbtree.h:1068
ScanKey keyData
Definition: nbtree.h:1035
Oid sk_collation
Definition: skey.h:70
#define elog(elevel,...)
Definition: elog.h:232
int i
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1861
Buffer buf
Definition: nbtree.h:951
Oid * rd_opcintype
Definition: rel.h:203
static void static void status(const char *fmt,...) pg_attribute_printf(1
Definition: pg_regress.c:229
#define BTLessStrategyNumber
Definition: stratnum.h:29
int Buffer
Definition: buf.h:23
Datum sk_argument
Definition: skey.h:72
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Definition: nbtsearch.c:1509
int16 AttrNumber
Definition: attnum.h:21
BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:101
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
AttrNumber sk_attno
Definition: skey.h:67

◆ _bt_form_posting()

IndexTuple _bt_form_posting ( IndexTuple  base,
ItemPointer  htids,
int  nhtids 
)