PostgreSQL Source Code  git master
nbtree.h File Reference
#include "access/amapi.h"
#include "access/itup.h"
#include "access/sdir.h"
#include "access/tableam.h"
#include "access/xlogreader.h"
#include "catalog/pg_am_d.h"
#include "catalog/pg_index.h"
#include "lib/stringinfo.h"
#include "storage/bufmgr.h"
#include "storage/shm_toc.h"
Include dependency graph for nbtree.h: [graph omitted]
This graph shows which files directly or indirectly include this file: [graph omitted]


Data Structures

struct  BTPageOpaqueData
 
struct  BTMetaPageData
 
struct  BTDeletedPageData
 
struct  BTPendingFSM
 
struct  BTVacState
 
struct  BTStackData
 
struct  BTScanInsertData
 
struct  BTInsertStateData
 
struct  BTDedupInterval
 
struct  BTDedupStateData
 
struct  BTVacuumPostingData
 
struct  BTScanPosItem
 
struct  BTScanPosData
 
struct  BTArrayKeyInfo
 
struct  BTScanOpaqueData
 
struct  BTReadPageState
 
struct  BTOptions
 

Macros

#define BTPageGetOpaque(page)   ((BTPageOpaque) PageGetSpecialPointer(page))
 
#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */
 
#define BTP_ROOT   (1 << 1) /* root page (has no parent) */
 
#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */
 
#define BTP_META   (1 << 3) /* meta-page */
 
#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */
 
#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */
 
#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples (deprecated) */
 
#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */
 
#define BTP_HAS_FULLXID   (1 << 8) /* contains BTDeletedPageData */
 
#define MAX_BT_CYCLE_ID   0xFF7F
 
#define BTPageGetMeta(p)    ((BTMetaPageData *) PageGetContents(p))
 
#define BTREE_METAPAGE   0 /* first page is meta */
 
#define BTREE_MAGIC   0x053162 /* magic number in metapage */
 
#define BTREE_VERSION   4 /* current version number */
 
#define BTREE_MIN_VERSION   2 /* minimum supported version */
 
#define BTREE_NOVAC_VERSION   3 /* version with all meta fields set */
 
#define BTMaxItemSize(page)
 
#define BTMaxItemSizeNoHeapTid(page)
 
#define MaxTIDsPerBTreePage
 
#define BTREE_MIN_FILLFACTOR   10
 
#define BTREE_DEFAULT_FILLFACTOR   90
 
#define BTREE_NONLEAF_FILLFACTOR   70
 
#define BTREE_SINGLEVAL_FILLFACTOR   96
 
#define P_NONE   0
 
#define P_LEFTMOST(opaque)   ((opaque)->btpo_prev == P_NONE)
 
#define P_RIGHTMOST(opaque)   ((opaque)->btpo_next == P_NONE)
 
#define P_ISLEAF(opaque)   (((opaque)->btpo_flags & BTP_LEAF) != 0)
 
#define P_ISROOT(opaque)   (((opaque)->btpo_flags & BTP_ROOT) != 0)
 
#define P_ISDELETED(opaque)   (((opaque)->btpo_flags & BTP_DELETED) != 0)
 
#define P_ISMETA(opaque)   (((opaque)->btpo_flags & BTP_META) != 0)
 
#define P_ISHALFDEAD(opaque)   (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)
 
#define P_IGNORE(opaque)   (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
 
#define P_HAS_GARBAGE(opaque)   (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
 
#define P_INCOMPLETE_SPLIT(opaque)   (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
 
#define P_HAS_FULLXID(opaque)   (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
 
#define P_HIKEY   ((OffsetNumber) 1)
 
#define P_FIRSTKEY   ((OffsetNumber) 2)
 
#define P_FIRSTDATAKEY(opaque)   (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
 
#define INDEX_ALT_TID_MASK   INDEX_AM_RESERVED_BIT
 
#define BT_OFFSET_MASK   0x0FFF
 
#define BT_STATUS_OFFSET_MASK   0xF000
 
#define BT_PIVOT_HEAP_TID_ATTR   0x1000
 
#define BT_IS_POSTING   0x2000
 
#define BTreeTupleGetNAtts(itup, rel)
 
#define BTCommuteStrategyNumber(strat)   (BTMaxStrategyNumber + 1 - (strat))
 
#define BTORDER_PROC   1
 
#define BTSORTSUPPORT_PROC   2
 
#define BTINRANGE_PROC   3
 
#define BTEQUALIMAGE_PROC   4
 
#define BTOPTIONS_PROC   5
 
#define BTNProcs   5
 
#define BT_READ   BUFFER_LOCK_SHARE
 
#define BT_WRITE   BUFFER_LOCK_EXCLUSIVE
 
#define BTScanPosIsPinned(scanpos)
 
#define BTScanPosUnpin(scanpos)
 
#define BTScanPosUnpinIfPinned(scanpos)
 
#define BTScanPosIsValid(scanpos)
 
#define BTScanPosInvalidate(scanpos)
 
#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */
 
#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */
 
#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */
 
#define SK_BT_DESC   (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
 
#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
 
#define BTGetFillFactor(relation)
 
#define BTGetTargetPageFreeSpace(relation)    (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)
 
#define BTGetDeduplicateItems(relation)
 
#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4
 
#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5
 

Typedefs

typedef uint16 BTCycleId
 
typedef struct BTPageOpaqueData BTPageOpaqueData
 
typedef BTPageOpaqueData* BTPageOpaque
 
typedef struct BTMetaPageData BTMetaPageData
 
typedef struct BTDeletedPageData BTDeletedPageData
 
typedef struct BTPendingFSM BTPendingFSM
 
typedef struct BTVacState BTVacState
 
typedef struct BTStackData BTStackData
 
typedef BTStackData* BTStack
 
typedef struct BTScanInsertData BTScanInsertData
 
typedef BTScanInsertData* BTScanInsert
 
typedef struct BTInsertStateData BTInsertStateData
 
typedef BTInsertStateData* BTInsertState
 
typedef struct BTDedupInterval BTDedupInterval
 
typedef struct BTDedupStateData BTDedupStateData
 
typedef BTDedupStateData* BTDedupState
 
typedef struct BTVacuumPostingData BTVacuumPostingData
 
typedef BTVacuumPostingData* BTVacuumPosting
 
typedef struct BTScanPosItem BTScanPosItem
 
typedef struct BTScanPosData BTScanPosData
 
typedef BTScanPosData* BTScanPos
 
typedef struct BTArrayKeyInfo BTArrayKeyInfo
 
typedef struct BTScanOpaqueData BTScanOpaqueData
 
typedef BTScanOpaqueData* BTScanOpaque
 
typedef struct BTReadPageState BTReadPageState
 
typedef struct BTOptions BTOptions
 

Functions

static void BTPageSetDeleted (Page page, FullTransactionId safexid)
 
static FullTransactionId BTPageGetDeleteXid (Page page)
 
static bool BTPageIsRecyclable (Page page, Relation heaprel)
 
 StaticAssertDecl (BT_OFFSET_MASK >= INDEX_MAX_KEYS, "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS")
 
static bool BTreeTupleIsPivot (IndexTuple itup)
 
static bool BTreeTupleIsPosting (IndexTuple itup)
 
static void BTreeTupleSetPosting (IndexTuple itup, uint16 nhtids, int postingoffset)
 
static uint16 BTreeTupleGetNPosting (IndexTuple posting)
 
static uint32 BTreeTupleGetPostingOffset (IndexTuple posting)
 
static ItemPointer BTreeTupleGetPosting (IndexTuple posting)
 
static ItemPointer BTreeTupleGetPostingN (IndexTuple posting, int n)
 
static BlockNumber BTreeTupleGetDownLink (IndexTuple pivot)
 
static void BTreeTupleSetDownLink (IndexTuple pivot, BlockNumber blkno)
 
static void BTreeTupleSetNAtts (IndexTuple itup, uint16 nkeyatts, bool heaptid)
 
static BlockNumber BTreeTupleGetTopParent (IndexTuple leafhikey)
 
static void BTreeTupleSetTopParent (IndexTuple leafhikey, BlockNumber blkno)
 
static ItemPointer BTreeTupleGetHeapTID (IndexTuple itup)
 
static ItemPointer BTreeTupleGetMaxHeapTID (IndexTuple itup)
 
void btbuildempty (Relation index)
 
bool btinsert (Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo)
 
IndexScanDesc btbeginscan (Relation rel, int nkeys, int norderbys)
 
Size btestimateparallelscan (int nkeys, int norderbys)
 
void btinitparallelscan (void *target)
 
bool btgettuple (IndexScanDesc scan, ScanDirection dir)
 
int64 btgetbitmap (IndexScanDesc scan, TIDBitmap *tbm)
 
void btrescan (IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
 
void btparallelrescan (IndexScanDesc scan)
 
void btendscan (IndexScanDesc scan)
 
void btmarkpos (IndexScanDesc scan)
 
void btrestrpos (IndexScanDesc scan)
 
IndexBulkDeleteResult * btbulkdelete (IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
 
IndexBulkDeleteResult * btvacuumcleanup (IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
bool btcanreturn (Relation index, int attno)
 
int btgettreeheight (Relation rel)
 
bool _bt_parallel_seize (IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
 
void _bt_parallel_release (IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
 
void _bt_parallel_done (IndexScanDesc scan)
 
void _bt_parallel_primscan_schedule (IndexScanDesc scan, BlockNumber curr_page)
 
void _bt_dedup_pass (Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, bool bottomupdedup)
 
bool _bt_bottomupdel_pass (Relation rel, Buffer buf, Relation heapRel, Size newitemsz)
 
void _bt_dedup_start_pending (BTDedupState state, IndexTuple base, OffsetNumber baseoff)
 
bool _bt_dedup_save_htid (BTDedupState state, IndexTuple itup)
 
Size _bt_dedup_finish_pending (Page newpage, BTDedupState state)
 
IndexTuple _bt_form_posting (IndexTuple base, ItemPointer htids, int nhtids)
 
void _bt_update_posting (BTVacuumPosting vacposting)
 
IndexTuple _bt_swap_posting (IndexTuple newitem, IndexTuple oposting, int postingoff)
 
bool _bt_doinsert (Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
 
void _bt_finish_split (Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
 
Buffer _bt_getstackbuf (Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
 
OffsetNumber _bt_findsplitloc (Relation rel, Page origpage, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
 
void _bt_initmetapage (Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
 
bool _bt_vacuum_needs_cleanup (Relation rel)
 
void _bt_set_cleanup_info (Relation rel, BlockNumber num_delpages)
 
void _bt_upgrademetapage (Page page)
 
Buffer _bt_getroot (Relation rel, Relation heaprel, int access)
 
Buffer _bt_gettrueroot (Relation rel)
 
int _bt_getrootheight (Relation rel)
 
void _bt_metaversion (Relation rel, bool *heapkeyspace, bool *allequalimage)
 
void _bt_checkpage (Relation rel, Buffer buf)
 
Buffer _bt_getbuf (Relation rel, BlockNumber blkno, int access)
 
Buffer _bt_allocbuf (Relation rel, Relation heaprel)
 
Buffer _bt_relandgetbuf (Relation rel, Buffer obuf, BlockNumber blkno, int access)
 
void _bt_relbuf (Relation rel, Buffer buf)
 
void _bt_lockbuf (Relation rel, Buffer buf, int access)
 
void _bt_unlockbuf (Relation rel, Buffer buf)
 
bool _bt_conditionallockbuf (Relation rel, Buffer buf)
 
void _bt_upgradelockbufcleanup (Relation rel, Buffer buf)
 
void _bt_pageinit (Page page, Size size)
 
void _bt_delitems_vacuum (Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
 
void _bt_delitems_delete_check (Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
 
void _bt_pagedel (Relation rel, Buffer leafbuf, BTVacState *vstate)
 
void _bt_pendingfsm_init (Relation rel, BTVacState *vstate, bool cleanuponly)
 
void _bt_pendingfsm_finalize (Relation rel, BTVacState *vstate)
 
BTStack _bt_search (Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
 
OffsetNumber _bt_binsrch_insert (Relation rel, BTInsertState insertstate)
 
int32 _bt_compare (Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost)
 
BTScanInsert _bt_mkscankey (Relation rel, IndexTuple itup)
 
void _bt_freestack (BTStack stack)
 
bool _bt_start_prim_scan (IndexScanDesc scan, ScanDirection dir)
 
void _bt_start_array_keys (IndexScanDesc scan, ScanDirection dir)
 
void _bt_preprocess_keys (IndexScanDesc scan)
 
bool _bt_checkkeys (IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, IndexTuple tuple, int tupnatts)
 
bool _bt_oppodir_checkkeys (IndexScanDesc scan, ScanDirection dir, IndexTuple finaltup)
 
void _bt_killitems (IndexScanDesc scan)
 
BTCycleId _bt_vacuum_cycleid (Relation rel)
 
BTCycleId _bt_start_vacuum (Relation rel)
 
void _bt_end_vacuum (Relation rel)
 
void _bt_end_vacuum_callback (int code, Datum arg)
 
Size BTreeShmemSize (void)
 
void BTreeShmemInit (void)
 
bytea * btoptions (Datum reloptions, bool validate)
 
bool btproperty (Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull)
 
char * btbuildphasename (int64 phasenum)
 
IndexTuple _bt_truncate (Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
 
int _bt_keep_natts_fast (Relation rel, IndexTuple lastleft, IndexTuple firstright)
 
bool _bt_check_natts (Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
 
void _bt_check_third_page (Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
 
bool _bt_allequalimage (Relation rel, bool debugmessage)
 
bool btvalidate (Oid opclassoid)
 
void btadjustmembers (Oid opfamilyoid, Oid opclassoid, List *operators, List *functions)
 
IndexBuildResult * btbuild (Relation heap, Relation index, struct IndexInfo *indexInfo)
 
void _bt_parallel_build_main (dsm_segment *seg, shm_toc *toc)
 

Macro Definition Documentation

◆ BT_IS_POSTING

#define BT_IS_POSTING   0x2000

Definition at line 466 of file nbtree.h.

◆ BT_OFFSET_MASK

#define BT_OFFSET_MASK   0x0FFF

Definition at line 462 of file nbtree.h.

◆ BT_PIVOT_HEAP_TID_ATTR

#define BT_PIVOT_HEAP_TID_ATTR   0x1000

Definition at line 465 of file nbtree.h.

◆ BT_READ

#define BT_READ   BUFFER_LOCK_SHARE

Definition at line 719 of file nbtree.h.

◆ BT_STATUS_OFFSET_MASK

#define BT_STATUS_OFFSET_MASK   0xF000

Definition at line 463 of file nbtree.h.

◆ BT_WRITE

#define BT_WRITE   BUFFER_LOCK_EXCLUSIVE

Definition at line 720 of file nbtree.h.
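
BT_READ and BT_WRITE are simply the shared and exclusive buffer lock modes that the nbtree buffer routines expect. The following is only an illustrative sketch (the helper name and the relock step are invented for the example; real nbtree callers must re-validate the page after trading a shared lock for an exclusive one):

#include "postgres.h"
#include "access/nbtree.h"

/* Hypothetical helper, not backend code: read a block with a shared lock,
 * then trade it for an exclusive lock before modifying the page. */
static void
inspect_then_modify(Relation rel, BlockNumber blkno)
{
    Buffer      buf;

    /* pin + share lock; BT_READ maps to BUFFER_LOCK_SHARE */
    buf = _bt_getbuf(rel, blkno, BT_READ);

    /* ... read-only examination of BufferGetPage(buf) goes here ... */

    /* trade the shared lock for an exclusive one, keeping the pin;
     * the page may have changed meanwhile and must be re-checked */
    _bt_unlockbuf(rel, buf);
    _bt_lockbuf(rel, buf, BT_WRITE);

    /* ... page modification and WAL logging would go here ... */

    /* release the lock and the pin together */
    _bt_relbuf(rel, buf);
}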

◆ BTCommuteStrategyNumber

#define BTCommuteStrategyNumber (   strat)    (BTMaxStrategyNumber + 1 - (strat))

Definition at line 685 of file nbtree.h.

◆ BTEQUALIMAGE_PROC

#define BTEQUALIMAGE_PROC   4

Definition at line 710 of file nbtree.h.

◆ BTGetDeduplicateItems

#define BTGetDeduplicateItems (   relation)
Value:
(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
relation->rd_rel->relam == BTREE_AM_OID), \
((relation)->rd_options ? \
((BTOptions *) (relation)->rd_options)->deduplicate_items : true))
#define AssertMacro(condition)
Definition: c.h:864

Definition at line 1136 of file nbtree.h.
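
As an illustration, a caller that wants to attempt deduplication typically checks this reloption together with the insertion scan key's allequalimage flag. This is a hedged sketch modeled loosely on _bt_delete_or_dedup_one_page; the helper name is invented and all error handling is omitted:

#include "postgres.h"
#include "access/nbtree.h"

/* Hypothetical helper: run a dedup pass only when it is allowed and safe.
 * buf holds the target leaf page, locked with BT_WRITE by the caller. */
static void
maybe_dedup(Relation rel, Buffer buf, BTInsertState insertstate)
{
    if (BTGetDeduplicateItems(rel) && insertstate->itup_key->allequalimage)
        _bt_dedup_pass(rel, buf, insertstate->itup, insertstate->itemsz,
                       false /* not called for bottom-up deletion */);
}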

◆ BTGetFillFactor

#define BTGetFillFactor (   relation)
Value:
(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
relation->rd_rel->relam == BTREE_AM_OID), \
(relation)->rd_options ? \
((BTOptions *) (relation)->rd_options)->fillfactor : \
BTREE_DEFAULT_FILLFACTOR)

Definition at line 1128 of file nbtree.h.

◆ BTGetTargetPageFreeSpace

#define BTGetTargetPageFreeSpace (   relation)     (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)

Definition at line 1134 of file nbtree.h.
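
The macro is plain integer arithmetic over BLCKSZ. A small standalone sketch (assuming the default BLCKSZ of 8192; this helper is not backend code) shows the free-space targets implied by the fillfactor constants above:

#include <stdio.h>

/* Assumed default; BLCKSZ is a compile-time option in a real build. */
#define BLCKSZ 8192

/* Mirrors BTGetTargetPageFreeSpace(): bytes left unused at the target fillfactor */
static int
target_free_space(int fillfactor)
{
    return BLCKSZ * (100 - fillfactor) / 100;
}

int
main(void)
{
    printf("leaf default (90):  %d bytes free\n", target_free_space(90));
    printf("nonleaf (70):       %d bytes free\n", target_free_space(70));
    printf("singleval (96):     %d bytes free\n", target_free_space(96));
    return 0;
}

Under these assumptions the targets are 819, 2457, and 327 bytes respectively.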

◆ BTINRANGE_PROC

#define BTINRANGE_PROC   3

Definition at line 709 of file nbtree.h.

◆ BTMaxItemSize

#define BTMaxItemSize (   page)
Value:
(MAXALIGN_DOWN((PageGetPageSize(page) - \
MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3) - \
MAXALIGN(sizeof(ItemPointerData)))
static Size PageGetPageSize(Page page)
Definition: bufpage.h:276
#define SizeOfPageHeaderData
Definition: bufpage.h:216
#define MAXALIGN_DOWN(LEN)
Definition: c.h:828
#define MAXALIGN(LEN)
Definition: c.h:816

Definition at line 164 of file nbtree.h.

◆ BTMaxItemSizeNoHeapTid

#define BTMaxItemSizeNoHeapTid (   page)
Value:

Definition at line 169 of file nbtree.h.

◆ BTNProcs

#define BTNProcs   5

Definition at line 712 of file nbtree.h.

◆ BTOPTIONS_PROC

#define BTOPTIONS_PROC   5

Definition at line 711 of file nbtree.h.

◆ BTORDER_PROC

#define BTORDER_PROC   1

Definition at line 707 of file nbtree.h.

◆ BTP_DELETED

#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */

Definition at line 78 of file nbtree.h.

◆ BTP_HALF_DEAD

#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */

Definition at line 80 of file nbtree.h.

◆ BTP_HAS_FULLXID

#define BTP_HAS_FULLXID   (1 << 8) /* contains BTDeletedPageData */

Definition at line 84 of file nbtree.h.

◆ BTP_HAS_GARBAGE

#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples (deprecated) */

Definition at line 82 of file nbtree.h.

◆ BTP_INCOMPLETE_SPLIT

#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */

Definition at line 83 of file nbtree.h.

◆ BTP_LEAF

#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */

Definition at line 76 of file nbtree.h.

◆ BTP_META

#define BTP_META   (1 << 3) /* meta-page */

Definition at line 79 of file nbtree.h.

◆ BTP_ROOT

#define BTP_ROOT   (1 << 1) /* root page (has no parent) */

Definition at line 77 of file nbtree.h.

◆ BTP_SPLIT_END

#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */

Definition at line 81 of file nbtree.h.

◆ BTPageGetMeta

#define BTPageGetMeta (   p)     ((BTMetaPageData *) PageGetContents(p))

Definition at line 121 of file nbtree.h.

◆ BTPageGetOpaque

#define BTPageGetOpaque (   page)    ((BTPageOpaque) PageGetSpecialPointer(page))

Definition at line 73 of file nbtree.h.
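
BTPageGetOpaque is the usual way to reach the btpo_flags word tested by the P_* macros documented below. A hedged sketch (hypothetical helper; assumes the caller already holds a pin and lock on the page's buffer):

#include "postgres.h"
#include "access/nbtree.h"

/* Hypothetical helper: classify a btree page by its special-space flags. */
static const char *
classify_btree_page(Page page)
{
    BTPageOpaque opaque = BTPageGetOpaque(page);

    if (P_ISMETA(opaque))
        return "metapage";
    if (P_IGNORE(opaque))       /* deleted or half-dead */
        return "ignorable";
    if (P_ISLEAF(opaque))
        return P_RIGHTMOST(opaque) ? "rightmost leaf" : "leaf";
    return "internal";
}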

◆ BTREE_DEFAULT_FILLFACTOR

#define BTREE_DEFAULT_FILLFACTOR   90

Definition at line 200 of file nbtree.h.

◆ BTREE_MAGIC

#define BTREE_MAGIC   0x053162 /* magic number in metapage */

Definition at line 149 of file nbtree.h.

◆ BTREE_METAPAGE

#define BTREE_METAPAGE   0 /* first page is meta */

Definition at line 148 of file nbtree.h.

◆ BTREE_MIN_FILLFACTOR

#define BTREE_MIN_FILLFACTOR   10

Definition at line 199 of file nbtree.h.

◆ BTREE_MIN_VERSION

#define BTREE_MIN_VERSION   2 /* minimum supported version */

Definition at line 151 of file nbtree.h.

◆ BTREE_NONLEAF_FILLFACTOR

#define BTREE_NONLEAF_FILLFACTOR   70

Definition at line 201 of file nbtree.h.

◆ BTREE_NOVAC_VERSION

#define BTREE_NOVAC_VERSION   3 /* version with all meta fields set */

Definition at line 152 of file nbtree.h.

◆ BTREE_SINGLEVAL_FILLFACTOR

#define BTREE_SINGLEVAL_FILLFACTOR   96

Definition at line 202 of file nbtree.h.

◆ BTREE_VERSION

#define BTREE_VERSION   4 /* current version number */

Definition at line 150 of file nbtree.h.

◆ BTreeTupleGetNAtts

#define BTreeTupleGetNAtts (   itup,
  rel 
)
Value:
( \
(BTreeTupleIsPivot(itup)) ? \
( \
) \
: \
)
static OffsetNumber ItemPointerGetOffsetNumberNoCheck(const ItemPointerData *pointer)
Definition: itemptr.h:114
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:480
#define BT_OFFSET_MASK
Definition: nbtree.h:462
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:517

Definition at line 577 of file nbtree.h.

◆ BTScanPosInvalidate

#define BTScanPosInvalidate (   scanpos)
Value:
do { \
(scanpos).buf = InvalidBuffer; \
(scanpos).currPage = InvalidBlockNumber; \
} while (0)
#define InvalidBlockNumber
Definition: block.h:33
#define InvalidBuffer
Definition: buf.h:25
static char * buf
Definition: pg_test_fsync.c:72

Definition at line 1016 of file nbtree.h.

◆ BTScanPosIsPinned

#define BTScanPosIsPinned (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BufferIsValid((scanpos).buf) \
)
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:351

Definition at line 993 of file nbtree.h.

◆ BTScanPosIsValid

#define BTScanPosIsValid (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BlockNumberIsValid((scanpos).currPage) \
)

Definition at line 1010 of file nbtree.h.

◆ BTScanPosUnpin

#define BTScanPosUnpin (   scanpos)
Value:
do { \
ReleaseBuffer((scanpos).buf); \
(scanpos).buf = InvalidBuffer; \
} while (0)

Definition at line 999 of file nbtree.h.

◆ BTScanPosUnpinIfPinned

#define BTScanPosUnpinIfPinned (   scanpos)
Value:
do { \
if (BTScanPosIsPinned(scanpos)) \
BTScanPosUnpin(scanpos); \
} while (0)
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:993

Definition at line 1004 of file nbtree.h.
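
Taken together, the BTScanPos* macros implement the pin and validity bookkeeping for a scan's current position. A minimal sketch of the cleanup pattern (compare btendscan and btrescan; the helper name is invented and killed-item handling is omitted):

#include "postgres.h"
#include "access/nbtree.h"

/* Hypothetical helper: drop any leftover pin on the scan's current
 * position and mark the position invalid. */
static void
reset_scan_position(BTScanOpaque so)
{
    if (BTScanPosIsValid(so->currPos))
    {
        /* releases the buffer pin only if one is still held */
        BTScanPosUnpinIfPinned(so->currPos);
        BTScanPosInvalidate(so->currPos);
    }
}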

◆ BTSORTSUPPORT_PROC

#define BTSORTSUPPORT_PROC   2

Definition at line 708 of file nbtree.h.

◆ INDEX_ALT_TID_MASK

#define INDEX_ALT_TID_MASK   INDEX_AM_RESERVED_BIT

Definition at line 459 of file nbtree.h.

◆ MAX_BT_CYCLE_ID

#define MAX_BT_CYCLE_ID   0xFF7F

Definition at line 93 of file nbtree.h.

◆ MaxTIDsPerBTreePage

#define MaxTIDsPerBTreePage
Value:
(int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \
sizeof(ItemPointerData))

Definition at line 185 of file nbtree.h.
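
For the common 8 kB block size this bound works out to 1358 TIDs per page. A standalone sketch of the arithmetic, using assumed struct sizes (24-byte page header, 16-byte BTPageOpaqueData, 6-byte ItemPointerData) that should be verified against the actual build:

#include <stdio.h>

#define ASSUMED_BLCKSZ            8192
#define ASSUMED_PAGE_HEADER         24   /* SizeOfPageHeaderData */
#define ASSUMED_BT_OPAQUE           16   /* sizeof(BTPageOpaqueData) */
#define ASSUMED_ITEM_POINTER         6   /* sizeof(ItemPointerData) */

int
main(void)
{
    int max_tids = (ASSUMED_BLCKSZ - ASSUMED_PAGE_HEADER - ASSUMED_BT_OPAQUE)
        / ASSUMED_ITEM_POINTER;

    /* prints 1358 under the assumptions above */
    printf("MaxTIDsPerBTreePage = %d\n", max_tids);
    return 0;
}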

◆ P_FIRSTDATAKEY

#define P_FIRSTDATAKEY (   opaque)    (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)

Definition at line 369 of file nbtree.h.

◆ P_FIRSTKEY

#define P_FIRSTKEY   ((OffsetNumber) 2)

Definition at line 368 of file nbtree.h.

◆ P_HAS_FULLXID

#define P_HAS_FULLXID (   opaque)    (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)

Definition at line 228 of file nbtree.h.

◆ P_HAS_GARBAGE

#define P_HAS_GARBAGE (   opaque)    (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)

Definition at line 226 of file nbtree.h.

◆ P_HIKEY

#define P_HIKEY   ((OffsetNumber) 1)

Definition at line 367 of file nbtree.h.

◆ P_IGNORE

#define P_IGNORE (   opaque)    (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)

Definition at line 225 of file nbtree.h.

◆ P_INCOMPLETE_SPLIT

#define P_INCOMPLETE_SPLIT (   opaque)    (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)

Definition at line 227 of file nbtree.h.

◆ P_ISDELETED

#define P_ISDELETED (   opaque)    (((opaque)->btpo_flags & BTP_DELETED) != 0)

Definition at line 222 of file nbtree.h.

◆ P_ISHALFDEAD

#define P_ISHALFDEAD (   opaque)    (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)

Definition at line 224 of file nbtree.h.

◆ P_ISLEAF

#define P_ISLEAF (   opaque)    (((opaque)->btpo_flags & BTP_LEAF) != 0)

Definition at line 220 of file nbtree.h.

◆ P_ISMETA

#define P_ISMETA (   opaque)    (((opaque)->btpo_flags & BTP_META) != 0)

Definition at line 223 of file nbtree.h.

◆ P_ISROOT

#define P_ISROOT (   opaque)    (((opaque)->btpo_flags & BTP_ROOT) != 0)

Definition at line 221 of file nbtree.h.

◆ P_LEFTMOST

#define P_LEFTMOST (   opaque)    ((opaque)->btpo_prev == P_NONE)

Definition at line 218 of file nbtree.h.

◆ P_NONE

#define P_NONE   0

Definition at line 212 of file nbtree.h.

◆ P_RIGHTMOST

#define P_RIGHTMOST (   opaque)    ((opaque)->btpo_next == P_NONE)

Definition at line 219 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN

#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2

Definition at line 1147 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_LEAF_LOAD

#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5

Definition at line 1150 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_1

#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3

Definition at line 1148 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_2

#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4

Definition at line 1149 of file nbtree.h.

◆ SK_BT_DESC

#define SK_BT_DESC   (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)

Definition at line 1117 of file nbtree.h.

◆ SK_BT_INDOPTION_SHIFT

#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */

Definition at line 1116 of file nbtree.h.

◆ SK_BT_NULLS_FIRST

#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)

Definition at line 1118 of file nbtree.h.

◆ SK_BT_REQBKWD

#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */

Definition at line 1115 of file nbtree.h.

◆ SK_BT_REQFWD

#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */

Definition at line 1114 of file nbtree.h.
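
The SK_BT_* bits live in the high part of a scan key's sk_flags: the per-column pg_index indoption bits are shifted up by SK_BT_INDOPTION_SHIFT so they cannot collide with the ordinary SK_* flags or the required-direction bits. A hedged sketch of the idea (hypothetical helper; compare how _bt_mkscankey builds sk_flags):

#include "postgres.h"
#include "access/nbtree.h"
#include "utils/rel.h"

/* Hypothetical helper: does the given index column (0-based) sort DESC? */
static bool
column_sorts_desc(Relation rel, int attno)
{
    int16   indoption = rel->rd_indoption[attno];
    int     flags;

    /* shift the per-column option bits above the low-order SK_* flag bits */
    flags = indoption << SK_BT_INDOPTION_SHIFT;

    return (flags & SK_BT_DESC) != 0;
}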

Typedef Documentation

◆ BTArrayKeyInfo

◆ BTCycleId

typedef uint16 BTCycleId

Definition at line 29 of file nbtree.h.

◆ BTDedupInterval

◆ BTDedupState

typedef BTDedupStateData* BTDedupState

Definition at line 893 of file nbtree.h.

◆ BTDedupStateData

◆ BTDeletedPageData

◆ BTInsertState

typedef BTInsertStateData* BTInsertState

Definition at line 835 of file nbtree.h.

◆ BTInsertStateData

◆ BTMetaPageData

◆ BTOptions

typedef struct BTOptions BTOptions

◆ BTPageOpaque

typedef BTPageOpaqueData* BTPageOpaque

Definition at line 71 of file nbtree.h.

◆ BTPageOpaqueData

◆ BTPendingFSM

typedef struct BTPendingFSM BTPendingFSM

◆ BTReadPageState

◆ BTScanInsert

typedef BTScanInsertData* BTScanInsert

Definition at line 796 of file nbtree.h.

◆ BTScanInsertData

◆ BTScanOpaque

typedef BTScanOpaqueData* BTScanOpaque

Definition at line 1073 of file nbtree.h.

◆ BTScanOpaqueData

◆ BTScanPos

typedef BTScanPosData* BTScanPos

Definition at line 991 of file nbtree.h.

◆ BTScanPosData

typedef struct BTScanPosData BTScanPosData

◆ BTScanPosItem

typedef struct BTScanPosItem BTScanPosItem

◆ BTStack

typedef BTStackData* BTStack

Definition at line 739 of file nbtree.h.

◆ BTStackData

typedef struct BTStackData BTStackData

◆ BTVacState

typedef struct BTVacState BTVacState

◆ BTVacuumPosting

typedef BTVacuumPostingData* BTVacuumPosting

Definition at line 914 of file nbtree.h.

◆ BTVacuumPostingData

Function Documentation

◆ _bt_allequalimage()

bool _bt_allequalimage ( Relation  rel,
bool  debugmessage 
)

Definition at line 5142 of file nbtutils.c.

5143 {
5144  bool allequalimage = true;
5145 
5146  /* INCLUDE indexes can never support deduplication */
5147  if (IndexRelationGetNumberOfAttributes(rel) !=
5148  IndexRelationGetNumberOfKeyAttributes(rel))
5149  return false;
5150 
5151  for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++)
5152  {
5153  Oid opfamily = rel->rd_opfamily[i];
5154  Oid opcintype = rel->rd_opcintype[i];
5155  Oid collation = rel->rd_indcollation[i];
5156  Oid equalimageproc;
5157 
5158  equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype,
5159  BTEQUALIMAGE_PROC);
5160 
5161  /*
5162  * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to
5163  * be unsafe. Otherwise, actually call proc and see what it says.
5164  */
5165  if (!OidIsValid(equalimageproc) ||
5166  !DatumGetBool(OidFunctionCall1Coll(equalimageproc, collation,
5167  ObjectIdGetDatum(opcintype))))
5168  {
5169  allequalimage = false;
5170  break;
5171  }
5172  }
5173 
5174  if (debugmessage)
5175  {
5176  if (allequalimage)
5177  elog(DEBUG1, "index \"%s\" can safely use deduplication",
5178  RelationGetRelationName(rel));
5179  else
5180  elog(DEBUG1, "index \"%s\" cannot use deduplication",
5181  RelationGetRelationName(rel));
5182  }
5183 
5184  return allequalimage;
5185 }
#define OidIsValid(objectId)
Definition: c.h:780
#define DEBUG1
Definition: elog.h:30
#define elog(elevel,...)
Definition: elog.h:225
Datum OidFunctionCall1Coll(Oid functionId, Oid collation, Datum arg1)
Definition: fmgr.c:1411
int i
Definition: isn.c:72
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:796
#define BTEQUALIMAGE_PROC
Definition: nbtree.h:710
static bool DatumGetBool(Datum X)
Definition: postgres.h:90
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:252
unsigned int Oid
Definition: postgres_ext.h:31
#define RelationGetRelationName(relation)
Definition: rel.h:539
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:524
Oid * rd_opcintype
Definition: rel.h:208
Oid * rd_opfamily
Definition: rel.h:207
Oid * rd_indcollation
Definition: rel.h:217

References BTEQUALIMAGE_PROC, DatumGetBool(), DEBUG1, elog, get_opfamily_proc(), i, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, ObjectIdGetDatum(), OidFunctionCall1Coll(), OidIsValid, RelationData::rd_indcollation, RelationData::rd_opcintype, RelationData::rd_opfamily, and RelationGetRelationName.

Referenced by _bt_leafbuild(), bt_index_check_internal(), and btbuildempty().
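
A typical use is during creation of an empty index, where the result is stored in the metapage so later insertions know whether deduplication is ever safe. The following is only a simplified sketch modeled on btbuildempty(); the helper name is invented and the smgr write and WAL logging of the page are omitted:

#include "postgres.h"
#include "access/nbtree.h"

/* Hypothetical helper: build the metapage of a new, empty btree in memory. */
static void
init_empty_btree_metapage(Relation index)
{
    Page    metapage = (Page) palloc(BLCKSZ);

    /* no root yet (P_NONE), tree height 0, dedup-safety probed at build time */
    _bt_initmetapage(metapage, P_NONE, 0,
                     _bt_allequalimage(index, false /* no DEBUG1 message */));

    /* ... real code writes the page through smgr and logs a new-page record ... */
    pfree(metapage);
}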

◆ _bt_allocbuf()

Buffer _bt_allocbuf ( Relation  rel,
Relation  heaprel 
)

Definition at line 869 of file nbtpage.c.

870 {
871  Buffer buf;
872  BlockNumber blkno;
873  Page page;
874 
875  Assert(heaprel != NULL);
876 
877  /*
878  * First see if the FSM knows of any free pages.
879  *
880  * We can't trust the FSM's report unreservedly; we have to check that the
881  * page is still free. (For example, an already-free page could have been
882  * re-used between the time the last VACUUM scanned it and the time the
883  * VACUUM made its FSM updates.)
884  *
885  * In fact, it's worse than that: we can't even assume that it's safe to
886  * take a lock on the reported page. If somebody else has a lock on it,
887  * or even worse our own caller does, we could deadlock. (The own-caller
888  * scenario is actually not improbable. Consider an index on a serial or
889  * timestamp column. Nearly all splits will be at the rightmost page, so
890  * it's entirely likely that _bt_split will call us while holding a lock
891  * on the page most recently acquired from FSM. A VACUUM running
892  * concurrently with the previous split could well have placed that page
893  * back in FSM.)
894  *
895  * To get around that, we ask for only a conditional lock on the reported
896  * page. If we fail, then someone else is using the page, and we may
897  * reasonably assume it's not free. (If we happen to be wrong, the worst
898  * consequence is the page will be lost to use till the next VACUUM, which
899  * is no big problem.)
900  */
901  for (;;)
902  {
903  blkno = GetFreeIndexPage(rel);
904  if (blkno == InvalidBlockNumber)
905  break;
906  buf = ReadBuffer(rel, blkno);
907  if (_bt_conditionallockbuf(rel, buf))
908  {
909  page = BufferGetPage(buf);
910 
911  /*
912  * It's possible to find an all-zeroes page in an index. For
913  * example, a backend might successfully extend the relation one
914  * page and then crash before it is able to make a WAL entry for
915  * adding the page. If we find a zeroed page then reclaim it
916  * immediately.
917  */
918  if (PageIsNew(page))
919  {
920  /* Okay to use page. Initialize and return it. */
921  _bt_pageinit(page, BufferGetPageSize(buf));
922  return buf;
923  }
924 
925  if (BTPageIsRecyclable(page, heaprel))
926  {
927  /*
928  * If we are generating WAL for Hot Standby then create a WAL
929  * record that will allow us to conflict with queries running
930  * on standby, in case they have snapshots older than safexid
931  * value
932  */
933  if (RelationNeedsWAL(rel) && XLogStandbyInfoActive())
934  {
935  xl_btree_reuse_page xlrec_reuse;
936 
937  /*
938  * Note that we don't register the buffer with the record,
939  * because this operation doesn't modify the page (that
940  * already happened, back when VACUUM deleted the page).
941  * This record only exists to provide a conflict point for
942  * Hot Standby. See record REDO routine comments.
943  */
944  xlrec_reuse.locator = rel->rd_locator;
945  xlrec_reuse.block = blkno;
946  xlrec_reuse.snapshotConflictHorizon = BTPageGetDeleteXid(page);
947  xlrec_reuse.isCatalogRel =
948  RelationIsAccessibleInLogicalDecoding(rel);
949 
950  XLogBeginInsert();
951  XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);
952 
953  XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
954  }
955 
956  /* Okay to use page. Re-initialize and return it. */
957  _bt_pageinit(page, BufferGetPageSize(buf));
958  return buf;
959  }
960  elog(DEBUG2, "FSM returned nonrecyclable page");
961  _bt_relbuf(rel, buf);
962  }
963  else
964  {
965  elog(DEBUG2, "FSM returned nonlockable page");
966  /* couldn't get lock, so just drop pin */
967  ReleaseBuffer(buf);
968  }
969  }
970 
971  /*
972  * Extend the relation by one page. Need to use RBM_ZERO_AND_LOCK or we
973  * risk a race condition against btvacuumscan --- see comments therein.
974  * This forces us to repeat the valgrind request that _bt_lockbuf()
975  * otherwise would make, as we can't use _bt_lockbuf() without introducing
976  * a race.
977  */
978  buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);
979  if (!RelationUsesLocalBuffers(rel))
980  VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
981 
982  /* Initialize the new page before returning it */
983  page = BufferGetPage(buf);
984  Assert(PageIsNew(page));
985  _bt_pageinit(page, BufferGetPageSize(buf));
986 
987  return buf;
988 }
uint32 BlockNumber
Definition: block.h:31
int Buffer
Definition: buf.h:23
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition: bufmgr.c:846
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4924
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:746
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:400
static Size BufferGetPageSize(Buffer buffer)
Definition: bufmgr.h:389
@ EB_LOCK_FIRST
Definition: bufmgr.h:86
#define BMR_REL(p_rel)
Definition: bufmgr.h:107
Pointer Page
Definition: bufpage.h:81
static bool PageIsNew(Page page)
Definition: bufpage.h:233
#define Assert(condition)
Definition: c.h:863
#define DEBUG2
Definition: elog.h:29
BlockNumber GetFreeIndexPage(Relation rel)
Definition: indexfsm.c:38
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1023
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:1129
bool _bt_conditionallockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1093
static FullTransactionId BTPageGetDeleteXid(Page page)
Definition: nbtree.h:260
static bool BTPageIsRecyclable(Page page, Relation heaprel)
Definition: nbtree.h:291
#define XLOG_BTREE_REUSE_PAGE
Definition: nbtxlog.h:40
#define SizeOfBtreeReusePage
Definition: nbtxlog.h:192
#define RelationIsAccessibleInLogicalDecoding(relation)
Definition: rel.h:684
#define RelationNeedsWAL(relation)
Definition: rel.h:628
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:637
@ MAIN_FORKNUM
Definition: relpath.h:58
RelFileLocator rd_locator
Definition: rel.h:57
FullTransactionId snapshotConflictHorizon
Definition: nbtxlog.h:187
RelFileLocator locator
Definition: nbtxlog.h:185
BlockNumber block
Definition: nbtxlog.h:186
#define XLogStandbyInfoActive()
Definition: xlog.h:123
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:474
void XLogRegisterData(const char *data, uint32 len)
Definition: xloginsert.c:364
void XLogBeginInsert(void)
Definition: xloginsert.c:149

References _bt_conditionallockbuf(), _bt_pageinit(), _bt_relbuf(), Assert, xl_btree_reuse_page::block, BMR_REL, BTPageGetDeleteXid(), BTPageIsRecyclable(), buf, BufferGetPage(), BufferGetPageSize(), DEBUG2, EB_LOCK_FIRST, elog, ExtendBufferedRel(), GetFreeIndexPage(), InvalidBlockNumber, xl_btree_reuse_page::isCatalogRel, xl_btree_reuse_page::locator, MAIN_FORKNUM, PageIsNew(), RelationData::rd_locator, ReadBuffer(), RelationIsAccessibleInLogicalDecoding, RelationNeedsWAL, RelationUsesLocalBuffers, ReleaseBuffer(), SizeOfBtreeReusePage, xl_btree_reuse_page::snapshotConflictHorizon, VALGRIND_MAKE_MEM_DEFINED, XLOG_BTREE_REUSE_PAGE, XLogBeginInsert(), XLogInsert(), XLogRegisterData(), and XLogStandbyInfoActive.

Referenced by _bt_getroot(), _bt_newlevel(), and _bt_split().

◆ _bt_binsrch_insert()

OffsetNumber _bt_binsrch_insert ( Relation  rel,
BTInsertState  insertstate 
)

Definition at line 474 of file nbtsearch.c.

475 {
476  BTScanInsert key = insertstate->itup_key;
477  Page page;
478  BTPageOpaque opaque;
479  OffsetNumber low,
480  high,
481  stricthigh;
482  int32 result,
483  cmpval;
484 
485  page = BufferGetPage(insertstate->buf);
486  opaque = BTPageGetOpaque(page);
487 
488  Assert(P_ISLEAF(opaque));
489  Assert(!key->nextkey);
490  Assert(insertstate->postingoff == 0);
491 
492  if (!insertstate->bounds_valid)
493  {
494  /* Start new binary search */
495  low = P_FIRSTDATAKEY(opaque);
496  high = PageGetMaxOffsetNumber(page);
497  }
498  else
499  {
500  /* Restore result of previous binary search against same page */
501  low = insertstate->low;
502  high = insertstate->stricthigh;
503  }
504 
505  /* If there are no keys on the page, return the first available slot */
506  if (unlikely(high < low))
507  {
508  /* Caller can't reuse bounds */
509  insertstate->low = InvalidOffsetNumber;
510  insertstate->stricthigh = InvalidOffsetNumber;
511  insertstate->bounds_valid = false;
512  return low;
513  }
514 
515  /*
516  * Binary search to find the first key on the page >= scan key. (nextkey
517  * is always false when inserting).
518  *
519  * The loop invariant is: all slots before 'low' are < scan key, all slots
520  * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
521  * maintained to save additional search effort for caller.
522  *
523  * We can fall out when high == low.
524  */
525  if (!insertstate->bounds_valid)
526  high++; /* establish the loop invariant for high */
527  stricthigh = high; /* high initially strictly higher */
528 
529  cmpval = 1; /* !nextkey comparison value */
530 
531  while (high > low)
532  {
533  OffsetNumber mid = low + ((high - low) / 2);
534 
535  /* We have low <= mid < high, so mid points at a real slot */
536 
537  result = _bt_compare(rel, key, page, mid);
538 
539  if (result >= cmpval)
540  low = mid + 1;
541  else
542  {
543  high = mid;
544  if (result != 0)
545  stricthigh = high;
546  }
547 
548  /*
549  * If tuple at offset located by binary search is a posting list whose
550  * TID range overlaps with caller's scantid, perform posting list
551  * binary search to set postingoff for caller. Caller must split the
552  * posting list when postingoff is set. This should happen
553  * infrequently.
554  */
555  if (unlikely(result == 0 && key->scantid != NULL))
556  {
557  /*
558  * postingoff should never be set more than once per leaf page
559  * binary search. That would mean that there are duplicate table
560  * TIDs in the index, which is never okay. Check for that here.
561  */
562  if (insertstate->postingoff != 0)
563  ereport(ERROR,
564  (errcode(ERRCODE_INDEX_CORRUPTED),
565  errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
566  ItemPointerGetBlockNumber(key->scantid),
567  ItemPointerGetOffsetNumber(key->scantid),
568  low, stricthigh,
569  BufferGetBlockNumber(insertstate->buf),
570  RelationGetRelationName(rel))));
571 
572  insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
573  }
574  }
575 
576  /*
577  * On a leaf page, a binary search always returns the first key >= scan
578  * key (at least in !nextkey case), which could be the last slot + 1. This
579  * is also the lower bound of cached search.
580  *
581  * stricthigh may also be the last slot + 1, which prevents caller from
582  * using bounds directly, but is still useful to us if we're called a
583  * second time with cached bounds (cached low will be < stricthigh when
584  * that happens).
585  */
586  insertstate->low = low;
587  insertstate->stricthigh = stricthigh;
588  insertstate->bounds_valid = true;
589 
590  return low;
591 }
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:3724
static OffsetNumber PageGetMaxOffsetNumber(Page page)
Definition: bufpage.h:372
signed int int32
Definition: c.h:508
#define unlikely(x)
Definition: c.h:326
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1157
int errcode(int sqlerrcode)
Definition: elog.c:853
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
#define P_ISLEAF(opaque)
Definition: nbtree.h:220
#define BTPageGetOpaque(page)
Definition: nbtree.h:73
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:369
static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:602
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:688
#define InvalidOffsetNumber
Definition: off.h:26
uint16 OffsetNumber
Definition: off.h:24
OffsetNumber stricthigh
Definition: nbtree.h:825
bool bounds_valid
Definition: nbtree.h:823
OffsetNumber low
Definition: nbtree.h:824
BTScanInsert itup_key
Definition: nbtree.h:813

References _bt_binsrch_posting(), _bt_compare(), Assert, BTInsertStateData::bounds_valid, BTPageGetOpaque, BTInsertStateData::buf, BufferGetBlockNumber(), BufferGetPage(), ereport, errcode(), errmsg_internal(), ERROR, InvalidOffsetNumber, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), BTInsertStateData::itup_key, sort-test::key, BTInsertStateData::low, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber(), BTInsertStateData::postingoff, RelationGetRelationName, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_check_unique(), _bt_findinsertloc(), and bt_rootdescend().
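
Callers prepare a BTInsertStateData describing the new tuple and the locked leaf page, then ask _bt_binsrch_insert for the insertion offset. A hedged sketch of that setup (hypothetical helper; unique-index checks, posting-list splits, error handling, and freeing of the scan key are omitted):

#include "postgres.h"
#include "access/nbtree.h"

/* Hypothetical helper: where on this locked leaf page does itup belong? */
static OffsetNumber
find_insert_offset(Relation rel, Buffer leafbuf, IndexTuple itup)
{
    BTInsertStateData insertstate;

    insertstate.itup = itup;
    insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
    insertstate.itup_key = _bt_mkscankey(rel, itup);
    insertstate.bounds_valid = false;   /* no cached binary-search bounds yet */
    insertstate.buf = leafbuf;          /* leaf page, locked with BT_WRITE */
    insertstate.postingoff = 0;

    return _bt_binsrch_insert(rel, &insertstate);
}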

◆ _bt_bottomupdel_pass()

bool _bt_bottomupdel_pass ( Relation  rel,
Buffer  buf,
Relation  heapRel,
Size  newitemsz 
)

Definition at line 307 of file nbtdedup.c.

309 {
310  OffsetNumber offnum,
311  minoff,
312  maxoff;
313  Page page = BufferGetPage(buf);
314  BTPageOpaque opaque = BTPageGetOpaque(page);
315  BTDedupState state;
316  TM_IndexDeleteOp delstate;
317  bool neverdedup;
318  int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
319 
320  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
321  newitemsz += sizeof(ItemIdData);
322 
323  /* Initialize deduplication state */
324  state = (BTDedupState) palloc(sizeof(BTDedupStateData));
325  state->deduplicate = true;
326  state->nmaxitems = 0;
327  state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
328  state->base = NULL;
329  state->baseoff = InvalidOffsetNumber;
330  state->basetupsize = 0;
331  state->htids = palloc(state->maxpostingsize);
332  state->nhtids = 0;
333  state->nitems = 0;
334  state->phystupsize = 0;
335  state->nintervals = 0;
336 
337  /*
338  * Initialize tableam state that describes bottom-up index deletion
339  * operation.
340  *
341  * We'll go on to ask the tableam to search for TIDs whose index tuples we
342  * can safely delete. The tableam will search until our leaf page space
343  * target is satisfied, or until the cost of continuing with the tableam
344  * operation seems too high. It focuses its efforts on TIDs associated
345  * with duplicate index tuples that we mark "promising".
346  *
347  * This space target is a little arbitrary. The tableam must be able to
348  * keep the costs and benefits in balance. We provide the tableam with
349  * exhaustive information about what might work, without directly
350  * concerning ourselves with avoiding work during the tableam call. Our
351  * role in costing the bottom-up deletion process is strictly advisory.
352  */
353  delstate.irel = rel;
354  delstate.iblknum = BufferGetBlockNumber(buf);
355  delstate.bottomup = true;
356  delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
357  delstate.ndeltids = 0;
358  delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
359  delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
360 
361  minoff = P_FIRSTDATAKEY(opaque);
362  maxoff = PageGetMaxOffsetNumber(page);
363  for (offnum = minoff;
364  offnum <= maxoff;
365  offnum = OffsetNumberNext(offnum))
366  {
367  ItemId itemid = PageGetItemId(page, offnum);
368  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
369 
370  Assert(!ItemIdIsDead(itemid));
371 
372  if (offnum == minoff)
373  {
374  /* itup starts first pending interval */
375  _bt_dedup_start_pending(state, itup, offnum);
376  }
377  else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
378  _bt_dedup_save_htid(state, itup))
379  {
380  /* Tuple is equal; just added its TIDs to pending interval */
381  }
382  else
383  {
384  /* Finalize interval -- move its TIDs to delete state */
385  _bt_bottomupdel_finish_pending(page, state, &delstate);
386 
387  /* itup starts new pending interval */
388  _bt_dedup_start_pending(state, itup, offnum);
389  }
390  }
391  /* Finalize final interval -- move its TIDs to delete state */
392  _bt_bottomupdel_finish_pending(page, state, &delstate);
393 
394  /*
395  * We don't give up now in the event of having few (or even zero)
396  * promising tuples for the tableam because it's not up to us as the index
397  * AM to manage costs (note that the tableam might have heuristics of its
398  * own that work out what to do). We should at least avoid having our
399  * caller do a useless deduplication pass after we return in the event of
400  * zero promising tuples, though.
401  */
402  neverdedup = false;
403  if (state->nintervals == 0)
404  neverdedup = true;
405 
406  pfree(state->htids);
407  pfree(state);
408 
409  /* Ask tableam which TIDs are deletable, then physically delete them */
410  _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
411 
412  pfree(delstate.deltids);
413  pfree(delstate.status);
414 
415  /* Report "success" to caller unconditionally to avoid deduplication */
416  if (neverdedup)
417  return true;
418 
419  /* Don't dedup when we won't end up back here any time soon anyway */
420  return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
421 }
Size PageGetExactFreeSpace(Page page)
Definition: bufpage.c:947
static Item PageGetItem(Page page, ItemId itemId)
Definition: bufpage.h:354
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:243
#define Max(x, y)
Definition: c.h:1003
struct ItemIdData ItemIdData
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
IndexTupleData * IndexTuple
Definition: itup.h:53
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc(Size size)
Definition: mcxt.c:1317
bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
Definition: nbtdedup.c:484
void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, OffsetNumber baseoff)
Definition: nbtdedup.c:433
static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state, TM_IndexDeleteOp *delstate)
Definition: nbtdedup.c:648
void _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
Definition: nbtpage.c:1513
#define MaxTIDsPerBTreePage
Definition: nbtree.h:185
BTDedupStateData * BTDedupState
Definition: nbtree.h:893
int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
Definition: nbtutils.c:4877
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
TM_IndexStatus * status
Definition: tableam.h:254
int bottomupfreespace
Definition: tableam.h:249
Relation irel
Definition: tableam.h:246
TM_IndexDelete * deltids
Definition: tableam.h:253
BlockNumber iblknum
Definition: tableam.h:247
Definition: regguts.h:323

References _bt_bottomupdel_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_delitems_delete_check(), _bt_keep_natts_fast(), Assert, TM_IndexDeleteOp::bottomup, TM_IndexDeleteOp::bottomupfreespace, BTPageGetOpaque, buf, BufferGetBlockNumber(), BufferGetPage(), TM_IndexDeleteOp::deltids, TM_IndexDeleteOp::iblknum, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, TM_IndexDeleteOp::irel, ItemIdIsDead, Max, MaxTIDsPerBTreePage, TM_IndexDeleteOp::ndeltids, OffsetNumberNext, P_FIRSTDATAKEY, PageGetExactFreeSpace(), PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), palloc(), pfree(), and TM_IndexDeleteOp::status.

Referenced by _bt_delete_or_dedup_one_page().

◆ _bt_check_natts()

bool _bt_check_natts ( Relation  rel,
bool  heapkeyspace,
Page  page,
OffsetNumber  offnum 
)

Definition at line 4924 of file nbtutils.c.

4925 {
4926  int16 natts = IndexRelationGetNumberOfAttributes(rel);
4927  int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
4928  BTPageOpaque opaque = BTPageGetOpaque(page);
4929  IndexTuple itup;
4930  int tupnatts;
4931 
4932  /*
4933  * We cannot reliably test a deleted or half-dead page, since they have
4934  * dummy high keys
4935  */
4936  if (P_IGNORE(opaque))
4937  return true;
4938 
4939  Assert(offnum >= FirstOffsetNumber &&
4940  offnum <= PageGetMaxOffsetNumber(page));
4941 
4942  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
4943  tupnatts = BTreeTupleGetNAtts(itup, rel);
4944 
4945  /* !heapkeyspace indexes do not support deduplication */
4946  if (!heapkeyspace && BTreeTupleIsPosting(itup))
4947  return false;
4948 
4949  /* Posting list tuples should never have "pivot heap TID" bit set */
4950  if (BTreeTupleIsPosting(itup) &&
4951  (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
4952  BT_PIVOT_HEAP_TID_ATTR) != 0)
4953  return false;
4954 
4955  /* INCLUDE indexes do not support deduplication */
4956  if (natts != nkeyatts && BTreeTupleIsPosting(itup))
4957  return false;
4958 
4959  if (P_ISLEAF(opaque))
4960  {
4961  if (offnum >= P_FIRSTDATAKEY(opaque))
4962  {
4963  /*
4964  * Non-pivot tuple should never be explicitly marked as a pivot
4965  * tuple
4966  */
4967  if (BTreeTupleIsPivot(itup))
4968  return false;
4969 
4970  /*
4971  * Leaf tuples that are not the page high key (non-pivot tuples)
4972  * should never be truncated. (Note that tupnatts must have been
4973  * inferred, even with a posting list tuple, because only pivot
4974  * tuples store tupnatts directly.)
4975  */
4976  return tupnatts == natts;
4977  }
4978  else
4979  {
4980  /*
4981  * Rightmost page doesn't contain a page high key, so tuple was
4982  * checked above as ordinary leaf tuple
4983  */
4984  Assert(!P_RIGHTMOST(opaque));
4985 
4986  /*
4987  * !heapkeyspace high key tuple contains only key attributes. Note
4988  * that tupnatts will only have been explicitly represented in
4989  * !heapkeyspace indexes that happen to have non-key attributes.
4990  */
4991  if (!heapkeyspace)
4992  return tupnatts == nkeyatts;
4993 
4994  /* Use generic heapkeyspace pivot tuple handling */
4995  }
4996  }
4997  else /* !P_ISLEAF(opaque) */
4998  {
4999  if (offnum == P_FIRSTDATAKEY(opaque))
5000  {
5001  /*
5002  * The first tuple on any internal page (possibly the first after
5003  * its high key) is its negative infinity tuple. Negative
5004  * infinity tuples are always truncated to zero attributes. They
5005  * are a particular kind of pivot tuple.
5006  */
5007  if (heapkeyspace)
5008  return tupnatts == 0;
5009 
5010  /*
5011  * The number of attributes won't be explicitly represented if the
5012  * negative infinity tuple was generated during a page split that
5013  * occurred with a version of Postgres before v11. There must be
5014  * a problem when there is an explicit representation that is
5015  * non-zero, or when there is no explicit representation and the
5016  * tuple is evidently not a pre-pg_upgrade tuple.
5017  *
5018  * Prior to v11, downlinks always had P_HIKEY as their offset.
5019  * Accept that as an alternative indication of a valid
5020  * !heapkeyspace negative infinity tuple.
5021  */
5022  return tupnatts == 0 ||
5023  ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY;
5024 
5025  else
5026  {
5027  /*
5028  * !heapkeyspace downlink tuple with separator key contains only
5029  * key attributes. Note that tupnatts will only have been
5030  * explicitly represented in !heapkeyspace indexes that happen to
5031  * have non-key attributes.
5032  */
5033  if (!heapkeyspace)
5034  return tupnatts == nkeyatts;
5035 
5036  /* Use generic heapkeyspace pivot tuple handling */
5037  }
5038  }
5039 
5040  /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
5041  Assert(heapkeyspace);
5042 
5043  /*
5044  * Explicit representation of the number of attributes is mandatory with
5045  * heapkeyspace index pivot tuples, regardless of whether or not there are
5046  * non-key attributes.
5047  */
5048  if (!BTreeTupleIsPivot(itup))
5049  return false;
5050 
5051  /* Pivot tuple should not use posting list representation (redundant) */
5052  if (BTreeTupleIsPosting(itup))
5053  return false;
5054 
5055  /*
5056  * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
5057  * when any other key attribute is truncated
5058  */
5059  if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
5060  return false;
5061 
5062  /*
5063  * Pivot tuple must have at least one untruncated key attribute (minus
5064  * infinity pivot tuples are the only exception). Pivot tuples can never
5065  * represent that there is a value present for a key attribute that
5066  * exceeds pg_index.indnkeyatts for the index.
5067  */
5068  return tupnatts > 0 && tupnatts <= nkeyatts;
5069 }
signed short int16
Definition: c.h:507
#define BT_PIVOT_HEAP_TID_ATTR
Definition: nbtree.h:465
#define P_HIKEY
Definition: nbtree.h:367
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:219
#define P_IGNORE(opaque)
Definition: nbtree.h:225
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:492
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:638
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:577
#define FirstOffsetNumber
Definition: off.h:27
ItemPointerData t_tid
Definition: itup.h:37

References Assert, BT_PIVOT_HEAP_TID_ATTR, BTPageGetOpaque, BTreeTupleGetHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPivot(), BTreeTupleIsPosting(), FirstOffsetNumber, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, ItemPointerGetOffsetNumber(), ItemPointerGetOffsetNumberNoCheck(), P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_ISLEAF, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and IndexTupleData::t_tid.

Referenced by _bt_compare(), and bt_target_page_check().

◆ _bt_check_third_page()

void _bt_check_third_page ( Relation  rel,
Relation  heap,
bool  needheaptidspace,
Page  page,
IndexTuple  newtup 
)

Definition at line 5084 of file nbtutils.c.

5086 {
5087  Size itemsz;
5088  BTPageOpaque opaque;
5089 
5090  itemsz = MAXALIGN(IndexTupleSize(newtup));
5091 
5092  /* Double check item size against limit */
5093  if (itemsz <= BTMaxItemSize(page))
5094  return;
5095 
5096  /*
5097  * Tuple is probably too large to fit on page, but it's possible that the
5098  * index uses version 2 or version 3, or that page is an internal page, in
5099  * which case a slightly higher limit applies.
5100  */
5101  if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page))
5102  return;
5103 
5104  /*
5105  * Internal page insertions cannot fail here, because that would mean that
5106  * an earlier leaf level insertion that should have failed didn't
5107  */
5108  opaque = BTPageGetOpaque(page);
5109  if (!P_ISLEAF(opaque))
5110  elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
5111  itemsz, RelationGetRelationName(rel));
5112 
5113  ereport(ERROR,
5114  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
5115  errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
5116  itemsz,
5117  needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
5118  needheaptidspace ? BTMaxItemSize(page) :
5119  BTMaxItemSizeNoHeapTid(page),
5120  RelationGetRelationName(rel)),
5121  errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
5122  ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)),
5123  ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)),
5124  RelationGetRelationName(heap)),
5125  errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
5126  "Consider a function index of an MD5 hash of the value, "
5127  "or use full text indexing."),
5129 }
size_t Size
Definition: c.h:610
int errdetail(const char *fmt,...)
Definition: elog.c:1203
int errhint(const char *fmt,...)
Definition: elog.c:1317
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define IndexTupleSize(itup)
Definition: itup.h:70
#define BTMaxItemSizeNoHeapTid(page)
Definition: nbtree.h:169
#define BTREE_VERSION
Definition: nbtree.h:150
#define BTMaxItemSize(page)
Definition: nbtree.h:164
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:152
int errtableconstraint(Relation rel, const char *conname)
Definition: relcache.c:6001

References BTMaxItemSize, BTMaxItemSizeNoHeapTid, BTPageGetOpaque, BTREE_NOVAC_VERSION, BTREE_VERSION, BTreeTupleGetHeapTID(), elog, ereport, errcode(), errdetail(), errhint(), errmsg(), ERROR, errtableconstraint(), IndexTupleSize, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), MAXALIGN, P_ISLEAF, and RelationGetRelationName.

Referenced by _bt_buildadd(), and _bt_findinsertloc().

◆ _bt_checkkeys()

bool _bt_checkkeys ( IndexScanDesc  scan,
BTReadPageState pstate,
bool  arrayKeys,
IndexTuple  tuple,
int  tupnatts 
)

Definition at line 3472 of file nbtutils.c.

3474 {
3475  TupleDesc tupdesc = RelationGetDescr(scan->indexRelation);
3476  BTScanOpaque so = (BTScanOpaque) scan->opaque;
3477  ScanDirection dir = so->currPos.dir;
3478  int ikey = 0;
3479  bool res;
3480 
3481  Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
3482 
3483  res = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc,
3484  arrayKeys, pstate->prechecked, pstate->firstmatch,
3485  &pstate->continuescan, &ikey);
3486 
3487 #ifdef USE_ASSERT_CHECKING
3488  if (!arrayKeys && so->numArrayKeys)
3489  {
3490  /*
3491  * This is a continuescan precheck call for a scan with array keys.
3492  *
3493  * Assert that the scan isn't in danger of becoming confused.
3494  */
3495  Assert(!so->scanBehind && !so->oppositeDirCheck);
3496  Assert(!pstate->prechecked && !pstate->firstmatch);
3497  Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc,
3498  tupnatts, false, 0, NULL));
3499  }
3500  if (pstate->prechecked || pstate->firstmatch)
3501  {
3502  bool dcontinuescan;
3503  int dikey = 0;
3504 
3505  /*
3506  * Call relied on continuescan/firstmatch prechecks -- assert that we
3507  * get the same answer without those optimizations
3508  */
3509  Assert(res == _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc,
3510  false, false, false,
3511  &dcontinuescan, &dikey));
3512  Assert(pstate->continuescan == dcontinuescan);
3513  }
3514 #endif
3515 
3516  /*
3517  * Only one _bt_check_compare call is required in the common case where
3518  * there are no equality strategy array scan keys. Otherwise we can only
3519  * accept _bt_check_compare's answer unreservedly when it didn't set
3520  * pstate.continuescan=false.
3521  */
3522  if (!arrayKeys || pstate->continuescan)
3523  return res;
3524 
3525  /*
3526  * _bt_check_compare call set continuescan=false in the presence of
3527  * equality type array keys. This could mean that the tuple is just past
3528  * the end of matches for the current array keys.
3529  *
3530  * It's also possible that the scan is still _before_ the _start_ of
3531  * tuples matching the current set of array keys. Check for that first.
3532  */
3533  if (_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, true,
3534  ikey, NULL))
3535  {
3536  /*
3537  * Tuple is still before the start of matches according to the scan's
3538  * required array keys (according to _all_ of its required equality
3539  * strategy keys, actually).
3540  *
3541  * _bt_advance_array_keys occasionally sets so->scanBehind to signal
3542  * that the scan's current position/tuples might be significantly
3543  * behind (multiple pages behind) its current array keys. When this
3544  * happens, we need to be prepared to recover by starting a new
3545  * primitive index scan here, on our own.
3546  */
3547  Assert(!so->scanBehind ||
3548  so->keyData[0].sk_strategy == BTEqualStrategyNumber);
3549  if (unlikely(so->scanBehind) && pstate->finaltup &&
3550  _bt_tuple_before_array_skeys(scan, dir, pstate->finaltup, tupdesc,
3551  BTreeTupleGetNAtts(pstate->finaltup,
3552  scan->indexRelation),
3553  false, 0, NULL))
3554  {
3555  /* Cut our losses -- start a new primitive index scan now */
3556  pstate->continuescan = false;
3557  so->needPrimScan = true;
3558  }
3559  else
3560  {
3561  /* Override _bt_check_compare, continue primitive scan */
3562  pstate->continuescan = true;
3563 
3564  /*
3565  * We will end up here repeatedly given a group of tuples > the
3566  * previous array keys and < the now-current keys (for a backwards
3567  * scan it's just the same, though the operators swap positions).
3568  *
3569  * We must avoid allowing this linear search process to scan very
3570  * many tuples from well before the start of tuples matching the
3571  * current array keys (or from well before the point where we'll
3572  * once again have to advance the scan's array keys).
3573  *
3574  * We keep the overhead under control by speculatively "looking
3575  * ahead" to later still-unscanned items from this same leaf page.
3576  * We'll only attempt this once the number of tuples that the
3577  * linear search process has examined starts to get out of hand.
3578  */
3579  pstate->rechecks++;
3580  if (pstate->rechecks >= LOOK_AHEAD_REQUIRED_RECHECKS)
3581  {
3582  /* See if we should skip ahead within the current leaf page */
3583  _bt_checkkeys_look_ahead(scan, pstate, tupnatts, tupdesc);
3584 
3585  /*
3586  * Might have set pstate.skip to a later page offset. When
3587  * that happens then _bt_readpage caller will inexpensively
3588  * skip ahead to a later tuple from the same page (the one
3589  * just after the tuple we successfully "looked ahead" to).
3590  */
3591  }
3592  }
3593 
3594  /* This indextuple doesn't match the current qual, in any case */
3595  return false;
3596  }
3597 
3598  /*
3599  * Caller's tuple is >= the current set of array keys and other equality
3600  * constraint scan keys (or <= if this is a backwards scan). It's now
3601  * clear that we _must_ advance any required array keys in lockstep with
3602  * the scan.
3603  */
3604  return _bt_advance_array_keys(scan, pstate, tuple, tupnatts, tupdesc,
3605  ikey, true);
3606 }
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:1073
static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, bool advancenonrequired, bool prechecked, bool firstmatch, bool *continuescan, int *ikey)
Definition: nbtutils.c:3688
static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, int sktrig, bool sktrig_required)
Definition: nbtutils.c:1802
#define LOOK_AHEAD_REQUIRED_RECHECKS
Definition: nbtutils.c:32
static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, TupleDesc tupdesc, int tupnatts, bool readpagetup, int sktrig, bool *scanBehind)
Definition: nbtutils.c:1558
static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, int tupnatts, TupleDesc tupdesc)
Definition: nbtutils.c:4078
#define RelationGetDescr(relation)
Definition: rel.h:531
ScanDirection
Definition: sdir.h:25
#define BTEqualStrategyNumber
Definition: stratnum.h:31
bool firstmatch
Definition: nbtree.h:1098
bool continuescan
Definition: nbtree.h:1091
IndexTuple finaltup
Definition: nbtree.h:1083
bool prechecked
Definition: nbtree.h:1097
int16 rechecks
Definition: nbtree.h:1104
bool needPrimScan
Definition: nbtree.h:1040
BTScanPosData currPos
Definition: nbtree.h:1069
bool oppositeDirCheck
Definition: nbtree.h:1042
ScanKey keyData
Definition: nbtree.h:1036
ScanDirection dir
Definition: nbtree.h:962
Relation indexRelation
Definition: relscan.h:141
StrategyNumber sk_strategy
Definition: skey.h:68

References _bt_advance_array_keys(), _bt_check_compare(), _bt_checkkeys_look_ahead(), _bt_tuple_before_array_skeys(), Assert, BTEqualStrategyNumber, BTreeTupleGetNAtts, BTReadPageState::continuescan, BTScanOpaqueData::currPos, BTScanPosData::dir, BTReadPageState::finaltup, BTReadPageState::firstmatch, IndexScanDescData::indexRelation, BTScanOpaqueData::keyData, LOOK_AHEAD_REQUIRED_RECHECKS, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, BTReadPageState::prechecked, BTReadPageState::rechecks, RelationGetDescr, res, BTScanOpaqueData::scanBehind, ScanKeyData::sk_strategy, and unlikely.

Referenced by _bt_readpage().
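
_bt_checkkeys() is invoked once per candidate tuple by _bt_readpage(). A simplified, hedged sketch of that per-tuple loop for a forward scan (the real caller also handles pstate->skip, kill-tuple bookkeeping, and posting-list TIDs); scan, page, pstate (a BTReadPageState variable), arrayKeys, minoff and maxoff are assumed to have been set up by the caller:

    for (OffsetNumber offnum = minoff;
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      iid = PageGetItemId(page, offnum);
        IndexTuple  itup = (IndexTuple) PageGetItem(page, iid);
        int         tupnatts = BTreeTupleGetNAtts(itup, scan->indexRelation);

        if (_bt_checkkeys(scan, &pstate, arrayKeys, itup, tupnatts))
        {
            /* tuple matches the (possibly just-advanced) scan keys */
        }

        if (!pstate.continuescan)
            break;              /* no further matches on this page / this scan */
    }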

◆ _bt_checkpage()

void _bt_checkpage ( Relation  rel,
Buffer  buf 
)

Definition at line 797 of file nbtpage.c.

798 {
799  Page page = BufferGetPage(buf);
800 
801  /*
802  * ReadBuffer verifies that every newly-read page passes
803  * PageHeaderIsValid, which means it either contains a reasonably sane
804  * page header or is all-zero. We have to defend against the all-zero
805  * case, however.
806  */
807  if (PageIsNew(page))
808  ereport(ERROR,
809  (errcode(ERRCODE_INDEX_CORRUPTED),
810  errmsg("index \"%s\" contains unexpected zero page at block %u",
811  RelationGetRelationName(rel),
812  BufferGetBlockNumber(buf)),
813  errhint("Please REINDEX it.")));
814 
815  /*
816  * Additionally check that the special area looks sane.
817  */
818  if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
819  ereport(ERROR,
820  (errcode(ERRCODE_INDEX_CORRUPTED),
821  errmsg("index \"%s\" contains corrupted page at block %u",
822  RelationGetRelationName(rel),
823  BufferGetBlockNumber(buf)),
824  errhint("Please REINDEX it.")));
825 }
static uint16 PageGetSpecialSize(Page page)
Definition: bufpage.h:316

References buf, BufferGetBlockNumber(), BufferGetPage(), ereport, errcode(), errhint(), errmsg(), ERROR, MAXALIGN, PageGetSpecialSize(), PageIsNew(), and RelationGetRelationName.

Referenced by _bt_getbuf(), _bt_relandgetbuf(), _bt_search_insert(), bt_recheck_sibling_links(), btvacuumpage(), and palloc_btree_page().
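
Callers apply this check immediately after pinning and locking a block they obtained without going through _bt_getbuf(). A minimal hedged sketch (blkno is an assumed block number obtained elsewhere; BT_READ is nbtree's shared-lock mode):

    Buffer      buf = ReadBuffer(rel, blkno);

    LockBuffer(buf, BT_READ);
    _bt_checkpage(rel, buf);        /* ereport(ERROR) on zeroed or corrupt pages */
    /* ... work with BufferGetPage(buf) ... */
    _bt_relbuf(rel, buf);           /* drops both the lock and the pin */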

◆ _bt_compare()

int32 _bt_compare ( Relation  rel,
BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)

Definition at line 688 of file nbtsearch.c.

692 {
693  TupleDesc itupdesc = RelationGetDescr(rel);
694  BTPageOpaque opaque = BTPageGetOpaque(page);
695  IndexTuple itup;
696  ItemPointer heapTid;
697  ScanKey scankey;
698  int ncmpkey;
699  int ntupatts;
700  int32 result;
701 
702  Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
703  Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
704  Assert(key->heapkeyspace || key->scantid == NULL);
705 
706  /*
707  * Force result ">" if target item is first data item on an internal page
708  * --- see NOTE above.
709  */
710  if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
711  return 1;
712 
713  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
714  ntupatts = BTreeTupleGetNAtts(itup, rel);
715 
716  /*
717  * The scan key is set up with the attribute number associated with each
718  * term in the key. It is important that, if the index is multi-key, the
719  * scan contain the first k key attributes, and that they be in order. If
720  * you think about how multi-key ordering works, you'll understand why
721  * this is.
722  *
723  * We don't test for violation of this condition here, however. The
724  * initial setup for the index scan had better have gotten it right (see
725  * _bt_first).
726  */
727 
728  ncmpkey = Min(ntupatts, key->keysz);
729  Assert(key->heapkeyspace || ncmpkey == key->keysz);
730  Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
731  scankey = key->scankeys;
732  for (int i = 1; i <= ncmpkey; i++)
733  {
734  Datum datum;
735  bool isNull;
736 
737  datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
738 
739  if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
740  {
741  if (isNull)
742  result = 0; /* NULL "=" NULL */
743  else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
744  result = -1; /* NULL "<" NOT_NULL */
745  else
746  result = 1; /* NULL ">" NOT_NULL */
747  }
748  else if (isNull) /* key is NOT_NULL and item is NULL */
749  {
750  if (scankey->sk_flags & SK_BT_NULLS_FIRST)
751  result = 1; /* NOT_NULL ">" NULL */
752  else
753  result = -1; /* NOT_NULL "<" NULL */
754  }
755  else
756  {
757  /*
758  * The sk_func needs to be passed the index value as left arg and
759  * the sk_argument as right arg (they might be of different
760  * types). Since it is convenient for callers to think of
761  * _bt_compare as comparing the scankey to the index item, we have
762  * to flip the sign of the comparison result. (Unless it's a DESC
763  * column, in which case we *don't* flip the sign.)
764  */
765  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
766  scankey->sk_collation,
767  datum,
768  scankey->sk_argument));
769 
770  if (!(scankey->sk_flags & SK_BT_DESC))
771  INVERT_COMPARE_RESULT(result);
772  }
773 
774  /* if the keys are unequal, return the difference */
775  if (result != 0)
776  return result;
777 
778  scankey++;
779  }
780 
781  /*
782  * All non-truncated attributes (other than heap TID) were found to be
783  * equal. Treat truncated attributes as minus infinity when scankey has a
784  * key attribute value that would otherwise be compared directly.
785  *
786  * Note: it doesn't matter if ntupatts includes non-key attributes;
787  * scankey won't, so explicitly excluding non-key attributes isn't
788  * necessary.
789  */
790  if (key->keysz > ntupatts)
791  return 1;
792 
793  /*
794  * Use the heap TID attribute and scantid to try to break the tie. The
795  * rules are the same as any other key attribute -- only the
796  * representation differs.
797  */
798  heapTid = BTreeTupleGetHeapTID(itup);
799  if (key->scantid == NULL)
800  {
801  /*
802  * Forward scans have a scankey that is considered greater than a
803  * truncated pivot tuple if and when the scankey has equal values for
804  * attributes up to and including the least significant untruncated
805  * attribute in tuple. Even attributes that were omitted from the
806  * scan key are considered greater than -inf truncated attributes.
807  * (See _bt_binsrch for an explanation of our backward scan behavior.)
808  *
809  * For example, if an index has the minimum two attributes (single
810  * user key attribute, plus heap TID attribute), and a page's high key
811  * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
812  * will not descend to the page to the left. The search will descend
813  * right instead. The truncated attribute in pivot tuple means that
814  * all non-pivot tuples on the page to the left are strictly < 'foo',
815  * so it isn't necessary to descend left. In other words, search
816  * doesn't have to descend left because it isn't interested in a match
817  * that has a heap TID value of -inf.
818  *
819  * Note: the heap TID part of the test ensures that scankey is being
820  * compared to a pivot tuple with one or more truncated -inf key
821  * attributes. The heap TID attribute is the last key attribute in
822  * every index, of course, but other than that it isn't special.
823  */
824  if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
825  key->heapkeyspace)
826  return 1;
827 
828  /* All provided scankey arguments found to be equal */
829  return 0;
830  }
831 
832  /*
833  * Treat truncated heap TID as minus infinity, since scankey has a key
834  * attribute value (scantid) that would otherwise be compared directly
835  */
836  Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
837  if (heapTid == NULL)
838  return 1;
839 
840  /*
841  * Scankey must be treated as equal to a posting list tuple if its scantid
842  * value falls within the range of the posting list. In all other cases
843  * there can only be a single heap TID value, which is compared directly
844  * with scantid.
845  */
847  result = ItemPointerCompare(key->scantid, heapTid);
848  if (result <= 0 || !BTreeTupleIsPosting(itup))
849  return result;
850  else
851  {
852  result = ItemPointerCompare(key->scantid,
853  BTreeTupleGetMaxHeapTID(itup));
854  if (result > 0)
855  return 1;
856  }
857 
858  return 0;
859 }
#define Min(x, y)
Definition: c.h:1009
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1111
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1149
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:51
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: itup.h:117
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:1118
#define SK_BT_DESC
Definition: nbtree.h:1117
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:664
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:4924
uintptr_t Datum
Definition: postgres.h:64
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:202
#define SK_ISNULL
Definition: skey.h:115
int sk_flags
Definition: skey.h:66
Datum sk_argument
Definition: skey.h:72
FmgrInfo sk_func
Definition: skey.h:71
Oid sk_collation
Definition: skey.h:70
AttrNumber sk_attno
Definition: skey.h:67

References _bt_check_natts(), Assert, BTPageGetOpaque, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPosting(), DatumGetInt32(), FunctionCall2Coll(), i, index_getattr(), IndexRelationGetNumberOfKeyAttributes, INVERT_COMPARE_RESULT, ItemPointerCompare(), sort-test::key, Min, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem(), PageGetItemId(), RelationGetDescr, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_binsrch_insert(), _bt_check_unique(), _bt_findinsertloc(), _bt_moveright(), _bt_search_insert(), bt_rootdescend(), bt_target_page_check(), invariant_g_offset(), invariant_l_nontarget_offset(), invariant_l_offset(), and invariant_leq_offset().
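
A hedged sketch of how this comparison primitive is typically used: build an insertion scan key with _bt_mkscankey() and probe the tuples of a read-locked page. A real lookup would binary-search exactly as _bt_binsrch() does; the linear probe below only illustrates the return-value convention (the result is scankey versus tuple, so cmp <= 0 means the indexed tuple is >= the key). page and opaque are assumed to be a read-locked page and its special area:

    BTScanInsert key = _bt_mkscankey(rel, itup);    /* itup: tuple being located */
    OffsetNumber low = P_FIRSTDATAKEY(opaque);
    OffsetNumber high = PageGetMaxOffsetNumber(page);
    OffsetNumber firstge = InvalidOffsetNumber;

    for (OffsetNumber off = low; off <= high; off = OffsetNumberNext(off))
    {
        if (_bt_compare(rel, key, page, off) <= 0)
        {
            firstge = off;      /* first tuple >= the insertion scan key */
            break;
        }
    }
    pfree(key);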

◆ _bt_conditionallockbuf()

bool _bt_conditionallockbuf ( Relation  rel,
Buffer  buf 
)

Definition at line 1093 of file nbtpage.c.

1094 {
1095  /* ConditionalLockBuffer() asserts that pin is held by this backend */
1096  if (!ConditionalLockBuffer(buf))
1097  return false;
1098 
1099  if (!RelationUsesLocalBuffers(rel))
1100  VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1101 
1102  return true;
1103 }
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:5184

References buf, BufferGetPage(), ConditionalLockBuffer(), RelationUsesLocalBuffers, and VALGRIND_MAKE_MEM_DEFINED.

Referenced by _bt_allocbuf(), and _bt_search_insert().
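
A hedged sketch of the opportunistic-locking pattern this helper supports, loosely modeled on the fastpath in _bt_search_insert() (cached_blkno is an assumed, previously remembered block number):

    Buffer      buf = ReadBuffer(rel, cached_blkno);

    if (_bt_conditionallockbuf(rel, buf))
    {
        /* got the exclusive lock without blocking -- try to use this page */
        /* ... validate the page, insert, etc. ... */
        _bt_relbuf(rel, buf);
    }
    else
    {
        /* someone else holds the lock; drop the pin and take the slow path */
        ReleaseBuffer(buf);
        /* ... descend from the root instead ... */
    }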

◆ _bt_dedup_finish_pending()

Size _bt_dedup_finish_pending ( Page  newpage,
BTDedupState  state 
)

Definition at line 555 of file nbtdedup.c.

556 {
557  OffsetNumber tupoff;
558  Size tuplesz;
559  Size spacesaving;
560 
561  Assert(state->nitems > 0);
562  Assert(state->nitems <= state->nhtids);
563  Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
564 
565  tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
566  if (state->nitems == 1)
567  {
568  /* Use original, unchanged base tuple */
569  tuplesz = IndexTupleSize(state->base);
570  Assert(tuplesz == MAXALIGN(IndexTupleSize(state->base)));
571  Assert(tuplesz <= BTMaxItemSize(newpage));
572  if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
573  false, false) == InvalidOffsetNumber)
574  elog(ERROR, "deduplication failed to add tuple to page");
575 
576  spacesaving = 0;
577  }
578  else
579  {
580  IndexTuple final;
581 
582  /* Form a tuple with a posting list */
583  final = _bt_form_posting(state->base, state->htids, state->nhtids);
584  tuplesz = IndexTupleSize(final);
585  Assert(tuplesz <= state->maxpostingsize);
586 
587  /* Save final number of items for posting list */
588  state->intervals[state->nintervals].nitems = state->nitems;
589 
590  Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
591  Assert(tuplesz <= BTMaxItemSize(newpage));
592  if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
593  false) == InvalidOffsetNumber)
594  elog(ERROR, "deduplication failed to add tuple to page");
595 
596  pfree(final);
597  spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
598  /* Increment nintervals, since we wrote a new posting list tuple */
599  state->nintervals++;
600  Assert(spacesaving > 0 && spacesaving < BLCKSZ);
601  }
602 
603  /* Reset state for next pending posting list */
604  state->nhtids = 0;
605  state->nitems = 0;
606  state->phystupsize = 0;
607 
608  return spacesaving;
609 }
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:471
Pointer Item
Definition: item.h:17
IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
Definition: nbtdedup.c:864

References _bt_form_posting(), Assert, BTMaxItemSize, elog, ERROR, IndexTupleSize, InvalidOffsetNumber, MAXALIGN, OffsetNumberNext, PageAddItem, PageGetMaxOffsetNumber(), and pfree().

Referenced by _bt_dedup_pass(), and btree_xlog_dedup().

◆ _bt_dedup_pass()

void _bt_dedup_pass ( Relation  rel,
Buffer  buf,
IndexTuple  newitem,
Size  newitemsz,
bool  bottomupdedup 
)

Definition at line 58 of file nbtdedup.c.

60 {
61  OffsetNumber offnum,
62  minoff,
63  maxoff;
64  Page page = BufferGetPage(buf);
65  BTPageOpaque opaque = BTPageGetOpaque(page);
66  Page newpage;
67  BTDedupState state;
68  Size pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
69  bool singlevalstrat = false;
70  int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
71 
72  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
73  newitemsz += sizeof(ItemIdData);
74 
75  /*
76  * Initialize deduplication state.
77  *
78  * It would be possible for maxpostingsize (limit on posting list tuple
79  * size) to be set to one third of the page. However, it seems like a
80  * good idea to limit the size of posting lists to one sixth of a page.
81  * That ought to leave us with a good split point when pages full of
82  * duplicates can be split several times.
83  */
84  state = (BTDedupState) palloc(sizeof(BTDedupStateData));
85  state->deduplicate = true;
86  state->nmaxitems = 0;
87  state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
88  /* Metadata about base tuple of current pending posting list */
89  state->base = NULL;
90  state->baseoff = InvalidOffsetNumber;
91  state->basetupsize = 0;
92  /* Metadata about current pending posting list TIDs */
93  state->htids = palloc(state->maxpostingsize);
94  state->nhtids = 0;
95  state->nitems = 0;
96  /* Size of all physical tuples to be replaced by pending posting list */
97  state->phystupsize = 0;
98  /* nintervals should be initialized to zero */
99  state->nintervals = 0;
100 
101  minoff = P_FIRSTDATAKEY(opaque);
102  maxoff = PageGetMaxOffsetNumber(page);
103 
104  /*
105  * Consider applying "single value" strategy, though only if the page
106  * seems likely to be split in the near future
107  */
108  if (!bottomupdedup)
109  singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
110 
111  /*
112  * Deduplicate items from page, and write them to newpage.
113  *
114  * Copy the original page's LSN into newpage copy. This will become the
115  * updated version of the page. We need this because XLogInsert will
116  * examine the LSN and possibly dump it in a page image.
117  */
118  newpage = PageGetTempPageCopySpecial(page);
119  PageSetLSN(newpage, PageGetLSN(page));
120 
121  /* Copy high key, if any */
122  if (!P_RIGHTMOST(opaque))
123  {
124  ItemId hitemid = PageGetItemId(page, P_HIKEY);
125  Size hitemsz = ItemIdGetLength(hitemid);
126  IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid);
127 
128  if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
129  false, false) == InvalidOffsetNumber)
130  elog(ERROR, "deduplication failed to add highkey");
131  }
132 
133  for (offnum = minoff;
134  offnum <= maxoff;
135  offnum = OffsetNumberNext(offnum))
136  {
137  ItemId itemid = PageGetItemId(page, offnum);
138  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
139 
140  Assert(!ItemIdIsDead(itemid));
141 
142  if (offnum == minoff)
143  {
144  /*
145  * No previous/base tuple for the data item -- use the data item
146  * as base tuple of pending posting list
147  */
148  _bt_dedup_start_pending(state, itup, offnum);
149  }
150  else if (state->deduplicate &&
151  _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
152  _bt_dedup_save_htid(state, itup))
153  {
154  /*
155  * Tuple is equal to base tuple of pending posting list. Heap
156  * TID(s) for itup have been saved in state.
157  */
158  }
159  else
160  {
161  /*
162  * Tuple is not equal to pending posting list tuple, or
163  * _bt_dedup_save_htid() opted to not merge current item into
164  * pending posting list for some other reason (e.g., adding more
165  * TIDs would have caused posting list to exceed current
166  * maxpostingsize).
167  *
168  * If state contains pending posting list with more than one item,
169  * form new posting tuple and add it to our temp page (newpage).
170  * Else add pending interval's base tuple to the temp page as-is.
171  */
172  pagesaving += _bt_dedup_finish_pending(newpage, state);
173 
174  if (singlevalstrat)
175  {
176  /*
177  * Single value strategy's extra steps.
178  *
179  * Lower maxpostingsize for sixth and final large posting list
180  * tuple at the point where 5 maxpostingsize-capped tuples
181  * have either been formed or observed.
182  *
183  * When a sixth maxpostingsize-capped item is formed/observed,
184  * stop merging together tuples altogether. The few tuples
185  * that remain at the end of the page won't be merged together
186  * at all (at least not until after a future page split takes
187  * place, when this page's newly allocated right sibling page
188  * gets its first deduplication pass).
189  */
190  if (state->nmaxitems == 5)
191  _bt_singleval_fillfactor(page, state, newitemsz);
192  else if (state->nmaxitems == 6)
193  {
194  state->deduplicate = false;
195  singlevalstrat = false; /* won't be back here */
196  }
197  }
198 
199  /* itup starts new pending posting list */
200  _bt_dedup_start_pending(state, itup, offnum);
201  }
202  }
203 
204  /* Handle the last item */
205  pagesaving += _bt_dedup_finish_pending(newpage, state);
206 
207  /*
208  * If no items suitable for deduplication were found, newpage must be
209  * exactly the same as the original page, so just return from function.
210  *
211  * We could determine whether or not to proceed on the basis the space
212  * savings being sufficient to avoid an immediate page split instead. We
213  * don't do that because there is some small value in nbtsplitloc.c always
214  * operating against a page that is fully deduplicated (apart from
215  * newitem). Besides, most of the cost has already been paid.
216  */
217  if (state->nintervals == 0)
218  {
219  /* cannot leak memory here */
220  pfree(newpage);
221  pfree(state->htids);
222  pfree(state);
223  return;
224  }
225 
226  /*
227  * By here, it's clear that deduplication will definitely go ahead.
228  *
229  * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace
230  * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
231  * But keep things tidy.
232  */
233  if (P_HAS_GARBAGE(opaque))
234  {
235  BTPageOpaque nopaque = BTPageGetOpaque(newpage);
236 
237  nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
238  }
239 
239 
240  START_CRIT_SECTION();
241 
242  PageRestoreTempPage(newpage, page);
243  MarkBufferDirty(buf);
244 
245  /* XLOG stuff */
246  if (RelationNeedsWAL(rel))
247  {
248  XLogRecPtr recptr;
249  xl_btree_dedup xlrec_dedup;
250 
251  xlrec_dedup.nintervals = state->nintervals;
252 
253  XLogBeginInsert();
254  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
255  XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);
256 
257  /*
258  * The intervals array is not in the buffer, but pretend that it is.
259  * When XLogInsert stores the whole buffer, the array need not be
260  * stored too.
261  */
262  XLogRegisterBufData(0, (char *) state->intervals,
263  state->nintervals * sizeof(BTDedupInterval));
264 
265  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
266 
267  PageSetLSN(page, recptr);
268  }
269 
270  END_CRIT_SECTION();
271 
272  /* Local space accounting should agree with page accounting */
273  Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
274 
275  /* cannot leak memory here */
276  pfree(state->htids);
277  pfree(state);
278 }
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2532
void PageRestoreTempPage(Page tempPage, Page oldPage)
Definition: bufpage.c:413
Page PageGetTempPageCopySpecial(Page page)
Definition: bufpage.c:391
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:391
static XLogRecPtr PageGetLSN(const char *page)
Definition: bufpage.h:386
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:197
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
#define INDEX_SIZE_MASK
Definition: itup.h:65
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state, OffsetNumber minoff, IndexTuple newitem)
Definition: nbtdedup.c:782
Size _bt_dedup_finish_pending(Page newpage, BTDedupState state)
Definition: nbtdedup.c:555
static void _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
Definition: nbtdedup.c:822
#define P_HAS_GARBAGE(opaque)
Definition: nbtree.h:226
#define BTP_HAS_GARBAGE
Definition: nbtree.h:82
#define XLOG_BTREE_DEDUP
Definition: nbtxlog.h:33
#define SizeOfBtreeDedup
Definition: nbtxlog.h:174
uint16 btpo_flags
Definition: nbtree.h:67
uint16 nintervals
Definition: nbtxlog.h:169
uint64 XLogRecPtr
Definition: xlogdefs.h:21
void XLogRegisterBufData(uint8 block_id, const char *data, uint32 len)
Definition: xloginsert.c:405
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:242
#define REGBUF_STANDARD
Definition: xloginsert.h:34

References _bt_dedup_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_do_singleval(), _bt_keep_natts_fast(), _bt_singleval_fillfactor(), Assert, BTMaxItemSize, BTP_HAS_GARBAGE, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, buf, BufferGetPage(), elog, END_CRIT_SECTION, ERROR, INDEX_SIZE_MASK, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, ItemIdGetLength, ItemIdIsDead, MarkBufferDirty(), Min, xl_btree_dedup::nintervals, OffsetNumberNext, P_FIRSTDATAKEY, P_HAS_GARBAGE, P_HIKEY, P_RIGHTMOST, PageAddItem, PageGetExactFreeSpace(), PageGetItem(), PageGetItemId(), PageGetLSN(), PageGetMaxOffsetNumber(), PageGetTempPageCopySpecial(), PageRestoreTempPage(), PageSetLSN(), palloc(), pfree(), PG_USED_FOR_ASSERTS_ONLY, REGBUF_STANDARD, RelationNeedsWAL, SizeOfBtreeDedup, START_CRIT_SECTION, XLOG_BTREE_DEDUP, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_delete_or_dedup_one_page().
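
A hedged sketch of the calling convention, modeled on _bt_delete_or_dedup_one_page(): deduplication is only attempted when the index allows it and every column's opclass is "equalimage"-safe, and buf must already be pinned and exclusive-locked:

    /* Sketch (assumption): itup_key, newitem and newitemsz come from the
     * pending insertion that found the leaf page full. */
    if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
        _bt_dedup_pass(rel, buf, newitem, newitemsz, false /* bottomupdedup */);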

◆ _bt_dedup_save_htid()

bool _bt_dedup_save_htid ( BTDedupState  state,
IndexTuple  itup 
)

Definition at line 484 of file nbtdedup.c.

485 {
486  int nhtids;
487  ItemPointer htids;
488  Size mergedtupsz;
489 
490  Assert(!BTreeTupleIsPivot(itup));
491 
492  if (!BTreeTupleIsPosting(itup))
493  {
494  nhtids = 1;
495  htids = &itup->t_tid;
496  }
497  else
498  {
499  nhtids = BTreeTupleGetNPosting(itup);
500  htids = BTreeTupleGetPosting(itup);
501  }
502 
503  /*
504  * Don't append (have caller finish pending posting list as-is) if
505  * appending heap TID(s) from itup would put us over maxpostingsize limit.
506  *
507  * This calculation needs to match the code used within _bt_form_posting()
508  * for new posting list tuples.
509  */
510  mergedtupsz = MAXALIGN(state->basetupsize +
511  (state->nhtids + nhtids) * sizeof(ItemPointerData));
512 
513  if (mergedtupsz > state->maxpostingsize)
514  {
515  /*
516  * Count this as an oversized item for single value strategy, though
517  * only when there are 50 TIDs in the final posting list tuple. This
518  * limit (which is fairly arbitrary) avoids confusion about how many
519  * 1/6 of a page tuples have been encountered/created by the current
520  * deduplication pass.
521  *
522  * Note: We deliberately don't consider which deduplication pass
523  * merged together tuples to create this item (could be a previous
524  * deduplication pass, or current pass). See _bt_do_singleval()
525  * comments.
526  */
527  if (state->nhtids > 50)
528  state->nmaxitems++;
529 
530  return false;
531  }
532 
533  /*
534  * Save heap TIDs to pending posting list tuple -- itup can be merged into
535  * pending posting list
536  */
537  state->nitems++;
538  memcpy(state->htids + state->nhtids, htids,
539  sizeof(ItemPointerData) * nhtids);
540  state->nhtids += nhtids;
541  state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
542 
543  return true;
544 }
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:518
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition: nbtree.h:537

References Assert, BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), IndexTupleSize, MAXALIGN, and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_load(), and btree_xlog_dedup().

◆ _bt_dedup_start_pending()

void _bt_dedup_start_pending ( BTDedupState  state,
IndexTuple  base,
OffsetNumber  baseoff 
)

Definition at line 433 of file nbtdedup.c.

435 {
436  Assert(state->nhtids == 0);
437  Assert(state->nitems == 0);
438  Assert(!BTreeTupleIsPivot(base));
439 
440  /*
441  * Copy heap TID(s) from new base tuple for new candidate posting list
442  * into working state's array
443  */
444  if (!BTreeTupleIsPosting(base))
445  {
446  memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
447  state->nhtids = 1;
448  state->basetupsize = IndexTupleSize(base);
449  }
450  else
451  {
452  int nposting;
453 
454  nposting = BTreeTupleGetNPosting(base);
455  memcpy(state->htids, BTreeTupleGetPosting(base),
456  sizeof(ItemPointerData) * nposting);
457  state->nhtids = nposting;
458  /* basetupsize should not include existing posting list */
459  state->basetupsize = BTreeTupleGetPostingOffset(base);
460  }
461 
462  /*
463  * Save new base tuple itself -- it'll be needed if we actually create a
464  * new posting list from new pending posting list.
465  *
466  * Must maintain physical size of all existing tuples (including line
467  * pointer overhead) so that we can calculate space savings on page.
468  */
469  state->nitems = 1;
470  state->base = base;
471  state->baseoff = baseoff;
472  state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
473  /* Also save baseoff in pending state for interval */
474  state->intervals[state->nintervals].baseoff = state->baseoff;
475 }
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:529

References Assert, BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), IndexTupleSize, MAXALIGN, and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_load(), and btree_xlog_dedup().
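
Taken together, _bt_dedup_start_pending(), _bt_dedup_save_htid(), and _bt_dedup_finish_pending() form a small state machine over a page's tuples, as the full listing of _bt_dedup_pass() above shows. A condensed, hedged sketch of that driving loop; state, newpage, nkeyatts, spacesaving, minoff and maxoff are assumed to be initialized as in _bt_dedup_pass():

    for (OffsetNumber offnum = minoff;
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        IndexTuple  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

        if (offnum == minoff)
            _bt_dedup_start_pending(state, itup, offnum);   /* first base tuple */
        else if (state->deduplicate &&
                 _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
                 _bt_dedup_save_htid(state, itup))
        {
            /* itup's heap TID(s) were absorbed into the pending posting list */
        }
        else
        {
            /* flush the pending interval, then start a new one with itup */
            spacesaving += _bt_dedup_finish_pending(newpage, state);
            _bt_dedup_start_pending(state, itup, offnum);
        }
    }
    spacesaving += _bt_dedup_finish_pending(newpage, state);   /* final interval */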

◆ _bt_delitems_delete_check()

void _bt_delitems_delete_check ( Relation  rel,
Buffer  buf,
Relation  heapRel,
TM_IndexDeleteOp delstate 
)

Definition at line 1513 of file nbtpage.c.

1515 {
1516  Page page = BufferGetPage(buf);
1517  TransactionId snapshotConflictHorizon;
1518  bool isCatalogRel;
1519  OffsetNumber postingidxoffnum = InvalidOffsetNumber;
1520  int ndeletable = 0,
1521  nupdatable = 0;
1522  OffsetNumber deletable[MaxIndexTuplesPerPage];
1523  BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1524 
1525  /* Use tableam interface to determine which tuples to delete first */
1526  snapshotConflictHorizon = table_index_delete_tuples(heapRel, delstate);
1527  isCatalogRel = RelationIsAccessibleInLogicalDecoding(heapRel);
1528 
1529  /* Should not WAL-log snapshotConflictHorizon unless it's required */
1530  if (!XLogStandbyInfoActive())
1531  snapshotConflictHorizon = InvalidTransactionId;
1532 
1533  /*
1534  * Construct a leaf-page-wise description of what _bt_delitems_delete()
1535  * needs to do to physically delete index tuples from the page.
1536  *
1537  * Must sort deltids array to restore leaf-page-wise order (original order
1538  * before call to tableam). This is the order that the loop expects.
1539  *
1540  * Note that deltids array might be a lot smaller now. It might even have
1541  * no entries at all (with bottom-up deletion caller), in which case there
1542  * is nothing left to do.
1543  */
1544  qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
1545  _bt_delitems_cmp);
1546  if (delstate->ndeltids == 0)
1547  {
1548  Assert(delstate->bottomup);
1549  return;
1550  }
1551 
1552  /* We definitely have to delete at least one index tuple (or one TID) */
1553  for (int i = 0; i < delstate->ndeltids; i++)
1554  {
1555  TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
1556  OffsetNumber idxoffnum = dstatus->idxoffnum;
1557  ItemId itemid = PageGetItemId(page, idxoffnum);
1558  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
1559  int nestedi,
1560  nitem;
1561  BTVacuumPosting vacposting;
1562 
1563  Assert(OffsetNumberIsValid(idxoffnum));
1564 
1565  if (idxoffnum == postingidxoffnum)
1566  {
1567  /*
1568  * This deltid entry is a TID from a posting list tuple that has
1569  * already been completely processed
1570  */
1571  Assert(BTreeTupleIsPosting(itup));
1572  Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
1573  &delstate->deltids[i].tid) < 0);
1574  Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),
1575  &delstate->deltids[i].tid) >= 0);
1576  continue;
1577  }
1578 
1579  if (!BTreeTupleIsPosting(itup))
1580  {
1581  /* Plain non-pivot tuple */
1582  Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
1583  if (dstatus->knowndeletable)
1584  deletable[ndeletable++] = idxoffnum;
1585  continue;
1586  }
1587 
1588  /*
1589  * itup is a posting list tuple whose lowest deltids entry (which may
1590  * or may not be for the first TID from itup) is considered here now.
1591  * We should process all of the deltids entries for the posting list
1592  * together now, though (not just the lowest). Remember to skip over
1593  * later itup-related entries during later iterations of outermost
1594  * loop.
1595  */
1596  postingidxoffnum = idxoffnum; /* Remember work in outermost loop */
1597  nestedi = i; /* Initialize for first itup deltids entry */
1598  vacposting = NULL; /* Describes final action for itup */
1599  nitem = BTreeTupleGetNPosting(itup);
1600  for (int p = 0; p < nitem; p++)
1601  {
1602  ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
1603  int ptidcmp = -1;
1604 
1605  /*
1606  * This nested loop reuses work across ptid TIDs taken from itup.
1607  * We take advantage of the fact that both itup's TIDs and deltids
1608  * entries (within a single itup/posting list grouping) must both
1609  * be in ascending TID order.
1610  */
1611  for (; nestedi < delstate->ndeltids; nestedi++)
1612  {
1613  TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
1614  TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
1615 
1616  /* Stop once we get past all itup related deltids entries */
1617  Assert(tdstatus->idxoffnum >= idxoffnum);
1618  if (tdstatus->idxoffnum != idxoffnum)
1619  break;
1620 
1621  /* Skip past non-deletable itup related entries up front */
1622  if (!tdstatus->knowndeletable)
1623  continue;
1624 
1625  /* Entry is first partial ptid match (or an exact match)? */
1626  ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
1627  if (ptidcmp >= 0)
1628  {
1629  /* Greater than or equal (partial or exact) match... */
1630  break;
1631  }
1632  }
1633 
1634  /* ...exact ptid match to a deletable deltids entry? */
1635  if (ptidcmp != 0)
1636  continue;
1637 
1638  /* Exact match for deletable deltids entry -- ptid gets deleted */
1639  if (vacposting == NULL)
1640  {
1641  vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1642  nitem * sizeof(uint16));
1643  vacposting->itup = itup;
1644  vacposting->updatedoffset = idxoffnum;
1645  vacposting->ndeletedtids = 0;
1646  }
1647  vacposting->deletetids[vacposting->ndeletedtids++] = p;
1648  }
1649 
1650  /* Final decision on itup, a posting list tuple */
1651 
1652  if (vacposting == NULL)
1653  {
1654  /* No TIDs to delete from itup -- do nothing */
1655  }
1656  else if (vacposting->ndeletedtids == nitem)
1657  {
1658  /* Straight delete of itup (to delete all TIDs) */
1659  deletable[ndeletable++] = idxoffnum;
1660  /* Turns out we won't need granular information */
1661  pfree(vacposting);
1662  }
1663  else
1664  {
1665  /* Delete some (but not all) TIDs from itup */
1666  Assert(vacposting->ndeletedtids > 0 &&
1667  vacposting->ndeletedtids < nitem);
1668  updatable[nupdatable++] = vacposting;
1669  }
1670  }
1671 
1672  /* Physically delete tuples (or TIDs) using deletable (or updatable) */
1673  _bt_delitems_delete(rel, buf, snapshotConflictHorizon, isCatalogRel,
1674  deletable, ndeletable, updatable, nupdatable);
1675 
1676  /* be tidy */
1677  for (int i = 0; i < nupdatable; i++)
1678  pfree(updatable[i]);
1679 }
unsigned short uint16
Definition: c.h:517
uint32 TransactionId
Definition: c.h:657
bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2)
Definition: itemptr.c:35
#define MaxIndexTuplesPerPage
Definition: itup.h:165
static void _bt_delitems_delete(Relation rel, Buffer buf, TransactionId snapshotConflictHorizon, bool isCatalogRel, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
Definition: nbtpage.c:1284
static int _bt_delitems_cmp(const void *a, const void *b)
Definition: nbtpage.c:1464
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:544
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
#define qsort(a, b, c, d)
Definition: port.h:447
uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.h:911
uint16 ndeletedtids
Definition: nbtree.h:910
IndexTuple itup
Definition: nbtree.h:906
OffsetNumber updatedoffset
Definition: nbtree.h:907
ItemPointerData tid
Definition: tableam.h:212
bool knowndeletable
Definition: tableam.h:219
OffsetNumber idxoffnum
Definition: tableam.h:218
static TransactionId table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
Definition: tableam.h:1365
#define InvalidTransactionId
Definition: transam.h:31

References _bt_delitems_cmp(), _bt_delitems_delete(), Assert, TM_IndexDeleteOp::bottomup, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), buf, BufferGetPage(), BTVacuumPostingData::deletetids, TM_IndexDeleteOp::deltids, i, TM_IndexDelete::id, TM_IndexStatus::idxoffnum, InvalidOffsetNumber, InvalidTransactionId, ItemPointerCompare(), ItemPointerEquals(), BTVacuumPostingData::itup, TM_IndexStatus::knowndeletable, MaxIndexTuplesPerPage, BTVacuumPostingData::ndeletedtids, TM_IndexDeleteOp::ndeltids, OffsetNumberIsValid, PageGetItem(), PageGetItemId(), palloc(), pfree(), qsort, RelationIsAccessibleInLogicalDecoding, TM_IndexDeleteOp::status, IndexTupleData::t_tid, table_index_delete_tuples(), TM_IndexDelete::tid, BTVacuumPostingData::updatedoffset, and XLogStandbyInfoActive.

Referenced by _bt_bottomupdel_pass(), and _bt_simpledel_pass().
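
Callers describe the candidate deletions in a TM_IndexDeleteOp before handing it over. A simplified, hedged sketch of that preparation, loosely modeled on _bt_simpledel_pass(); only the fields the code above reads are filled in, and deadoffsets[]/ndead are assumed to list LP_DEAD-marked leaf items collected by the caller:

    TM_IndexDeleteOp delstate;

    delstate.bottomup = false;
    delstate.ndeltids = 0;
    delstate.deltids = palloc(ndead * sizeof(TM_IndexDelete));
    delstate.status = palloc(ndead * sizeof(TM_IndexStatus));

    for (int i = 0; i < ndead; i++)
    {
        ItemId      iid = PageGetItemId(page, deadoffsets[i]);
        IndexTuple  itup = (IndexTuple) PageGetItem(page, iid);

        /* (a posting list tuple would contribute one deltids entry per TID) */
        delstate.deltids[delstate.ndeltids].tid = itup->t_tid;
        delstate.deltids[delstate.ndeltids].id = delstate.ndeltids;
        delstate.status[delstate.ndeltids].idxoffnum = deadoffsets[i];
        delstate.status[delstate.ndeltids].knowndeletable = true;
        delstate.ndeltids++;
    }

    _bt_delitems_delete_check(rel, buf, heapRel, &delstate);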

◆ _bt_delitems_vacuum()

void _bt_delitems_vacuum ( Relation  rel,
Buffer  buf,
OffsetNumber deletable,
int  ndeletable,
BTVacuumPosting updatable,
int  nupdatable 
)

Definition at line 1154 of file nbtpage.c.

1157 {
1158  Page page = BufferGetPage(buf);
1159  BTPageOpaque opaque;
1160  bool needswal = RelationNeedsWAL(rel);
1161  char *updatedbuf = NULL;
1162  Size updatedbuflen = 0;
1163  OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1164 
1165  /* Shouldn't be called unless there's something to do */
1166  Assert(ndeletable > 0 || nupdatable > 0);
1167 
1168  /* Generate new version of posting lists without deleted TIDs */
1169  if (nupdatable > 0)
1170  updatedbuf = _bt_delitems_update(updatable, nupdatable,
1171  updatedoffsets, &updatedbuflen,
1172  needswal);
1173 
1174  /* No ereport(ERROR) until changes are logged */
1175  START_CRIT_SECTION();
1176 
1177  /*
1178  * Handle posting tuple updates.
1179  *
1180  * Deliberately do this before handling simple deletes. If we did it the
1181  * other way around (i.e. WAL record order -- simple deletes before
1182  * updates) then we'd have to make compensating changes to the 'updatable'
1183  * array of offset numbers.
1184  *
1185  * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
1186  * happens to already be set. It's important that we not interfere with
1187  * any future simple index tuple deletion operations.
1188  */
1189  for (int i = 0; i < nupdatable; i++)
1190  {
1191  OffsetNumber updatedoffset = updatedoffsets[i];
1192  IndexTuple itup;
1193  Size itemsz;
1194 
1195  itup = updatable[i]->itup;
1196  itemsz = MAXALIGN(IndexTupleSize(itup));
1197  if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
1198  itemsz))
1199  elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1200  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1201  }
1202 
1203  /* Now handle simple deletes of entire tuples */
1204  if (ndeletable > 0)
1205  PageIndexMultiDelete(page, deletable, ndeletable);
1206 
1207  /*
1208  * We can clear the vacuum cycle ID since this page has certainly been
1209  * processed by the current vacuum scan.
1210  */
1211  opaque = BTPageGetOpaque(page);
1212  opaque->btpo_cycleid = 0;
1213 
1214  /*
1215  * Clear the BTP_HAS_GARBAGE page flag.
1216  *
1217  * This flag indicates the presence of LP_DEAD items on the page (though
1218  * not reliably). Note that we only rely on it with pg_upgrade'd
1219  * !heapkeyspace indexes. That's why clearing it here won't usually
1220  * interfere with simple index tuple deletion.
1221  */
1222  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1223 
1224  MarkBufferDirty(buf);
1225 
1226  /* XLOG stuff */
1227  if (needswal)
1228  {
1229  XLogRecPtr recptr;
1230  xl_btree_vacuum xlrec_vacuum;
1231 
1232  xlrec_vacuum.ndeleted = ndeletable;
1233  xlrec_vacuum.nupdated = nupdatable;
1234 
1235  XLogBeginInsert();
1236  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1237  XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
1238 
1239  if (ndeletable > 0)
1240  XLogRegisterBufData(0, (char *) deletable,
1241  ndeletable * sizeof(OffsetNumber));
1242 
1243  if (nupdatable > 0)
1244  {
1245  XLogRegisterBufData(0, (char *) updatedoffsets,
1246  nupdatable * sizeof(OffsetNumber));
1247  XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1248  }
1249 
1250  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
1251 
1252  PageSetLSN(page, recptr);
1253  }
1254 
1255  END_CRIT_SECTION();
1256 
1257  /* can't leak memory here */
1258  if (updatedbuf != NULL)
1259  pfree(updatedbuf);
1260  /* free tuples allocated within _bt_delitems_update() */
1261  for (int i = 0; i < nupdatable; i++)
1262  pfree(updatable[i]->itup);
1263 }
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:1150
bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, Item newtup, Size newsize)
Definition: bufpage.c:1394
#define PANIC
Definition: elog.h:42
static char * _bt_delitems_update(BTVacuumPosting *updatable, int nupdatable, OffsetNumber *updatedoffsets, Size *updatedbuflen, bool needswal)
Definition: nbtpage.c:1405
#define SizeOfBtreeVacuum
Definition: nbtxlog.h:234
#define XLOG_BTREE_VACUUM
Definition: nbtxlog.h:39
BTCycleId btpo_cycleid
Definition: nbtree.h:68
uint16 ndeleted
Definition: nbtxlog.h:222
uint16 nupdated
Definition: nbtxlog.h:223

References _bt_delitems_update(), Assert, BTP_HAS_GARBAGE, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, buf, BufferGetBlockNumber(), BufferGetPage(), elog, END_CRIT_SECTION, i, IndexTupleSize, BTVacuumPostingData::itup, MarkBufferDirty(), MAXALIGN, MaxIndexTuplesPerPage, xl_btree_vacuum::ndeleted, xl_btree_vacuum::nupdated, PageIndexMultiDelete(), PageIndexTupleOverwrite(), PageSetLSN(), PANIC, pfree(), REGBUF_STANDARD, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeVacuum, START_CRIT_SECTION, XLOG_BTREE_VACUUM, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by btvacuumpage().
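
A hedged sketch of the VACUUM-side call, modeled on btvacuumpage(): the page scan first collects offsets of wholly dead tuples and per-posting-list updates, then applies them in one WAL-logged step while holding an exclusive lock on buf:

    /* deletable[], ndeletable, updatable[] and nupdatable are assumed to have
     * been filled in by the preceding scan of the leaf page. */
    if (ndeletable > 0 || nupdatable > 0)
        _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
                            updatable, nupdatable);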

◆ _bt_doinsert()

bool _bt_doinsert ( Relation  rel,
IndexTuple  itup,
IndexUniqueCheck  checkUnique,
bool  indexUnchanged,
Relation  heapRel 
)

Definition at line 102 of file nbtinsert.c.

105 {
106  bool is_unique = false;
107  BTInsertStateData insertstate;
108  BTScanInsert itup_key;
109  BTStack stack;
110  bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
111 
112  /* we need an insertion scan key to do our search, so build one */
113  itup_key = _bt_mkscankey(rel, itup);
114 
115  if (checkingunique)
116  {
117  if (!itup_key->anynullkeys)
118  {
119  /* No (heapkeyspace) scantid until uniqueness established */
120  itup_key->scantid = NULL;
121  }
122  else
123  {
124  /*
125  * Scan key for new tuple contains NULL key values. Bypass
126  * checkingunique steps. They are unnecessary because core code
127  * considers NULL unequal to every value, including NULL.
128  *
129  * This optimization avoids O(N^2) behavior within the
130  * _bt_findinsertloc() heapkeyspace path when a unique index has a
131  * large number of "duplicates" with NULL key values.
132  */
133  checkingunique = false;
134  /* Tuple is unique in the sense that core code cares about */
135  Assert(checkUnique != UNIQUE_CHECK_EXISTING);
136  is_unique = true;
137  }
138  }
139 
140  /*
141  * Fill in the BTInsertState working area, to track the current page and
142  * position within the page to insert on.
143  *
144  * Note that itemsz is passed down to lower level code that deals with
145  * inserting the item. It must be MAXALIGN()'d. This ensures that space
146  * accounting code consistently considers the alignment overhead that we
147  * expect PageAddItem() will add later. (Actually, index_form_tuple() is
148  * already conservative about alignment, but we don't rely on that from
149  * this distance. Besides, preserving the "true" tuple size in index
150  * tuple headers for the benefit of nbtsplitloc.c might happen someday.
151  * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
152  */
153  insertstate.itup = itup;
154  insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
155  insertstate.itup_key = itup_key;
156  insertstate.bounds_valid = false;
157  insertstate.buf = InvalidBuffer;
158  insertstate.postingoff = 0;
159 
160 search:
161 
162  /*
163  * Find and lock the leaf page that the tuple should be added to by
164  * searching from the root page. insertstate.buf will hold a buffer that
165  * is locked in exclusive mode afterwards.
166  */
167  stack = _bt_search_insert(rel, heapRel, &insertstate);
168 
169  /*
170  * checkingunique inserts are not allowed to go ahead when two tuples with
171  * equal key attribute values would be visible to new MVCC snapshots once
172  * the xact commits. Check for conflicts in the locked page/buffer (if
173  * needed) here.
174  *
175  * It might be necessary to check a page to the right in _bt_check_unique,
176  * though that should be very rare. In practice the first page the value
177  * could be on (with scantid omitted) is almost always also the only page
178  * that a matching tuple might be found on. This is due to the behavior
179  * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
180  * only be allowed to cross a page boundary when there is no candidate
181  * leaf page split point that avoids it. Also, _bt_check_unique can use
182  * the leaf page high key to determine that there will be no duplicates on
183  * the right sibling without actually visiting it (it uses the high key in
184  * cases where the new item happens to belong at the far right of the leaf
185  * page).
186  *
187  * NOTE: obviously, _bt_check_unique can only detect keys that are already
188  * in the index; so it cannot defend against concurrent insertions of the
189  * same key. We protect against that by means of holding a write lock on
190  * the first page the value could be on, with omitted/-inf value for the
191  * implicit heap TID tiebreaker attribute. Any other would-be inserter of
192  * the same key must acquire a write lock on the same page, so only one
193  * would-be inserter can be making the check at one time. Furthermore,
194  * once we are past the check we hold write locks continuously until we
195  * have performed our insertion, so no later inserter can fail to see our
196  * insertion. (This requires some care in _bt_findinsertloc.)
197  *
198  * If we must wait for another xact, we release the lock while waiting,
199  * and then must perform a new search.
200  *
201  * For a partial uniqueness check, we don't wait for the other xact. Just
202  * let the tuple in and return false for possibly non-unique, or true for
203  * definitely unique.
204  */
205  if (checkingunique)
206  {
207  TransactionId xwait;
208  uint32 speculativeToken;
209 
210  xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
211  &is_unique, &speculativeToken);
212 
213  if (unlikely(TransactionIdIsValid(xwait)))
214  {
215  /* Have to wait for the other guy ... */
216  _bt_relbuf(rel, insertstate.buf);
217  insertstate.buf = InvalidBuffer;
218 
219  /*
220  * If it's a speculative insertion, wait for it to finish (ie. to
221  * go ahead with the insertion, or kill the tuple). Otherwise
222  * wait for the transaction to finish as usual.
223  */
224  if (speculativeToken)
225  SpeculativeInsertionWait(xwait, speculativeToken);
226  else
227  XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
228 
229  /* start over... */
230  if (stack)
231  _bt_freestack(stack);
232  goto search;
233  }
234 
235  /* Uniqueness is established -- restore heap tid as scantid */
236  if (itup_key->heapkeyspace)
237  itup_key->scantid = &itup->t_tid;
238  }
239 
240  if (checkUnique != UNIQUE_CHECK_EXISTING)
241  {
242  OffsetNumber newitemoff;
243 
244  /*
245  * The only conflict predicate locking cares about for indexes is when
246  * an index tuple insert conflicts with an existing lock. We don't
247  * know the actual page we're going to insert on for sure just yet in
248  * checkingunique and !heapkeyspace cases, but it's okay to use the
249  * first page the value could be on (with scantid omitted) instead.
250  */
251  CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
252 
253  /*
254  * Do the insertion. Note that insertstate contains cached binary
255  * search bounds established within _bt_check_unique when insertion is
256  * checkingunique.
257  */
258  newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
259  indexUnchanged, stack, heapRel);
260  _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
261  stack, itup, insertstate.itemsz, newitemoff,
262  insertstate.postingoff, false);
263  }
264  else
265  {
266  /* just release the buffer */
267  _bt_relbuf(rel, insertstate.buf);
268  }
269 
270  /* be tidy */
271  if (stack)
272  _bt_freestack(stack);
273  pfree(itup_key);
274 
275  return is_unique;
276 }
unsigned int uint32
Definition: c.h:518
@ UNIQUE_CHECK_NO
Definition: genam.h:117
@ UNIQUE_CHECK_EXISTING
Definition: genam.h:120
void XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, XLTW_Oper oper)
Definition: lmgr.c:656
void SpeculativeInsertionWait(TransactionId xid, uint32 token)
Definition: lmgr.c:813
@ XLTW_InsertIndex
Definition: lmgr.h:31
static BTStack _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
Definition: nbtinsert.c:317
static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, bool indexUnchanged, BTStack stack, Relation heapRel)
Definition: nbtinsert.c:815
static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, Size itemsz, OffsetNumber newitemoff, int postingoff, bool split_only_page)
Definition: nbtinsert.c:1105
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition: nbtinsert.c:408
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:221
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:129
void CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno)
Definition: predicate.c:4326
IndexTuple itup
Definition: nbtree.h:811
ItemPointer scantid
Definition: nbtree.h:791
bool heapkeyspace
Definition: nbtree.h:786
bool anynullkeys
Definition: nbtree.h:788
#define TransactionIdIsValid(xid)
Definition: transam.h:41

References _bt_check_unique(), _bt_findinsertloc(), _bt_freestack(), _bt_insertonpg(), _bt_mkscankey(), _bt_relbuf(), _bt_search_insert(), BTScanInsertData::anynullkeys, Assert, BTInsertStateData::bounds_valid, BTInsertStateData::buf, BufferGetBlockNumber(), CheckForSerializableConflictIn(), BTScanInsertData::heapkeyspace, IndexTupleSize, InvalidBuffer, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, MAXALIGN, pfree(), BTInsertStateData::postingoff, BTScanInsertData::scantid, SpeculativeInsertionWait(), IndexTupleData::t_tid, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_NO, unlikely, XactLockTableWait(), and XLTW_InsertIndex.

Referenced by btinsert().
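
A hedged sketch of the aminsert-level wrapper, modeled on btinsert(): the access method builds an IndexTuple from the datum arrays, stamps it with the new heap tuple's TID, and lets _bt_doinsert() handle the descent, uniqueness check, and page insertion:

    IndexTuple  itup;
    bool        result;

    /* values/isnull and ht_ctid are the arguments aminsert receives */
    itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
    itup->t_tid = *ht_ctid;

    result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);

    pfree(itup);
    return result;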

◆ _bt_end_vacuum()

void _bt_end_vacuum ( Relation  rel)

Definition at line 4486 of file nbtutils.c.

4487 {
4488  int i;
4489 
4490  LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
4491 
4492  /* Find the array entry */
4493  for (i = 0; i < btvacinfo->num_vacuums; i++)
4494  {
4495  BTOneVacInfo *vac = &btvacinfo->vacuums[i];
4496 
4497  if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
4498  vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
4499  {
4500  /* Remove it by shifting down the last entry */
4501  *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
4502  btvacinfo->num_vacuums--;
4503  break;
4504  }
4505  }
4506 
4507  LWLockRelease(BtreeVacuumLock);
4508 }
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1168
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1781
@ LW_EXCLUSIVE
Definition: lwlock.h:114
static BTVacInfo * btvacinfo
Definition: nbtutils.c:4382
LockRelId relid
Definition: nbtutils.c:4370
int num_vacuums
Definition: nbtutils.c:4377
BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtutils.c:4379
LockRelId lockRelId
Definition: rel.h:46
Oid relId
Definition: rel.h:40
Oid dbId
Definition: rel.h:41
LockInfoData rd_lockInfo
Definition: rel.h:114

References btvacinfo, LockRelId::dbId, i, LockInfoData::lockRelId, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTVacInfo::num_vacuums, RelationData::rd_lockInfo, BTOneVacInfo::relid, LockRelId::relId, and BTVacInfo::vacuums.

Referenced by _bt_end_vacuum_callback(), and btbulkdelete().

◆ _bt_end_vacuum_callback()

void _bt_end_vacuum_callback ( int  code,
Datum  arg 
)

Definition at line 4514 of file nbtutils.c.

4515 {
4516  _bt_end_vacuum((Relation) DatumGetPointer(arg));
4517 }
void _bt_end_vacuum(Relation rel)
Definition: nbtutils.c:4486
void * arg
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312

References _bt_end_vacuum(), arg, and DatumGetPointer().

Referenced by btbulkdelete().
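
The callback form exists so that the vacuum-in-progress array entry is removed even if the index scan errors out. A hedged sketch of the surrounding pattern, modeled on btbulkdelete() (the btvacuumscan() call in the comment stands in for the real per-page work):

    BTCycleId   cycleid;

    /* Establish the cleanup callback before allocating our cycle ID */
    PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    {
        cycleid = _bt_start_vacuum(rel);

        /* ... btvacuumscan(info, stats, callback, callback_state, cycleid) ... */
    }
    PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    _bt_end_vacuum(rel);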

◆ _bt_findsplitloc()

OffsetNumber _bt_findsplitloc ( Relation  rel,
Page  origpage,
OffsetNumber  newitemoff,
Size  newitemsz,
IndexTuple  newitem,
bool newitemonleft 
)

Definition at line 129 of file nbtsplitloc.c.

135 {
136  BTPageOpaque opaque;
137  int leftspace,
138  rightspace,
139  olddataitemstotal,
140  olddataitemstoleft,
141  perfectpenalty,
142  leaffillfactor;
143  FindSplitData state;
144  FindSplitStrat strategy;
145  ItemId itemid;
146  OffsetNumber offnum,
147  maxoff,
148  firstrightoff;
149  double fillfactormult;
150  bool usemult;
151  SplitPoint leftpage,
152  rightpage;
153 
154  opaque = BTPageGetOpaque(origpage);
155  maxoff = PageGetMaxOffsetNumber(origpage);
156 
157  /* Total free space available on a btree page, after fixed overhead */
158  leftspace = rightspace =
159  PageGetPageSize(origpage) - SizeOfPageHeaderData -
160  MAXALIGN(sizeof(BTPageOpaqueData));
161 
162  /* The right page will have the same high key as the old page */
163  if (!P_RIGHTMOST(opaque))
164  {
165  itemid = PageGetItemId(origpage, P_HIKEY);
166  rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
167  sizeof(ItemIdData));
168  }
169 
170  /* Count up total space in data items before actually scanning 'em */
171  olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(origpage);
172  leaffillfactor = BTGetFillFactor(rel);
173 
174  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
175  newitemsz += sizeof(ItemIdData);
176  state.rel = rel;
177  state.origpage = origpage;
178  state.newitem = newitem;
179  state.newitemsz = newitemsz;
180  state.is_leaf = P_ISLEAF(opaque);
181  state.is_rightmost = P_RIGHTMOST(opaque);
182  state.leftspace = leftspace;
183  state.rightspace = rightspace;
184  state.olddataitemstotal = olddataitemstotal;
185  state.minfirstrightsz = SIZE_MAX;
186  state.newitemoff = newitemoff;
187 
188  /* newitem cannot be a posting list item */
189  Assert(!BTreeTupleIsPosting(newitem));
190 
191  /*
192  * nsplits should never exceed maxoff because there will be at most as
193  * many candidate split points as there are points _between_ tuples, once
194  * you imagine that the new item is already on the original page (the
195  * final number of splits may be slightly lower because not all points
196  * between tuples will be legal).
197  */
198  state.maxsplits = maxoff;
199  state.splits = palloc(sizeof(SplitPoint) * state.maxsplits);
200  state.nsplits = 0;
201 
202  /*
203  * Scan through the data items and calculate space usage for a split at
204  * each possible position
205  */
206  olddataitemstoleft = 0;
207 
208  for (offnum = P_FIRSTDATAKEY(opaque);
209  offnum <= maxoff;
210  offnum = OffsetNumberNext(offnum))
211  {
212  Size itemsz;
213 
214  itemid = PageGetItemId(origpage, offnum);
215  itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
216 
217  /*
218  * When item offset number is not newitemoff, neither side of the
219  * split can be newitem. Record a split after the previous data item
220  * from original page, but before the current data item from original
221  * page. (_bt_recsplitloc() will reject the split when there are no
222  * previous items, which we rely on.)
223  */
224  if (offnum < newitemoff)
225  _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
226  else if (offnum > newitemoff)
227  _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
228  else
229  {
230  /*
231  * Record a split after all "offnum < newitemoff" original page
232  * data items, but before newitem
233  */
234  _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
235 
236  /*
237  * Record a split after newitem, but before data item from
238  * original page at offset newitemoff/current offset
239  */
240  _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
241  }
242 
243  olddataitemstoleft += itemsz;
244  }
245 
246  /*
247  * Record a split after all original page data items, but before newitem.
248  * (Though only when it's possible that newitem will end up alone on new
249  * right page.)
250  */
251  Assert(olddataitemstoleft == olddataitemstotal);
252  if (newitemoff > maxoff)
253  _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0);
254 
255  /*
256  * I believe it is not possible to fail to find a feasible split, but just
257  * in case ...
258  */
259  if (state.nsplits == 0)
260  elog(ERROR, "could not find a feasible split point for index \"%s\"",
261  RelationGetRelationName(rel));
262 
263  /*
264  * Start search for a split point among list of legal split points. Give
265  * primary consideration to equalizing available free space in each half
266  * of the split initially (start with default strategy), while applying
267  * rightmost and split-after-new-item optimizations where appropriate.
268  * Either of the two other fallback strategies may be required for cases
269  * with a large number of duplicates around the original/space-optimal
270  * split point.
271  *
272  * Default strategy gives some weight to suffix truncation in deciding a
273  * split point on leaf pages. It attempts to select a split point where a
274  * distinguishing attribute appears earlier in the new high key for the
275  * left side of the split, in order to maximize the number of trailing
276  * attributes that can be truncated away. Only candidate split points
277  * that imply an acceptable balance of free space on each side are
278  * considered. See _bt_defaultinterval().
279  */
280  if (!state.is_leaf)
281  {
282  /* fillfactormult only used on rightmost page */
283  usemult = state.is_rightmost;
284  fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0;
285  }
286  else if (state.is_rightmost)
287  {
288  /* Rightmost leaf page -- fillfactormult always used */
289  usemult = true;
290  fillfactormult = leaffillfactor / 100.0;
291  }
292  else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
293  {
294  /*
295  * New item inserted at rightmost point among a localized grouping on
296  * a leaf page -- apply "split after new item" optimization, either by
297  * applying leaf fillfactor multiplier, or by choosing the exact split
298  * point that leaves newitem as lastleft. (usemult is set for us.)
299  */
300  if (usemult)
301  {
302  /* fillfactormult should be set based on leaf fillfactor */
303  fillfactormult = leaffillfactor / 100.0;
304  }
305  else
306  {
307  /* find precise split point after newitemoff */
308  for (int i = 0; i < state.nsplits; i++)
309  {
310  SplitPoint *split = state.splits + i;
311 
312  if (split->newitemonleft &&
313  newitemoff == split->firstrightoff)
314  {
315  pfree(state.splits);
316  *newitemonleft = true;
317  return newitemoff;
318  }
319  }
320 
321  /*
322  * Cannot legally split after newitemoff; proceed with split
323  * without using fillfactor multiplier. This is defensive, and
324  * should never be needed in practice.
325  */
326  fillfactormult = 0.50;
327  }
328  }
329  else
330  {
331  /* Other leaf page. 50:50 page split. */
332  usemult = false;
333  /* fillfactormult not used, but be tidy */
334  fillfactormult = 0.50;
335  }
336 
337  /*
338  * Save leftmost and rightmost splits for page before original ordinal
339  * sort order is lost by delta/fillfactormult sort
340  */
341  leftpage = state.splits[0];
342  rightpage = state.splits[state.nsplits - 1];
343 
344  /* Give split points a fillfactormult-wise delta, and sort on deltas */
345  _bt_deltasortsplits(&state, fillfactormult, usemult);
346 
347  /* Determine split interval for default strategy */
348  state.interval = _bt_defaultinterval(&state);
349 
350  /*
351  * Determine if default strategy/split interval will produce a
352  * sufficiently distinguishing split, or if we should change strategies.
353  * Alternative strategies change the range of split points that are
354  * considered acceptable (split interval), and possibly change
355  * fillfactormult, in order to deal with pages with a large number of
356  * duplicates gracefully.
357  *
358  * Pass low and high splits for the entire page (actually, they're for an
359  * imaginary version of the page that includes newitem). These are used
360  * when the initial split interval encloses split points that are full of
361  * duplicates, and we need to consider if it's even possible to avoid
362  * appending a heap TID.
363  */
364  perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);
365 
366  if (strategy == SPLIT_DEFAULT)
367  {
368  /*
369  * Default strategy worked out (always works out with internal page).
370  * Original split interval still stands.
371  */
372  }
373 
374  /*
375  * Many duplicates strategy is used when a heap TID would otherwise be
376  * appended, but the page isn't completely full of logical duplicates.
377  *
378  * The split interval is widened to include all legal candidate split
379  * points. There might be as few as two distinct values in the whole-page
380  * split interval, though it's also possible that most of the values on
381  * the page are unique. The final split point will either be to the
382  * immediate left or to the immediate right of the group of duplicate
383  * tuples that enclose the first/delta-optimal split point (perfect
384  * penalty was set so that the lowest delta split point that avoids
385  * appending a heap TID will be chosen). Maximizing the number of
386  * attributes that can be truncated away is not a goal of the many
387  * duplicates strategy.
388  *
389  * Single value strategy is used when it is impossible to avoid appending
390  * a heap TID. It arranges to leave the left page very full. This
391  * maximizes space utilization in cases where tuples with the same
392  * attribute values span many pages. Newly inserted duplicates will tend
393  * to have higher heap TID values, so we'll end up splitting to the right
394  * consistently. (Single value strategy is harmless though not
395  * particularly useful with !heapkeyspace indexes.)
396  */
397  else if (strategy == SPLIT_MANY_DUPLICATES)
398  {
399  Assert(state.is_leaf);
400  /* Shouldn't try to truncate away extra user attributes */
401  Assert(perfectpenalty ==
402  IndexRelationGetNumberOfKeyAttributes(rel));
403  /* No need to resort splits -- no change in fillfactormult/deltas */
404  state.interval = state.nsplits;
405  }
406  else if (strategy == SPLIT_SINGLE_VALUE)
407  {
408  Assert(state.is_leaf);
409  /* Split near the end of the page */
410  usemult = true;
411  fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
412  /* Resort split points with new delta */
413  _bt_deltasortsplits(&state, fillfactormult, usemult);
414  /* Appending a heap TID is unavoidable, so interval of 1 is fine */
415  state.interval = 1;
416  }
417 
418  /*
419  * Search among acceptable split points (using final split interval) for
420  * the entry that has the lowest penalty, and is therefore expected to
421  * maximize fan-out. Sets *newitemonleft for us.
422  */
423  firstrightoff = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
424  strategy);
425  pfree(state.splits);
426 
427  return firstrightoff;
428 }
#define BTREE_SINGLEVAL_FILLFACTOR
Definition: nbtree.h:202
#define BTGetFillFactor(relation)
Definition: nbtree.h:1128
#define BTREE_NONLEAF_FILLFACTOR
Definition: nbtree.h:201
static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult, bool usemult)
Definition: nbtsplitloc.c:566
static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage, SplitPoint *rightpage, FindSplitStrat *strategy)
Definition: nbtsplitloc.c:934
static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, int leaffillfactor, bool *usemult)
Definition: nbtsplitloc.c:630
static void _bt_recsplitloc(FindSplitData *state, OffsetNumber firstrightoff, bool newitemonleft, int olddataitemstoleft, Size firstrightofforigpagetuplesz)
Definition: nbtsplitloc.c:449
FindSplitStrat
Definition: nbtsplitloc.c:21
@ SPLIT_DEFAULT
Definition: nbtsplitloc.c:23
@ SPLIT_MANY_DUPLICATES
Definition: nbtsplitloc.c:24
@ SPLIT_SINGLE_VALUE
Definition: nbtsplitloc.c:25
static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft, FindSplitStrat strategy)
Definition: nbtsplitloc.c:788
static int _bt_defaultinterval(FindSplitData *state)
Definition: nbtsplitloc.c:876
bool newitemonleft
Definition: nbtsplitloc.c:37
OffsetNumber firstrightoff
Definition: nbtsplitloc.c:36

References _bt_afternewitemoff(), _bt_bestsplitloc(), _bt_defaultinterval(), _bt_deltasortsplits(), _bt_recsplitloc(), _bt_strategy(), Assert, BTGetFillFactor, BTPageGetOpaque, BTREE_NONLEAF_FILLFACTOR, BTREE_SINGLEVAL_FILLFACTOR, BTreeTupleIsPosting(), elog, ERROR, SplitPoint::firstrightoff, i, IndexRelationGetNumberOfKeyAttributes, ItemIdGetLength, MAXALIGN, SplitPoint::newitemonleft, OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_RIGHTMOST, PageGetExactFreeSpace(), PageGetItemId(), PageGetMaxOffsetNumber(), PageGetPageSize(), palloc(), pfree(), RelationGetRelationName, SizeOfPageHeaderData, SPLIT_DEFAULT, SPLIT_MANY_DUPLICATES, and SPLIT_SINGLE_VALUE.

Referenced by _bt_split().
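
A hedged sketch of the call site in _bt_split(): the page that is about to overflow, the would-be insertion position, and the new tuple go in; the offset of the first tuple bound for the right sibling and the side that receives the new tuple come out (the wrapper function here is illustrative):

/*
 * Hedged sketch of how _bt_split() consults _bt_findsplitloc().
 * Only the call itself is real; the wrapper is illustrative.
 */
#include "postgres.h"
#include "access/nbtree.h"

static OffsetNumber
sketch_choose_split(Relation rel, Buffer buf, OffsetNumber newitemoff,
					IndexTuple newitem, Size newitemsz, bool *newitemonleft)
{
	Page		origpage = BufferGetPage(buf);

	/* newitemsz is expected to be MAXALIGN()'d but to exclude the ItemId */
	return _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
							newitem, newitemonleft);
}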

◆ _bt_finish_split()

void _bt_finish_split ( Relation  rel,
Relation  heaprel,
Buffer  lbuf,
BTStack  stack 
)

Definition at line 2241 of file nbtinsert.c.

2242 {
2243  Page lpage = BufferGetPage(lbuf);
2244  BTPageOpaque lpageop = BTPageGetOpaque(lpage);
2245  Buffer rbuf;
2246  Page rpage;
2247  BTPageOpaque rpageop;
2248  bool wasroot;
2249  bool wasonly;
2250 
2251  Assert(P_INCOMPLETE_SPLIT(lpageop));
2252  Assert(heaprel != NULL);
2253 
2254  /* Lock right sibling, the one missing the downlink */
2255  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2256  rpage = BufferGetPage(rbuf);
2257  rpageop = BTPageGetOpaque(rpage);
2258 
2259  /* Could this be a root split? */
2260  if (!stack)
2261  {
2262  Buffer metabuf;
2263  Page metapg;
2264  BTMetaPageData *metad;
2265 
2266  /* acquire lock on the metapage */
2267  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2268  metapg = BufferGetPage(metabuf);
2269  metad = BTPageGetMeta(metapg);
2270 
2271  wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
2272 
2273  _bt_relbuf(rel, metabuf);
2274  }
2275  else
2276  wasroot = false;
2277 
2278  /* Was this the only page on the level before split? */
2279  wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2280 
2281  elog(DEBUG1, "finishing incomplete split of %u/%u",
2282  BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2283 
2284  _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
2285 }
static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf, Buffer rbuf, BTStack stack, bool isroot, bool isonly)
Definition: nbtinsert.c:2099
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:845
#define BTPageGetMeta(p)
Definition: nbtree.h:121
#define P_LEFTMOST(opaque)
Definition: nbtree.h:218
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:227
#define BTREE_METAPAGE
Definition: nbtree.h:148
#define BT_WRITE
Definition: nbtree.h:720
BlockNumber btm_root
Definition: nbtree.h:107
BlockNumber btpo_next
Definition: nbtree.h:65

References _bt_getbuf(), _bt_insert_parent(), _bt_relbuf(), Assert, BT_WRITE, BTMetaPageData::btm_root, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTREE_METAPAGE, BufferGetBlockNumber(), BufferGetPage(), DEBUG1, elog, P_INCOMPLETE_SPLIT, P_LEFTMOST, and P_RIGHTMOST.

Referenced by _bt_getstackbuf(), _bt_moveright(), and _bt_stepright().
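
Callers all follow the same pattern: having write-locked a page and found BTP_INCOMPLETE_SPLIT set, they finish the split before doing anything else with the page. A hedged sketch of that pattern (the lock-upgrade and re-read bookkeeping of the real callers is simplified away):

/*
 * Hedged sketch: finish an interrupted split before relying on the page.
 * Real callers (_bt_moveright() etc.) upgrade to a write lock first and
 * re-read the page afterwards; that bookkeeping is omitted here.
 */
#include "postgres.h"
#include "access/nbtree.h"

static void
sketch_fixup_incomplete_split(Relation rel, Relation heaprel,
							  Buffer buf, BTStack stack)
{
	Page		page = BufferGetPage(buf);
	BTPageOpaque opaque = BTPageGetOpaque(page);

	if (P_INCOMPLETE_SPLIT(opaque))
	{
		/* requires a write lock on buf; the buffer is released for us */
		_bt_finish_split(rel, heaprel, buf, stack);
	}
}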

◆ _bt_first()

bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 882 of file nbtsearch.c.

883 {
884  Relation rel = scan->indexRelation;
885  BTScanOpaque so = (BTScanOpaque) scan->opaque;
886  BTStack stack;
887  OffsetNumber offnum;
888  BTScanInsertData inskey;
889  ScanKey startKeys[INDEX_MAX_KEYS];
890  ScanKeyData notnullkeys[INDEX_MAX_KEYS];
891  int keysz = 0;
892  StrategyNumber strat_total;
893 
894  Assert(!BTScanPosIsValid(so->currPos));
895 
896  /*
897  * Examine the scan keys and eliminate any redundant keys; also mark the
898  * keys that must be matched to continue the scan.
899  */
900  _bt_preprocess_keys(scan);
901 
902  /*
903  * Quit now if _bt_preprocess_keys() discovered that the scan keys can
904  * never be satisfied (eg, x == 1 AND x > 2).
905  */
906  if (!so->qual_ok)
907  {
908  _bt_parallel_done(scan);
909  return false;
910  }
911 
912  /*
913  * For parallel scans, get the starting page from shared state. If the
914  * scan has not started, proceed to find out first leaf page in the usual
915  * way while keeping other participating processes waiting. If the scan
916  * has already begun, use the page number from the shared structure.
917  *
918  * When a parallel scan has another primitive index scan scheduled, a
919  * parallel worker will seize the scan for that purpose now. This is
920  * similar to the case where the top-level scan hasn't started.
921  */
922  if (scan->parallel_scan != NULL)
923  {
924  BlockNumber blkno,
925  lastcurrblkno;
926 
927  if (!_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
928  return false;
929 
930  /*
931  * Successfully seized the scan, which _bt_readfirstpage or possibly
932  * _bt_readnextpage will release (unless the scan ends right away, in
933  * which case we'll call _bt_parallel_done directly).
934  *
935  * Initialize arrays (when _bt_parallel_seize didn't already set up
936  * the next primitive index scan).
937  */
938  if (so->numArrayKeys && !so->needPrimScan)
939  _bt_start_array_keys(scan, dir);
940 
941  Assert(blkno != P_NONE);
942  if (blkno != InvalidBlockNumber)
943  {
944  Assert(!so->needPrimScan);
945 
946  /*
947  * We anticipated starting another primitive scan, but some other
948  * worker beat us to it
949  */
950  if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
951  return false;
952 
953  _bt_returnitem(scan, so);
954  return true;
955  }
956  }
957  else if (so->numArrayKeys && !so->needPrimScan)
958  {
959  /*
960  * First _bt_first call (for current btrescan) without parallelism.
961  *
962  * Initialize arrays, and the corresponding scan keys that were just
963  * output by _bt_preprocess_keys.
964  */
965  _bt_start_array_keys(scan, dir);
966  }
967 
968  /*
969  * Count an indexscan for stats, now that we know that we'll call
970  * _bt_search/_bt_endpoint below
971  */
972  pgstat_count_index_scan(rel);
973 
974  /*----------
975  * Examine the scan keys to discover where we need to start the scan.
976  *
977  * We want to identify the keys that can be used as starting boundaries;
978  * these are =, >, or >= keys for a forward scan or =, <, <= keys for
979  * a backwards scan. We can use keys for multiple attributes so long as
980  * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
981  * a > or < boundary or find an attribute with no boundary (which can be
982  * thought of as the same as "> -infinity"), we can't use keys for any
983  * attributes to its right, because it would break our simplistic notion
984  * of what initial positioning strategy to use.
985  *
986  * When the scan keys include cross-type operators, _bt_preprocess_keys
987  * may not be able to eliminate redundant keys; in such cases we will
988  * arbitrarily pick a usable one for each attribute. This is correct
989  * but possibly not optimal behavior. (For example, with keys like
990  * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
991  * x=5 would be more efficient.) Since the situation only arises given
992  * a poorly-worded query plus an incomplete opfamily, live with it.
993  *
994  * When both equality and inequality keys appear for a single attribute
995  * (again, only possible when cross-type operators appear), we *must*
996  * select one of the equality keys for the starting point, because
997  * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
998  * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
999  * start at x=4, we will fail and stop before reaching x=10. If multiple
1000  * equality quals survive preprocessing, however, it doesn't matter which
1001  * one we use --- by definition, they are either redundant or
1002  * contradictory.
1003  *
1004  * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
1005  * If the index stores nulls at the end of the index we'll be starting
1006  * from, and we have no boundary key for the column (which means the key
1007  * we deduced NOT NULL from is an inequality key that constrains the other
1008  * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
1009  * use as a boundary key. If we didn't do this, we might find ourselves
1010  * traversing a lot of null entries at the start of the scan.
1011  *
1012  * In this loop, row-comparison keys are treated the same as keys on their
1013  * first (leftmost) columns. We'll add on lower-order columns of the row
1014  * comparison below, if possible.
1015  *
1016  * The selected scan keys (at most one per index column) are remembered by
1017  * storing their addresses into the local startKeys[] array.
1018  *
1019  * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
1020  * the next primitive index scan (for scans with array keys) based in part
1021  * on an understanding of how it'll enable us to reposition the scan.
1022  * They're directly aware of how we'll sometimes cons up an explicit
1023  * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a
1024  * symmetric "deduce NOT NULL" rule of their own. This allows top-level
1025  * scans to skip large groups of NULLs through repeated deductions about
1026  * key strictness (for a required inequality key) and whether NULLs in the
1027  * key's index column are stored last or first (relative to non-NULLs).
1028  * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
1029  * need to be kept in sync.
1030  *----------
1031  */
1032  strat_total = BTEqualStrategyNumber;
1033  if (so->numberOfKeys > 0)
1034  {
1035  AttrNumber curattr;
1036  ScanKey chosen;
1037  ScanKey impliesNN;
1038  ScanKey cur;
1039 
1040  /*
1041  * chosen is the so-far-chosen key for the current attribute, if any.
1042  * We don't cast the decision in stone until we reach keys for the
1043  * next attribute.
1044  */
1045  cur = so->keyData;
1046  curattr = 1;
1047  chosen = NULL;
1048  /* Also remember any scankey that implies a NOT NULL constraint */
1049  impliesNN = NULL;
1050 
1051  /*
1052  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
1053  * pass to handle after-last-key processing. Actual exit from the
1054  * loop is at one of the "break" statements below.
1055  */
1056  for (int i = 0;; cur++, i++)
1057  {
1058  if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1059  {
1060  /*
1061  * Done looking at keys for curattr. If we didn't find a
1062  * usable boundary key, see if we can deduce a NOT NULL key.
1063  */
1064  if (chosen == NULL && impliesNN != NULL &&
1065  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1066  ScanDirectionIsForward(dir) :
1067  ScanDirectionIsBackward(dir)))
1068  {
1069  /* Yes, so build the key in notnullkeys[keysz] */
1070  chosen = &notnullkeys[keysz];
1071  ScanKeyEntryInitialize(chosen,
1072  (SK_SEARCHNOTNULL | SK_ISNULL |
1073  (impliesNN->sk_flags &
1074  (SK_BT_DESC | SK_BT_NULLS_FIRST))),
1075  curattr,
1076  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1077  BTGreaterStrategyNumber :
1078  BTLessStrategyNumber),
1079  InvalidOid,
1080  InvalidOid,
1081  InvalidOid,
1082  (Datum) 0);
1083  }
1084 
1085  /*
1086  * If we still didn't find a usable boundary key, quit; else
1087  * save the boundary key pointer in startKeys.
1088  */
1089  if (chosen == NULL)
1090  break;
1091  startKeys[keysz++] = chosen;
1092 
1093  /* Quit if we have stored a > or < key */
1094  strat_total = chosen->sk_strategy;
1095  if (strat_total == BTGreaterStrategyNumber ||
1096  strat_total == BTLessStrategyNumber)
1097  break;
1098 
1099  /*
1100  * Done if that was the last attribute, or if next key is not
1101  * in sequence (implying no boundary key is available for the
1102  * next attribute).
1103  */
1104  if (i >= so->numberOfKeys ||
1105  cur->sk_attno != curattr + 1)
1106  break;
1107 
1108  /*
1109  * Reset for next attr.
1110  */
1111  curattr = cur->sk_attno;
1112  chosen = NULL;
1113  impliesNN = NULL;
1114  }
1115 
1116  /*
1117  * Can we use this key as a starting boundary for this attr?
1118  *
1119  * If not, does it imply a NOT NULL constraint? (Because
1120  * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1121  * *any* inequality key works for that; we need not test.)
1122  */
1123  switch (cur->sk_strategy)
1124  {
1125  case BTLessStrategyNumber:
1126  case BTLessEqualStrategyNumber:
1127  if (chosen == NULL)
1128  {
1129  if (ScanDirectionIsBackward(dir))
1130  chosen = cur;
1131  else
1132  impliesNN = cur;
1133  }
1134  break;
1135  case BTEqualStrategyNumber:
1136  /* override any non-equality choice */
1137  chosen = cur;
1138  break;
1139  case BTGreaterEqualStrategyNumber:
1140  case BTGreaterStrategyNumber:
1141  if (chosen == NULL)
1142  {
1143  if (ScanDirectionIsForward(dir))
1144  chosen = cur;
1145  else
1146  impliesNN = cur;
1147  }
1148  break;
1149  }
1150  }
1151  }
1152 
1153  /*
1154  * If we found no usable boundary keys, we have to start from one end of
1155  * the tree. Walk down that edge to the first or last key, and scan from
1156  * there.
1157  *
1158  * Note: calls _bt_readfirstpage for us, which releases the parallel scan.
1159  */
1160  if (keysz == 0)
1161  return _bt_endpoint(scan, dir);
1162 
1163  /*
1164  * We want to start the scan somewhere within the index. Set up an
1165  * insertion scankey we can use to search for the boundary point we
1166  * identified above. The insertion scankey is built using the keys
1167  * identified by startKeys[]. (Remaining insertion scankey fields are
1168  * initialized after initial-positioning scan keys are finalized.)
1169  */
1170  Assert(keysz <= INDEX_MAX_KEYS);
1171  for (int i = 0; i < keysz; i++)
1172  {
1173  ScanKey cur = startKeys[i];
1174 
1175  Assert(cur->sk_attno == i + 1);
1176 
1177  if (cur->sk_flags & SK_ROW_HEADER)
1178  {
1179  /*
1180  * Row comparison header: look to the first row member instead.
1181  *
1182  * The member scankeys are already in insertion format (ie, they
1183  * have sk_func = 3-way-comparison function), but we have to watch
1184  * out for nulls, which _bt_preprocess_keys didn't check. A null
1185  * in the first row member makes the condition unmatchable, just
1186  * like qual_ok = false.
1187  */
1188  ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1189 
1190  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1191  if (subkey->sk_flags & SK_ISNULL)
1192  {
1193  _bt_parallel_done(scan);
1194  return false;
1195  }
1196  memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1197 
1198  /*
1199  * If the row comparison is the last positioning key we accepted,
1200  * try to add additional keys from the lower-order row members.
1201  * (If we accepted independent conditions on additional index
1202  * columns, we use those instead --- doesn't seem worth trying to
1203  * determine which is more restrictive.) Note that this is OK
1204  * even if the row comparison is of ">" or "<" type, because the
1205  * condition applied to all but the last row member is effectively
1206  * ">=" or "<=", and so the extra keys don't break the positioning
1207  * scheme. But, by the same token, if we aren't able to use all
1208  * the row members, then the part of the row comparison that we
1209  * did use has to be treated as just a ">=" or "<=" condition, and
1210  * so we'd better adjust strat_total accordingly.
1211  */
1212  if (i == keysz - 1)
1213  {
1214  bool used_all_subkeys = false;
1215 
1216  Assert(!(subkey->sk_flags & SK_ROW_END));
1217  for (;;)
1218  {
1219  subkey++;
1220  Assert(subkey->sk_flags & SK_ROW_MEMBER);
1221  if (subkey->sk_attno != keysz + 1)
1222  break; /* out-of-sequence, can't use it */
1223  if (subkey->sk_strategy != cur->sk_strategy)
1224  break; /* wrong direction, can't use it */
1225  if (subkey->sk_flags & SK_ISNULL)
1226  break; /* can't use null keys */
1227  Assert(keysz < INDEX_MAX_KEYS);
1228  memcpy(inskey.scankeys + keysz, subkey,
1229  sizeof(ScanKeyData));
1230  keysz++;
1231  if (subkey->sk_flags & SK_ROW_END)
1232  {
1233  used_all_subkeys = true;
1234  break;
1235  }
1236  }
1237  if (!used_all_subkeys)
1238  {
1239  switch (strat_total)
1240  {
1241  case BTLessStrategyNumber:
1242  strat_total = BTLessEqualStrategyNumber;
1243  break;
1244  case BTGreaterStrategyNumber:
1245  strat_total = BTGreaterEqualStrategyNumber;
1246  break;
1247  }
1248  }
1249  break; /* done with outer loop */
1250  }
1251  }
1252  else
1253  {
1254  /*
1255  * Ordinary comparison key. Transform the search-style scan key
1256  * to an insertion scan key by replacing the sk_func with the
1257  * appropriate btree comparison function.
1258  *
1259  * If scankey operator is not a cross-type comparison, we can use
1260  * the cached comparison function; otherwise gotta look it up in
1261  * the catalogs. (That can't lead to infinite recursion, since no
1262  * indexscan initiated by syscache lookup will use cross-data-type
1263  * operators.)
1264  *
1265  * We support the convention that sk_subtype == InvalidOid means
1266  * the opclass input type; this is a hack to simplify life for
1267  * ScanKeyInit().
1268  */
1269  if (cur->sk_subtype == rel->rd_opcintype[i] ||
1270  cur->sk_subtype == InvalidOid)
1271  {
1272  FmgrInfo *procinfo;
1273 
1274  procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1275  ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1276  cur->sk_flags,
1277  cur->sk_attno,
1278  InvalidStrategy,
1279  cur->sk_subtype,
1280  cur->sk_collation,
1281  procinfo,
1282  cur->sk_argument);
1283  }
1284  else
1285  {
1286  RegProcedure cmp_proc;
1287 
1288  cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1289  rel->rd_opcintype[i],
1290  cur->sk_subtype,
1291  BTORDER_PROC);
1292  if (!RegProcedureIsValid(cmp_proc))
1293  elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1294  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1295  cur->sk_attno, RelationGetRelationName(rel));
1296  ScanKeyEntryInitialize(inskey.scankeys + i,
1297  cur->sk_flags,
1298  cur->sk_attno,
1299  InvalidStrategy,
1300  cur->sk_subtype,
1301  cur->sk_collation,
1302  cmp_proc,
1303  cur->sk_argument);
1304  }
1305  }
1306  }
1307 
1308  /*----------
1309  * Examine the selected initial-positioning strategy to determine exactly
1310  * where we need to start the scan, and set flag variables to control the
1311  * initial descent by _bt_search (and our _bt_binsrch call for the leaf
1312  * page _bt_search returns).
1313  *----------
1314  */
1315  _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1316  inskey.anynullkeys = false; /* unused */
1317  inskey.scantid = NULL;
1318  inskey.keysz = keysz;
1319  switch (strat_total)
1320  {
1321  case BTLessStrategyNumber:
1322 
1323  inskey.nextkey = false;
1324  inskey.backward = true;
1325  break;
1326 
1327  case BTLessEqualStrategyNumber:
1328 
1329  inskey.nextkey = true;
1330  inskey.backward = true;
1331  break;
1332 
1333  case BTEqualStrategyNumber:
1334 
1335  /*
1336  * If a backward scan was specified, need to start with last equal
1337  * item not first one.
1338  */
1339  if (ScanDirectionIsBackward(dir))
1340  {
1341  /*
1342  * This is the same as the <= strategy
1343  */
1344  inskey.nextkey = true;
1345  inskey.backward = true;
1346  }
1347  else
1348  {
1349  /*
1350  * This is the same as the >= strategy
1351  */
1352  inskey.nextkey = false;
1353  inskey.backward = false;
1354  }
1355  break;
1356 
1357  case BTGreaterEqualStrategyNumber:
1358 
1359  /*
1360  * Find first item >= scankey
1361  */
1362  inskey.nextkey = false;
1363  inskey.backward = false;
1364  break;
1365 
1366  case BTGreaterStrategyNumber:
1367 
1368  /*
1369  * Find first item > scankey
1370  */
1371  inskey.nextkey = true;
1372  inskey.backward = false;
1373  break;
1374 
1375  default:
1376  /* can't get here, but keep compiler quiet */
1377  elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1378  return false;
1379  }
1380 
1381  /*
1382  * Use the manufactured insertion scan key to descend the tree and
1383  * position ourselves on the target leaf page.
1384  */
1385  Assert(ScanDirectionIsBackward(dir) == inskey.backward);
1386  stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1387 
1388  /* don't need to keep the stack around... */
1389  _bt_freestack(stack);
1390 
1391  if (!BufferIsValid(so->currPos.buf))
1392  {
1393  /*
1394  * We only get here if the index is completely empty. Lock relation
1395  * because nothing finer to lock exists. Without a buffer lock, it's
1396  * possible for another transaction to insert data between
1397  * _bt_search() and PredicateLockRelation(). We have to try again
1398  * after taking the relation-level predicate lock, to close a narrow
1399  * window where we wouldn't scan concurrently inserted tuples, but the
1400  * writer wouldn't see our predicate lock.
1401  */
1402  if (IsolationIsSerializable())
1403  {
1404  PredicateLockRelation(rel, scan->xs_snapshot);
1405  stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1406  _bt_freestack(stack);
1407  }
1408 
1409  if (!BufferIsValid(so->currPos.buf))
1410  {
1411  _bt_parallel_done(scan);
1412  return false;
1413  }
1414  }
1415 
1416  /* position to the precise item on the page */
1417  offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);
1418 
1419  /*
1420  * Now load data from the first page of the scan (usually the page
1421  * currently in so->currPos.buf).
1422  *
1423  * If inskey.nextkey = false and inskey.backward = false, offnum is
1424  * positioned at the first non-pivot tuple >= inskey.scankeys.
1425  *
1426  * If inskey.nextkey = false and inskey.backward = true, offnum is
1427  * positioned at the last non-pivot tuple < inskey.scankeys.
1428  *
1429  * If inskey.nextkey = true and inskey.backward = false, offnum is
1430  * positioned at the first non-pivot tuple > inskey.scankeys.
1431  *
1432  * If inskey.nextkey = true and inskey.backward = true, offnum is
1433  * positioned at the last non-pivot tuple <= inskey.scankeys.
1434  *
1435  * It's possible that _bt_binsrch returned an offnum that is out of bounds
1436  * for the page. For example, when inskey is both < the leaf page's high
1437  * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
1438  */
1439  if (!_bt_readfirstpage(scan, offnum, dir))
1440  return false;
1441 
1442  _bt_returnitem(scan, so);
1443  return true;
1444 }
int16 AttrNumber
Definition: attnum.h:21
#define RegProcedureIsValid(p)
Definition: c.h:782
regproc RegProcedure
Definition: c.h:655
struct cursor * cur
Definition: ecpg.c:28
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:862
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition: nbtpage.c:739
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
Definition: nbtree.c:605
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:774
#define BTORDER_PROC
Definition: nbtree.h:707
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:1010
#define P_NONE
Definition: nbtree.h:212
#define BT_READ
Definition: nbtree.h:719
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
Definition: nbtsearch.c:2213
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:343
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2547
static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
Definition: nbtsearch.c:2127
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition: nbtsearch.c:102
static void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
Definition: nbtsearch.c:1998
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
Definition: nbtutils.c:1352
void _bt_preprocess_keys(IndexScanDesc scan)
Definition: nbtutils.c:2530
#define INDEX_MAX_KEYS
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:664
#define InvalidOid
Definition: postgres_ext.h:36
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2566
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
#define ScanDirectionIsForward(direction)
Definition: sdir.h:64
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:50
#define SK_ROW_HEADER
Definition: skey.h:117
#define SK_ROW_MEMBER
Definition: skey.h:118
#define SK_SEARCHNOTNULL
Definition: skey.h:122
#define SK_ROW_END
Definition: skey.h:119
ScanKeyData * ScanKey
Definition: skey.h:75
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
#define InvalidStrategy
Definition: stratnum.h:24
#define BTLessStrategyNumber
Definition: stratnum.h:29
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
Buffer buf
Definition: nbtree.h:953
Definition: fmgr.h:57
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:189
struct SnapshotData * xs_snapshot
Definition: relscan.h:142
#define IsolationIsSerializable()
Definition: xact.h:52

References _bt_binsrch(), _bt_endpoint(), _bt_freestack(), _bt_metaversion(), _bt_parallel_done(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readfirstpage(), _bt_readnextpage(), _bt_returnitem(), _bt_search(), _bt_start_array_keys(), Assert, BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosIsValid, BTScanPosData::buf, BufferIsValid(), cur, BTScanOpaqueData::currPos, DatumGetPointer(), elog, ERROR, get_opfamily_proc(), i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, InvalidBlockNumber, InvalidOid, InvalidStrategy, IsolationIsSerializable, BTScanOpaqueData::keyData, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numberOfKeys, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, and IndexScanDescData::xs_snapshot.

Referenced by btgetbitmap(), and btgettuple().
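
btgettuple() and btgetbitmap() drive the scan by calling _bt_first() once to position on the first match and then stepping with _bt_next(). A hedged sketch of that loop (kill-tuple handling and array-key restarts are omitted; the counting is illustrative):

/*
 * Hedged sketch of the caller-side loop around _bt_first()/_bt_next().
 * Each successful call leaves the matching heap TID in the scan descriptor.
 */
#include "postgres.h"
#include "access/nbtree.h"
#include "access/relscan.h"

static int64
sketch_count_matches(IndexScanDesc scan, ScanDirection dir)
{
	int64		ntids = 0;
	bool		found = _bt_first(scan, dir);	/* descend and position */

	while (found)
	{
		ntids++;						/* scan->xs_heaptid is valid here */
		found = _bt_next(scan, dir);	/* step to the next match */
	}
	return ntids;
}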

◆ _bt_form_posting()

IndexTuple _bt_form_posting ( IndexTuple  base,
ItemPointer  htids,
int  nhtids 
)

Definition at line 864 of file nbtdedup.c.

865 {
866  uint32 keysize,
867  newsize;
868  IndexTuple itup;
869 
870  if (BTreeTupleIsPosting(base))
871  keysize = BTreeTupleGetPostingOffset(base);
872  else
873  keysize = IndexTupleSize(base);
874 
875  Assert(!BTreeTupleIsPivot(base));
876  Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
877  Assert(keysize == MAXALIGN(keysize));
878 
879  /* Determine final size of new tuple */
880  if (nhtids > 1)
881  newsize = MAXALIGN(keysize +
882  nhtids * sizeof(ItemPointerData));
883  else
884  newsize = keysize;
885 
886  Assert(newsize <= INDEX_SIZE_MASK);
887  Assert(newsize == MAXALIGN(newsize));
888 
889  /* Allocate memory using palloc0() (matches index_form_tuple()) */
890  itup = palloc0(newsize);
891  memcpy(itup, base, keysize);
892  itup->t_info &= ~INDEX_SIZE_MASK;
893  itup->t_info |= newsize;
894  if (nhtids > 1)
895  {
896  /* Form posting list tuple */
897  BTreeTupleSetPosting(itup, nhtids, keysize);
898  memcpy(BTreeTupleGetPosting(itup), htids,
899  sizeof(ItemPointerData) * nhtids);
900  Assert(_bt_posting_valid(itup));
901  }
902  else
903  {
904  /* Form standard non-pivot tuple */
905  itup->t_info &= ~INDEX_ALT_TID_MASK;
906  ItemPointerCopy(htids, &itup->t_tid);
907  Assert(ItemPointerIsValid(&itup->t_tid));
908  }
909 
910  return itup;
911 }
#define PG_UINT16_MAX
Definition: c.h:592
static void ItemPointerCopy(const ItemPointerData *fromPointer, ItemPointerData *toPointer)
Definition: itemptr.h:172
static bool ItemPointerIsValid(const ItemPointerData *pointer)
Definition: itemptr.h:83
void * palloc0(Size size)
Definition: mcxt.c:1347
static void BTreeTupleSetPosting(IndexTuple itup, uint16 nhtids, int postingoffset)
Definition: nbtree.h:504
#define INDEX_ALT_TID_MASK
Definition: nbtree.h:459
unsigned short t_info
Definition: itup.h:49

References Assert, BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTreeTupleSetPosting(), INDEX_ALT_TID_MASK, INDEX_SIZE_MASK, IndexTupleSize, ItemPointerCopy(), ItemPointerIsValid(), MAXALIGN, palloc0(), PG_UINT16_MAX, IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_dedup_finish_pending(), _bt_sort_dedup_finish_pending(), and bt_posting_plain_tuple().
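
A hedged example of building a posting list tuple from a key-only base tuple and a caller-supplied TID array (real callers pass the TIDs accumulated during deduplication; the inputs here are illustrative):

/*
 * Hedged sketch: form a posting list tuple.  htids must already be sorted
 * in ascending TID order; with nhtids == 1 a plain non-pivot tuple results.
 */
#include "postgres.h"
#include "access/nbtree.h"

static IndexTuple
sketch_make_posting(IndexTuple base, ItemPointerData *htids, int nhtids)
{
	IndexTuple	posting = _bt_form_posting(base, htids, nhtids);

	Assert(nhtids == 1 || BTreeTupleIsPosting(posting));
	return posting;		/* palloc0()'d copy; caller eventually pfree()s it */
}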

◆ _bt_freestack()

void _bt_freestack ( BTStack  stack)

Definition at line 221 of file nbtutils.c.

222 {
223  BTStack ostack;
224 
225  while (stack != NULL)
226  {
227  ostack = stack;
228  stack = stack->bts_parent;
229  pfree(ostack);
230  }
231 }
struct BTStackData * bts_parent
Definition: nbtree.h:736

References BTStackData::bts_parent, and pfree().

Referenced by _bt_doinsert(), _bt_first(), and bt_rootdescend().
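
A hedged example of the usual pairing with _bt_search(): the parent stack only matters while a split or parent re-find is still possible, so callers free it as soon as that possibility is gone:

/*
 * Hedged sketch: descend with _bt_search() and release the parent stack.
 * Error handling and any use of the leaf page are omitted.
 */
#include "postgres.h"
#include "access/nbtree.h"

static void
sketch_descend_and_release(Relation rel, BTScanInsert key)
{
	Buffer		buf;
	BTStack		stack = _bt_search(rel, NULL, key, &buf, BT_READ);

	/* ... inspect the read-locked leaf page in buf ... */

	_bt_freestack(stack);		/* frees every BTStackData in the chain */
	if (BufferIsValid(buf))
		_bt_relbuf(rel, buf);
}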

◆ _bt_get_endpoint()

Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost 
)

Definition at line 2464 of file nbtsearch.c.

2465 {
2466  Buffer buf;
2467  Page page;
2468  BTPageOpaque opaque;
2469  OffsetNumber offnum;
2470  BlockNumber blkno;
2471  IndexTuple itup;
2472 
2473  /*
2474  * If we are looking for a leaf page, okay to descend from fast root;
2475  * otherwise better descend from true root. (There is no point in being
2476  * smarter about intermediate levels.)
2477  */
2478  if (level == 0)
2479  buf = _bt_getroot(rel, NULL, BT_READ);
2480  else
2481  buf = _bt_gettrueroot(rel);
2482 
2483  if (!BufferIsValid(buf))
2484  return InvalidBuffer;
2485 
2486  page = BufferGetPage(buf);
2487  opaque = BTPageGetOpaque(page);
2488 
2489  for (;;)
2490  {
2491  /*
2492  * If we landed on a deleted page, step right to find a live page
2493  * (there must be one). Also, if we want the rightmost page, step
2494  * right if needed to get to it (this could happen if the page split
2495  * since we obtained a pointer to it).
2496  */
2497  while (P_IGNORE(opaque) ||
2498  (rightmost && !P_RIGHTMOST(opaque)))
2499  {
2500  blkno = opaque->btpo_next;
2501  if (blkno == P_NONE)
2502  elog(ERROR, "fell off the end of index \"%s\"",
2503  RelationGetRelationName(rel));
2504  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2505  page = BufferGetPage(buf);
2506  opaque = BTPageGetOpaque(page);
2507  }
2508 
2509  /* Done? */
2510  if (opaque->btpo_level == level)
2511  break;
2512  if (opaque->btpo_level < level)
2513  ereport(ERROR,
2514  (errcode(ERRCODE_INDEX_CORRUPTED),
2515  errmsg_internal("btree level %u not found in index \"%s\"",
2516  level, RelationGetRelationName(rel))));
2517 
2518  /* Descend to leftmost or rightmost child page */
2519  if (rightmost)
2520  offnum = PageGetMaxOffsetNumber(page);
2521  else
2522  offnum = P_FIRSTDATAKEY(opaque);
2523 
2524  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2525  blkno = BTreeTupleGetDownLink(itup);
2526 
2527  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2528  page = BufferGetPage(buf);
2529  opaque = BTPageGetOpaque(page);
2530  }
2531 
2532  return buf;
2533 }
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1003
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:580
Buffer _bt_getroot(Relation rel, Relation heaprel, int access)
Definition: nbtpage.c:344
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:556
uint32 btpo_level
Definition: nbtree.h:66

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), buf, BufferGetPage(), BufferIsValid(), elog, ereport, errcode(), errmsg_internal(), ERROR, InvalidBuffer, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and RelationGetRelationName.

Referenced by _bt_endpoint(), and _bt_insert_parent().
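
A hedged example: requesting the leftmost leaf page (level 0, rightmost = false), as _bt_endpoint() does when a scan has no usable boundary keys:

/*
 * Hedged sketch: fetch the leftmost leaf page.  The returned buffer is
 * pinned and read-locked; InvalidBuffer means the index is empty.
 */
#include "postgres.h"
#include "access/nbtree.h"

static void
sketch_visit_leftmost_leaf(Relation rel)
{
	Buffer		buf = _bt_get_endpoint(rel, 0, false);

	if (!BufferIsValid(buf))
		return;					/* completely empty index */

	/* ... examine BufferGetPage(buf) ... */

	_bt_relbuf(rel, buf);		/* drop lock and pin */
}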

◆ _bt_getbuf()

Buffer _bt_getbuf ( Relation  rel,
BlockNumber  blkno,
int  access 
)

Definition at line 845 of file nbtpage.c.

846 {
847  Buffer buf;
848 
849  Assert(BlockNumberIsValid(blkno));
850 
851  /* Read an existing block of the relation */
852  buf = ReadBuffer(rel, blkno);
853  _bt_lockbuf(rel, buf, access);
854  _bt_checkpage(rel, buf);
855 
856  return buf;
857 }
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:797
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1039
short access
Definition: preproc-type.c:36

References _bt_checkpage(), _bt_lockbuf(), Assert, BlockNumberIsValid(), buf, and ReadBuffer().

Referenced by _bt_finish_split(), _bt_getroot(), _bt_getrootheight(), _bt_getstackbuf(), _bt_gettrueroot(), _bt_insertonpg(), _bt_killitems(), _bt_leftsib_splitflag(), _bt_lock_and_validate_left(), _bt_metaversion(), _bt_moveright(), _bt_newlevel(), _bt_pagedel(), _bt_readnextpage(), _bt_rightsib_halfdeadflag(), _bt_set_cleanup_info(), _bt_split(), _bt_unlink_halfdead_page(), and _bt_vacuum_needs_cleanup().
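
A hedged example of the basic pin/lock/release discipline around _bt_getbuf() (the block number and the inspection are illustrative):

/*
 * Hedged sketch: read-lock an existing page, look at it, release it.
 */
#include "postgres.h"
#include "access/nbtree.h"

static bool
sketch_block_is_leaf(Relation rel, BlockNumber blkno)
{
	Buffer		buf = _bt_getbuf(rel, blkno, BT_READ);	/* pin + read lock */
	Page		page = BufferGetPage(buf);
	BTPageOpaque opaque = BTPageGetOpaque(page);
	bool		isleaf = P_ISLEAF(opaque);

	_bt_relbuf(rel, buf);		/* unlock and unpin */
	return isleaf;
}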

◆ _bt_getroot()

Buffer _bt_getroot ( Relation  rel,
Relation  heaprel,
int  access 
)

Definition at line 344 of file nbtpage.c.

345 {
346  Buffer metabuf;
347  Buffer rootbuf;
348  Page rootpage;
349  BTPageOpaque rootopaque;
350  BlockNumber rootblkno;
351  uint32 rootlevel;
352  BTMetaPageData *metad;
353 
354  Assert(access == BT_READ || heaprel != NULL);
355 
356  /*
357  * Try to use previously-cached metapage data to find the root. This
358  * normally saves one buffer access per index search, which is a very
359  * helpful savings in bufmgr traffic and hence contention.
360  */
361  if (rel->rd_amcache != NULL)
362  {
363  metad = (BTMetaPageData *) rel->rd_amcache;
364  /* We shouldn't have cached it if any of these fail */
365  Assert(metad->btm_magic == BTREE_MAGIC);
366  Assert(metad->btm_version >= BTREE_MIN_VERSION);
367  Assert(metad->btm_version <= BTREE_VERSION);
368  Assert(!metad->btm_allequalimage ||
369  metad->btm_version > BTREE_NOVAC_VERSION);
370  Assert(metad->btm_root != P_NONE);
371 
372  rootblkno = metad->btm_fastroot;
373  Assert(rootblkno != P_NONE);
374  rootlevel = metad->btm_fastlevel;
375 
376  rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
377  rootpage = BufferGetPage(rootbuf);
378  rootopaque = BTPageGetOpaque(rootpage);
379 
380  /*
381  * Since the cache might be stale, we check the page more carefully
382  * here than normal. We *must* check that it's not deleted. If it's
383  * not alone on its level, then we reject too --- this may be overly
384  * paranoid but better safe than sorry. Note we don't check P_ISROOT,
385  * because that's not set in a "fast root".
386  */
387  if (!P_IGNORE(rootopaque) &&
388  rootopaque->btpo_level == rootlevel &&
389  P_LEFTMOST(rootopaque) &&
390  P_RIGHTMOST(rootopaque))
391  {
392  /* OK, accept cached page as the root */
393  return rootbuf;
394  }
395  _bt_relbuf(rel, rootbuf);
396  /* Cache is stale, throw it away */
397  if (rel->rd_amcache)
398  pfree(rel->rd_amcache);
399  rel->rd_amcache = NULL;
400  }
401 
402  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
403  metad = _bt_getmeta(rel, metabuf);
404 
405  /* if no root page initialized yet, do it */
406  if (metad->btm_root == P_NONE)
407  {
408  Page metapg;
409 
410  /* If access = BT_READ, caller doesn't want us to create root yet */
411  if (access == BT_READ)
412  {
413  _bt_relbuf(rel, metabuf);
414  return InvalidBuffer;
415  }
416 
417  /* trade in our read lock for a write lock */
418  _bt_unlockbuf(rel, metabuf);
419  _bt_lockbuf(rel, metabuf, BT_WRITE);
420 
421  /*
422  * Race condition: if someone else initialized the metadata between
423  * the time we released the read lock and acquired the write lock, we
424  * must avoid doing it again.
425  */
426  if (metad->btm_root != P_NONE)
427  {
428  /*
429  * Metadata initialized by someone else. In order to guarantee no
430  * deadlocks, we have to release the metadata page and start all
431  * over again. (Is that really true? But it's hardly worth trying
432  * to optimize this case.)
433  */
434  _bt_relbuf(rel, metabuf);
435  return _bt_getroot(rel, heaprel, access);
436  }
437 
438  /*
439  * Get, initialize, write, and leave a lock of the appropriate type on
440  * the new root page. Since this is the first page in the tree, it's
441  * a leaf as well as the root.
442  */
443  rootbuf = _bt_allocbuf(rel, heaprel);
444  rootblkno = BufferGetBlockNumber(rootbuf);
445  rootpage = BufferGetPage(rootbuf);
446  rootopaque = BTPageGetOpaque(rootpage);
447  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
448  rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
449  rootopaque->btpo_level = 0;
450  rootopaque->btpo_cycleid = 0;
451  /* Get raw page pointer for metapage */
452  metapg = BufferGetPage(metabuf);
453 
454  /* NO ELOG(ERROR) till meta is updated */
455  START_CRIT_SECTION();
456 
457  /* upgrade metapage if needed */
458  if (metad->btm_version < BTREE_NOVAC_VERSION)
459  _bt_upgrademetapage(metapg);
460 
461  metad->btm_root = rootblkno;
462  metad->btm_level = 0;
463  metad->btm_fastroot = rootblkno;
464  metad->btm_fastlevel = 0;
465  metad->btm_last_cleanup_num_delpages = 0;
466  metad->btm_last_cleanup_num_heap_tuples = -1.0;
467 
468  MarkBufferDirty(rootbuf);
469  MarkBufferDirty(metabuf);
470 
471  /* XLOG stuff */
472  if (RelationNeedsWAL(rel))
473  {
474  xl_btree_newroot xlrec;
475  XLogRecPtr recptr;
476  xl_btree_metadata md;
477 
478  XLogBeginInsert();
479  XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
480  XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
481 
482  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
483  md.version = metad->btm_version;
484  md.root = rootblkno;
485  md.level = 0;
486  md.fastroot = rootblkno;
487  md.fastlevel = 0;
488  md.last_cleanup_num_delpages = 0;
489  md.allequalimage = metad->btm_allequalimage;
490 
491  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
492 
493  xlrec.rootblk = rootblkno;
494  xlrec.level = 0;
495 
496  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
497 
498  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
499 
500  PageSetLSN(rootpage, recptr);
501  PageSetLSN(metapg, recptr);
502  }
503 
504  END_CRIT_SECTION();
505 
506  /*
507  * swap root write lock for read lock. There is no danger of anyone
508  * else accessing the new root page while it's unlocked, since no one
509  * else knows where it is yet.
510  */
511  _bt_unlockbuf(rel, rootbuf);
512  _bt_lockbuf(rel, rootbuf, BT_READ);
513 
514  /* okay, metadata is correct, release lock on it without caching */
515  _bt_relbuf(rel, metabuf);
516  }
517  else
518  {
519  rootblkno = metad->btm_fastroot;
520  Assert(rootblkno != P_NONE);
521  rootlevel = metad->btm_fastlevel;
522 
523  /*
524  * Cache the metapage data for next time
525  */
526  rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
527  sizeof(BTMetaPageData));
528  memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
529 
530  /*
531  * We are done with the metapage; arrange to release it via first
532  * _bt_relandgetbuf call
533  */
534  rootbuf = metabuf;
535 
536  for (;;)
537  {
538  rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
539  rootpage = BufferGetPage(rootbuf);
540  rootopaque = BTPageGetOpaque(rootpage);
541 
542  if (!P_IGNORE(rootopaque))
543  break;
544 
545  /* it's dead, Jim. step right one page */
546  if (P_RIGHTMOST(rootopaque))
547  elog(ERROR, "no live root page found in index \"%s\"",
548  RelationGetRelationName(rel));
549  rootblkno = rootopaque->btpo_next;
550  }
551 
552  if (rootopaque->btpo_level != rootlevel)
553  elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
554  rootblkno, RelationGetRelationName(rel),
555  rootopaque->btpo_level, rootlevel);
556  }
557 
558  /*
559  * By here, we have a pin and read lock on the root page, and no lock set
560  * on the metadata page. Return the root page's buffer.
561  */
562  return rootbuf;
563 }
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1181
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:107
Buffer _bt_allocbuf(Relation rel, Relation heaprel)
Definition: nbtpage.c:869
static BTMetaPageData * _bt_getmeta(Relation rel, Buffer metabuf)
Definition: nbtpage.c:142
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1070
#define BTREE_MIN_VERSION
Definition: nbtree.h:151
#define BTP_LEAF
Definition: nbtree.h:76
#define BTREE_MAGIC
Definition: nbtree.h:149
#define BTP_ROOT
Definition: nbtree.h:77
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:347
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:37
uint32 btm_last_cleanup_num_delpages
Definition: nbtree.h:114
uint32 btm_level
Definition: nbtree.h:108
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:116
BlockNumber btm_fastroot
Definition: nbtree.h:109
uint32 btm_version
Definition: nbtree.h:106
uint32 btm_magic
Definition: nbtree.h:105
bool btm_allequalimage
Definition: nbtree.h:118
uint32 btm_fastlevel
Definition: nbtree.h:110
BlockNumber btpo_prev
Definition: nbtree.h:64
void * rd_amcache
Definition: rel.h:229
MemoryContext rd_indexcxt
Definition: rel.h:204
uint32 level
Definition: nbtxlog.h:50
uint32 version
Definition: nbtxlog.h:48
bool allequalimage
Definition: nbtxlog.h:54
BlockNumber fastroot
Definition: nbtxlog.h:51
uint32 fastlevel
Definition: nbtxlog.h:52
BlockNumber root
Definition: nbtxlog.h:49
uint32 last_cleanup_num_delpages
Definition: nbtxlog.h:53
uint32 level
Definition: nbtxlog.h:344
BlockNumber rootblk
Definition: nbtxlog.h:343
#define REGBUF_WILL_INIT
Definition: xloginsert.h:33

References _bt_allocbuf(), _bt_getbuf(), _bt_getmeta(), _bt_lockbuf(), _bt_relandgetbuf(), _bt_relbuf(), _bt_unlockbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert, BT_READ, BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_LEAF, BTP_ROOT, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, BufferGetBlockNumber(), BufferGetPage(), elog, END_CRIT_SECTION, ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, InvalidBuffer, xl_btree_metadata::last_cleanup_num_delpages, xl_btree_metadata::level, xl_btree_newroot::level, MarkBufferDirty(), MemoryContextAlloc(), P_IGNORE, P_LEFTMOST, P_NONE, P_RIGHTMOST, PageSetLSN(), pfree(), RelationData::rd_amcache, RelationData::rd_indexcxt, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, xl_btree_metadata::root, xl_btree_newroot::rootblk, SizeOfBtreeNewroot, START_CRIT_SECTION, xl_btree_metadata::version, XLOG_BTREE_NEWROOT, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_get_endpoint(), and _bt_search().
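
A hedged sketch of the caller side: ask for the (fast) root with the lock strength the operation needs, and be prepared for InvalidBuffer when access is BT_READ and no root exists yet:

/*
 * Hedged sketch of the contract around _bt_getroot()'s return value;
 * _bt_search() is the real read-side caller.
 */
#include "postgres.h"
#include "access/nbtree.h"

static void
sketch_start_descent(Relation rel)
{
	Buffer		rootbuf = _bt_getroot(rel, NULL, BT_READ);

	if (!BufferIsValid(rootbuf))
		return;					/* empty index: no root page yet */

	/* rootbuf is pinned and read-locked; descend from here ... */

	_bt_relbuf(rel, rootbuf);
}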

◆ _bt_getrootheight()

int _bt_getrootheight ( Relation  rel)

Definition at line 675 of file nbtpage.c.

676 {
677  BTMetaPageData *metad;
678 
679  if (rel->rd_amcache == NULL)
680  {
681  Buffer metabuf;
682 
683  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
684  metad = _bt_getmeta(rel, metabuf);
685 
686  /*
687  * If there's no root page yet, _bt_getroot() doesn't expect a cache
688  * to be made, so just stop here and report the index height is zero.
689  * (XXX perhaps _bt_getroot() should be changed to allow this case.)
690  */
691  if (metad->btm_root == P_NONE)
692  {
693  _bt_relbuf(rel, metabuf);
694  return 0;
695  }
696 
697  /*
698  * Cache the metapage data for next time
699  */
700  rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
701  sizeof(BTMetaPageData));
702  memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
703  _bt_relbuf(rel, metabuf);
704  }
705 
706  /* Get cached page */
707  metad = (BTMetaPageData *) rel->rd_amcache;
708  /* We shouldn't have cached it if any of these fail */
709  Assert(metad->btm_magic == BTREE_MAGIC);
710  Assert(metad->btm_version >= BTREE_MIN_VERSION);
711  Assert(metad->btm_version <= BTREE_VERSION);
712  Assert(!metad->btm_allequalimage ||
713  metad->btm_version > BTREE_NOVAC_VERSION);
714  Assert(metad->btm_fastroot != P_NONE);
715 
716  return metad->btm_fastlevel;
717 }

References _bt_getbuf(), _bt_getmeta(), _bt_relbuf(), Assert, BT_READ, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, MemoryContextAlloc(), P_NONE, RelationData::rd_amcache, and RelationData::rd_indexcxt.

Referenced by _bt_insertonpg(), and btgettreeheight().
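
A hedged example: after the first call the height comes from the metapage copy cached in rd_amcache, so it is cheap enough to consult when deciding on an optimization (the threshold below is purely illustrative):

/*
 * Hedged sketch: consult the cached tree height.  Returns 0 for an index
 * whose root page has not been created yet.
 */
#include "postgres.h"
#include "access/nbtree.h"

static bool
sketch_tree_is_tall(Relation rel)
{
	int			height = _bt_getrootheight(rel);

	return height >= 2;			/* illustrative cutoff, not from the source */
}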

◆ _bt_getstackbuf()

Buffer _bt_getstackbuf ( Relation  rel,
Relation  heaprel,
BTStack  stack,
BlockNumber  child 
)

Definition at line 2319 of file nbtinsert.c.

2320 {
2321  BlockNumber blkno;
 2322  OffsetNumber start;
 2323 
2324  blkno = stack->bts_blkno;
2325  start = stack->bts_offset;
2326 
2327  for (;;)
2328  {
2329  Buffer buf;
2330  Page page;
2331  BTPageOpaque opaque;
2332 
2333  buf = _bt_getbuf(rel, blkno, BT_WRITE);
2334  page = BufferGetPage(buf);
2335  opaque = BTPageGetOpaque(page);
2336 
2337  Assert(heaprel != NULL);
2338  if (P_INCOMPLETE_SPLIT(opaque))
2339  {
2340  _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
2341  continue;
2342  }
2343 
2344  if (!P_IGNORE(opaque))
2345  {
2346  OffsetNumber offnum,
2347  minoff,
2348  maxoff;
2349  ItemId itemid;
2350  IndexTuple item;
2351 
2352  minoff = P_FIRSTDATAKEY(opaque);
2353  maxoff = PageGetMaxOffsetNumber(page);
2354 
2355  /*
2356  * start = InvalidOffsetNumber means "search the whole page". We
2357  * need this test anyway due to possibility that page has a high
2358  * key now when it didn't before.
2359  */
2360  if (start < minoff)
2361  start = minoff;
2362 
2363  /*
2364  * Need this check too, to guard against possibility that page
2365  * split since we visited it originally.
2366  */
2367  if (start > maxoff)
2368  start = OffsetNumberNext(maxoff);
2369 
2370  /*
2371  * These loops will check every item on the page --- but in an
2372  * order that's attuned to the probability of where it actually
2373  * is. Scan to the right first, then to the left.
2374  */
2375  for (offnum = start;
2376  offnum <= maxoff;
2377  offnum = OffsetNumberNext(offnum))
2378  {
2379  itemid = PageGetItemId(page, offnum);
2380  item = (IndexTuple) PageGetItem(page, itemid);
2381 
2382  if (BTreeTupleGetDownLink(item) == child)
2383  {
2384  /* Return accurate pointer to where link is now */
2385  stack->bts_blkno = blkno;
2386  stack->bts_offset = offnum;
2387  return buf;
2388  }
2389  }
2390 
2391  for (offnum = OffsetNumberPrev(start);
2392  offnum >= minoff;
2393  offnum = OffsetNumberPrev(offnum))
2394  {
2395  itemid = PageGetItemId(page, offnum);
2396  item = (IndexTuple) PageGetItem(page, itemid);
2397 
2398  if (BTreeTupleGetDownLink(item) == child)
2399  {
2400  /* Return accurate pointer to where link is now */
2401  stack->bts_blkno = blkno;
2402  stack->bts_offset = offnum;
2403  return buf;
2404  }
2405  }
2406  }
2407 
2408  /*
2409  * The item we're looking for moved right at least one page.
2410  *
2411  * Lehman and Yao couple/chain locks when moving right here, which we
2412  * can avoid. See nbtree/README.
2413  */
2414  if (P_RIGHTMOST(opaque))
2415  {
2416  _bt_relbuf(rel, buf);
2417  return InvalidBuffer;
2418  }
2419  blkno = opaque->btpo_next;
 2420  start = InvalidOffsetNumber;
 2421  _bt_relbuf(rel, buf);
2422  }
2423 }
void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:2241
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
BlockNumber bts_blkno
Definition: nbtree.h:734
OffsetNumber bts_offset
Definition: nbtree.h:735

References _bt_finish_split(), _bt_getbuf(), _bt_relbuf(), Assert, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, buf, BufferGetPage(), InvalidBuffer, InvalidOffsetNumber, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and start.

Referenced by _bt_insert_parent(), and _bt_lock_subtree_parent().

◆ _bt_gettrueroot()

Buffer _bt_gettrueroot ( Relation  rel)

Definition at line 580 of file nbtpage.c.

581 {
582  Buffer metabuf;
583  Page metapg;
584  BTPageOpaque metaopaque;
585  Buffer rootbuf;
586  Page rootpage;
587  BTPageOpaque rootopaque;
588  BlockNumber rootblkno;
589  uint32 rootlevel;
590  BTMetaPageData *metad;
591 
592  /*
593  * We don't try to use cached metapage data here, since (a) this path is
594  * not performance-critical, and (b) if we are here it suggests our cache
595  * is out-of-date anyway. In light of point (b), it's probably safest to
596  * actively flush any cached metapage info.
597  */
598  if (rel->rd_amcache)
599  pfree(rel->rd_amcache);
600  rel->rd_amcache = NULL;
601 
602  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
603  metapg = BufferGetPage(metabuf);
604  metaopaque = BTPageGetOpaque(metapg);
605  metad = BTPageGetMeta(metapg);
606 
607  if (!P_ISMETA(metaopaque) ||
608  metad->btm_magic != BTREE_MAGIC)
609  ereport(ERROR,
610  (errcode(ERRCODE_INDEX_CORRUPTED),
611  errmsg("index \"%s\" is not a btree",
612  RelationGetRelationName(rel))));
613 
614  if (metad->btm_version < BTREE_MIN_VERSION ||
615  metad->btm_version > BTREE_VERSION)
616  ereport(ERROR,
617  (errcode(ERRCODE_INDEX_CORRUPTED),
618  errmsg("version mismatch in index \"%s\": file version %d, "
619  "current version %d, minimal supported version %d",
 620  RelationGetRelationName(rel),
 621  metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
 622 
623  /* if no root page initialized yet, fail */
624  if (metad->btm_root == P_NONE)
625  {
626  _bt_relbuf(rel, metabuf);
627  return InvalidBuffer;
628  }
629 
630  rootblkno = metad->btm_root;
631  rootlevel = metad->btm_level;
632 
633  /*
634  * We are done with the metapage; arrange to release it via first
635  * _bt_relandgetbuf call
636  */
637  rootbuf = metabuf;
638 
639  for (;;)
640  {
641  rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
642  rootpage = BufferGetPage(rootbuf);
643  rootopaque = BTPageGetOpaque(rootpage);
644 
645  if (!P_IGNORE(rootopaque))
646  break;
647 
648  /* it's dead, Jim. step right one page */
649  if (P_RIGHTMOST(rootopaque))
650  elog(ERROR, "no live root page found in index \"%s\"",
 651  RelationGetRelationName(rel));
 652  rootblkno = rootopaque->btpo_next;
653  }
654 
655  if (rootopaque->btpo_level != rootlevel)
656  elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
657  rootblkno, RelationGetRelationName(rel),
658  rootopaque->btpo_level, rootlevel);
659 
660  return rootbuf;
661 }
#define P_ISMETA(opaque)
Definition: nbtree.h:223

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_VERSION, BufferGetPage(), elog, ereport, errcode(), errmsg(), ERROR, InvalidBuffer, P_IGNORE, P_ISMETA, P_NONE, P_RIGHTMOST, pfree(), RelationData::rd_amcache, and RelationGetRelationName.

Referenced by _bt_get_endpoint().

◆ _bt_initmetapage()

void _bt_initmetapage ( Page  page,
BlockNumber  rootbknum,
uint32  level,
bool  allequalimage 
)

Definition at line 67 of file nbtpage.c.

69 {
70  BTMetaPageData *metad;
71  BTPageOpaque metaopaque;
72 
73  _bt_pageinit(page, BLCKSZ);
74 
75  metad = BTPageGetMeta(page);
76  metad->btm_magic = BTREE_MAGIC;
77  metad->btm_version = BTREE_VERSION;
78  metad->btm_root = rootbknum;
79  metad->btm_level = level;
80  metad->btm_fastroot = rootbknum;
81  metad->btm_fastlevel = level;
 82  metad->btm_last_cleanup_num_delpages = 0;
 83  metad->btm_last_cleanup_num_heap_tuples = -1.0;
 84  metad->btm_allequalimage = allequalimage;
85 
86  metaopaque = BTPageGetOpaque(page);
87  metaopaque->btpo_flags = BTP_META;
88 
89  /*
90  * Set pd_lower just past the end of the metadata. This is essential,
91  * because without doing so, metadata will be lost if xlog.c compresses
92  * the page.
93  */
94  ((PageHeader) page)->pd_lower =
95  ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
96 }
PageHeaderData * PageHeader
Definition: bufpage.h:173
#define BTP_META
Definition: nbtree.h:79

References _bt_pageinit(), BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_META, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, BTREE_MAGIC, and BTREE_VERSION.

Referenced by _bt_uppershutdown(), and btbuildempty().
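
A hedged sketch of initializing a metapage in local memory; the helper name and the choice of a palloc'd scratch page (rather than a shared buffer, as btbuildempty() uses) are illustrative assumptions.

#include "postgres.h"
#include "access/nbtree.h"

/* Hypothetical helper: build an empty btree metapage in a scratch buffer */
static Page
make_empty_btree_metapage(bool allequalimage)
{
    Page        metapage = (Page) palloc(BLCKSZ);

    /* No root exists yet, so both the root block and its level are unset */
    _bt_initmetapage(metapage, P_NONE, 0, allequalimage);

    return metapage;
}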

◆ _bt_keep_natts_fast()

int _bt_keep_natts_fast ( Relation  rel,
IndexTuple  lastleft,
IndexTuple  firstright 
)

Definition at line 4877 of file nbtutils.c.

4878 {
4879  TupleDesc itupdesc = RelationGetDescr(rel);
4880  int keysz = IndexRelationGetNumberOfKeyAttributes(rel);
4881  int keepnatts;
4882 
4883  keepnatts = 1;
4884  for (int attnum = 1; attnum <= keysz; attnum++)
4885  {
4886  Datum datum1,
4887  datum2;
4888  bool isNull1,
4889  isNull2;
4890  Form_pg_attribute att;
4891 
4892  datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
4893  datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
4894  att = TupleDescAttr(itupdesc, attnum - 1);
4895 
4896  if (isNull1 != isNull2)
4897  break;
4898 
4899  if (!isNull1 &&
4900  !datum_image_eq(datum1, datum2, att->attbyval, att->attlen))
4901  break;
4902 
4903  keepnatts++;
4904  }
4905 
4906  return keepnatts;
4907 }
bool datum_image_eq(Datum value1, Datum value2, bool typByVal, int typLen)
Definition: datum.c:266
int16 attnum
Definition: pg_attribute.h:74
FormData_pg_attribute * Form_pg_attribute
Definition: pg_attribute.h:209
#define TupleDescAttr(tupdesc, i)
Definition: tupdesc.h:92

References attnum, datum_image_eq(), index_getattr(), IndexRelationGetNumberOfKeyAttributes, RelationGetDescr, and TupleDescAttr.

Referenced by _bt_afternewitemoff(), _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_do_singleval(), _bt_keep_natts(), _bt_load(), _bt_split_penalty(), and _bt_strategy().
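
Illustrative only: the wrapper below (a hypothetical name) shows how the return value is typically interpreted, e.g. by _bt_do_singleval() and the deduplication code, which treat "keepnatts greater than the number of key attributes" as "the two tuples are duplicates apart from their heap TIDs".

#include "postgres.h"
#include "access/nbtree.h"
#include "utils/rel.h"

/*
 * Hypothetical wrapper: do lastleft and firstright agree on every key
 * attribute (by datum_image_eq() comparison)?
 */
static bool
tuples_share_all_keys(Relation rel, IndexTuple lastleft, IndexTuple firstright)
{
    int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);

    /* _bt_keep_natts_fast() returns one more than the equal-attribute count */
    return _bt_keep_natts_fast(rel, lastleft, firstright) > nkeyatts;
}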

◆ _bt_killitems()

void _bt_killitems ( IndexScanDesc  scan)

Definition at line 4178 of file nbtutils.c.

4179 {
4180  BTScanOpaque so = (BTScanOpaque) scan->opaque;
4181  Page page;
4182  BTPageOpaque opaque;
4183  OffsetNumber minoff;
4184  OffsetNumber maxoff;
4185  int i;
4186  int numKilled = so->numKilled;
4187  bool killedsomething = false;
4188  bool droppedpin PG_USED_FOR_ASSERTS_ONLY;
4189 
 4190  Assert(BTScanPosIsValid(so->currPos));
 4191 
4192  /*
4193  * Always reset the scan state, so we don't look for same items on other
4194  * pages.
4195  */
4196  so->numKilled = 0;
4197 
4198  if (BTScanPosIsPinned(so->currPos))
4199  {
4200  /*
4201  * We have held the pin on this page since we read the index tuples,
4202  * so all we need to do is lock it. The pin will have prevented
4203  * re-use of any TID on the page, so there is no need to check the
4204  * LSN.
4205  */
4206  droppedpin = false;
 4207  _bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
 4208 
4209  page = BufferGetPage(so->currPos.buf);
4210  }
4211  else
4212  {
4213  Buffer buf;
4214 
4215  droppedpin = true;
4216  /* Attempt to re-read the buffer, getting pin and lock. */
 4217  buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ);
 4218 
4219  page = BufferGetPage(buf);
4220  if (BufferGetLSNAtomic(buf) == so->currPos.lsn)
4221  so->currPos.buf = buf;
4222  else
4223  {
4224  /* Modified while not pinned means hinting is not safe. */
4225  _bt_relbuf(scan->indexRelation, buf);
4226  return;
4227  }
4228  }
4229 
4230  opaque = BTPageGetOpaque(page);
4231  minoff = P_FIRSTDATAKEY(opaque);
4232  maxoff = PageGetMaxOffsetNumber(page);
4233 
4234  for (i = 0; i < numKilled; i++)
4235  {
4236  int itemIndex = so->killedItems[i];
4237  BTScanPosItem *kitem = &so->currPos.items[itemIndex];
4238  OffsetNumber offnum = kitem->indexOffset;
4239 
4240  Assert(itemIndex >= so->currPos.firstItem &&
4241  itemIndex <= so->currPos.lastItem);
4242  if (offnum < minoff)
4243  continue; /* pure paranoia */
4244  while (offnum <= maxoff)
4245  {
4246  ItemId iid = PageGetItemId(page, offnum);
4247  IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
4248  bool killtuple = false;
4249 
4250  if (BTreeTupleIsPosting(ituple))
4251  {
4252  int pi = i + 1;
4253  int nposting = BTreeTupleGetNPosting(ituple);
4254  int j;
4255 
4256  /*
4257  * We rely on the convention that heap TIDs in the scanpos
4258  * items array are stored in ascending heap TID order for a
4259  * group of TIDs that originally came from a posting list
4260  * tuple. This convention even applies during backwards
4261  * scans, where returning the TIDs in descending order might
4262  * seem more natural. This is about effectiveness, not
4263  * correctness.
4264  *
4265  * Note that the page may have been modified in almost any way
4266  * since we first read it (in the !droppedpin case), so it's
4267  * possible that this posting list tuple wasn't a posting list
4268  * tuple when we first encountered its heap TIDs.
4269  */
4270  for (j = 0; j < nposting; j++)
4271  {
4272  ItemPointer item = BTreeTupleGetPostingN(ituple, j);
4273 
4274  if (!ItemPointerEquals(item, &kitem->heapTid))
4275  break; /* out of posting list loop */
4276 
4277  /*
4278  * kitem must have matching offnum when heap TIDs match,
4279  * though only in the common case where the page can't
4280  * have been concurrently modified
4281  */
4282  Assert(kitem->indexOffset == offnum || !droppedpin);
4283 
4284  /*
4285  * Read-ahead to later kitems here.
4286  *
4287  * We rely on the assumption that not advancing kitem here
4288  * will prevent us from considering the posting list tuple
4289  * fully dead by not matching its next heap TID in next
4290  * loop iteration.
4291  *
4292  * If, on the other hand, this is the final heap TID in
4293  * the posting list tuple, then tuple gets killed
4294  * regardless (i.e. we handle the case where the last
4295  * kitem is also the last heap TID in the last index tuple
4296  * correctly -- posting tuple still gets killed).
4297  */
4298  if (pi < numKilled)
4299  kitem = &so->currPos.items[so->killedItems[pi++]];
4300  }
4301 
4302  /*
4303  * Don't bother advancing the outermost loop's int iterator to
4304  * avoid processing killed items that relate to the same
4305  * offnum/posting list tuple. This micro-optimization hardly
4306  * seems worth it. (Further iterations of the outermost loop
4307  * will fail to match on this same posting list's first heap
4308  * TID instead, so we'll advance to the next offnum/index
4309  * tuple pretty quickly.)
4310  */
4311  if (j == nposting)
4312  killtuple = true;
4313  }
4314  else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
4315  killtuple = true;
4316 
4317  /*
4318  * Mark index item as dead, if it isn't already. Since this
4319  * happens while holding a buffer lock possibly in shared mode,
4320  * it's possible that multiple processes attempt to do this
4321  * simultaneously, leading to multiple full-page images being sent
4322  * to WAL (if wal_log_hints or data checksums are enabled), which
4323  * is undesirable.
4324  */
4325  if (killtuple && !ItemIdIsDead(iid))
4326  {
4327  /* found the item/all posting list items */
4328  ItemIdMarkDead(iid);
4329  killedsomething = true;
4330  break; /* out of inner search loop */
4331  }
4332  offnum = OffsetNumberNext(offnum);
4333  }
4334  }
4335 
4336  /*
4337  * Since this can be redone later if needed, mark as dirty hint.
4338  *
4339  * Whenever we mark anything LP_DEAD, we also set the page's
4340  * BTP_HAS_GARBAGE flag, which is likewise just a hint. (Note that we
4341  * only rely on the page-level flag in !heapkeyspace indexes.)
4342  */
4343  if (killedsomething)
4344  {
4345  opaque->btpo_flags |= BTP_HAS_GARBAGE;
4346  MarkBufferDirtyHint(so->currPos.buf, true);
4347  }
4348 
4349  _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
4350 }
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:3985
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:4988
int j
Definition: isn.c:73
#define ItemIdMarkDead(itemId)
Definition: itemid.h:179
int * killedItems
Definition: nbtree.h:1048
BlockNumber currPage
Definition: nbtree.h:956
int firstItem
Definition: nbtree.h:984
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:988
XLogRecPtr lsn
Definition: nbtree.h:959
ItemPointerData heapTid
Definition: nbtree.h:946
OffsetNumber indexOffset
Definition: nbtree.h:947

References _bt_getbuf(), _bt_lockbuf(), _bt_relbuf(), _bt_unlockbuf(), Assert, BT_READ, BTP_HAS_GARBAGE, BTPageGetOpaque, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), BTScanPosIsPinned, BTScanPosIsValid, buf, BTScanPosData::buf, BufferGetLSNAtomic(), BufferGetPage(), BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanPosData::firstItem, BTScanPosItem::heapTid, i, BTScanPosItem::indexOffset, IndexScanDescData::indexRelation, ItemIdIsDead, ItemIdMarkDead, ItemPointerEquals(), BTScanPosData::items, j, BTScanOpaqueData::killedItems, BTScanPosData::lsn, MarkBufferDirtyHint(), BTScanOpaqueData::numKilled, OffsetNumberNext, IndexScanDescData::opaque, P_FIRSTDATAKEY, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), PG_USED_FOR_ASSERTS_ONLY, and IndexTupleData::t_tid.

Referenced by _bt_steppage(), btendscan(), btrescan(), and btrestrpos().
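
A hedged sketch of how a caller accumulates work for _bt_killitems(), loosely following btgettuple(): when the table AM reports that the previously-returned tuple is dead, its position within so->currPos is remembered, and the LP_DEAD hints are then set in bulk before the scan moves off the page.

#include "postgres.h"
#include "access/nbtree.h"
#include "access/relscan.h"

/* Hypothetical helper: remember that the current item can be marked dead */
static void
remember_dead_item(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    if (so->killedItems == NULL)
        so->killedItems = (int *) palloc(MaxTIDsPerBTreePage * sizeof(int));
    if (so->numKilled < MaxTIDsPerBTreePage)
        so->killedItems[so->numKilled++] = so->currPos.itemIndex;
}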

◆ _bt_lockbuf()

void _bt_lockbuf ( Relation  rel,
Buffer  buf,
int  access 
)

Definition at line 1039 of file nbtpage.c.

1040 {
1041  /* LockBuffer() asserts that pin is held by this backend */
1042  LockBuffer(buf, access);
1043 
1044  /*
1045  * It doesn't matter that _bt_unlockbuf() won't get called in the event of
1046  * an nbtree error (e.g. a unique violation error). That won't cause
1047  * Valgrind false positives.
1048  *
1049  * The nbtree client requests are superimposed on top of the bufmgr.c
1050  * buffer pin client requests. In the event of an nbtree error the buffer
1051  * will certainly get marked as defined when the backend once again
1052  * acquires its first pin on the buffer. (Of course, if the backend never
1053  * touches the buffer again then it doesn't matter that it remains
1054  * non-accessible to Valgrind.)
1055  *
1056  * Note: When an IndexTuple C pointer gets computed using an ItemId read
1057  * from a page while a lock was held, the C pointer becomes unsafe to
1058  * dereference forever as soon as the lock is released. Valgrind can only
1059  * detect cases where the pointer gets dereferenced with no _current_
1060  * lock/pin held, though.
1061  */
1062  if (!RelationUsesLocalBuffers(rel))
 1063  VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
 1064 }
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5158

References buf, BufferGetPage(), LockBuffer(), RelationUsesLocalBuffers, and VALGRIND_MAKE_MEM_DEFINED.

Referenced by _bt_getbuf(), _bt_getroot(), _bt_killitems(), _bt_moveright(), _bt_pagedel(), _bt_relandgetbuf(), _bt_search(), _bt_set_cleanup_info(), _bt_unlink_halfdead_page(), and btvacuumpage().
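
A minimal sketch of the pin-then-lock pattern used by callers such as _bt_killitems(): the buffer is assumed to be already pinned, and only the content lock is taken and released here. The helper name is hypothetical.

#include "postgres.h"
#include "access/nbtree.h"

/* Hypothetical helper: read the right-sibling link of an already-pinned page */
static BlockNumber
peek_right_sibling(Relation rel, Buffer buf)
{
    Page        page;
    BTPageOpaque opaque;
    BlockNumber next;

    _bt_lockbuf(rel, buf, BT_READ);     /* caller already holds a pin */
    page = BufferGetPage(buf);
    opaque = BTPageGetOpaque(page);
    next = opaque->btpo_next;
    _bt_unlockbuf(rel, buf);            /* drop the lock, keep the pin */

    return next;
}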

◆ _bt_metaversion()

void _bt_metaversion ( Relation  rel,
bool heapkeyspace,
bool allequalimage 
)

Definition at line 739 of file nbtpage.c.

740 {
741  BTMetaPageData *metad;
742 
743  if (rel->rd_amcache == NULL)
744  {
745  Buffer metabuf;
746 
747  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
748  metad = _bt_getmeta(rel, metabuf);
749 
750  /*
751  * If there's no root page yet, _bt_getroot() doesn't expect a cache
752  * to be made, so just stop here. (XXX perhaps _bt_getroot() should
753  * be changed to allow this case.)
754  */
755  if (metad->btm_root == P_NONE)
756  {
757  *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
758  *allequalimage = metad->btm_allequalimage;
759 
760  _bt_relbuf(rel, metabuf);
761  return;
762  }
763 
764  /*
765  * Cache the metapage data for next time
766  *
767  * An on-the-fly version upgrade performed by _bt_upgrademetapage()
768  * can change the nbtree version for an index without invalidating any
769  * local cache. This is okay because it can only happen when moving
770  * from version 2 to version 3, both of which are !heapkeyspace
771  * versions.
772  */
 773  rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
 774  sizeof(BTMetaPageData));
775  memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
776  _bt_relbuf(rel, metabuf);
777  }
778 
779  /* Get cached page */
780  metad = (BTMetaPageData *) rel->rd_amcache;
781  /* We shouldn't have cached it if any of these fail */
782  Assert(metad->btm_magic == BTREE_MAGIC);
 783  Assert(metad->btm_version >= BTREE_MIN_VERSION);
 784  Assert(metad->btm_version <= BTREE_VERSION);
785  Assert(!metad->btm_allequalimage ||
 786  metad->btm_version > BTREE_NOVAC_VERSION);
 787  Assert(metad->btm_fastroot != P_NONE);
788 
789  *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
790  *allequalimage = metad->btm_allequalimage;
791 }

References _bt_getbuf(), _bt_getmeta(), _bt_relbuf(), Assert, BT_READ, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, MemoryContextAlloc(), P_NONE, RelationData::rd_amcache, and RelationData::rd_indexcxt.

Referenced by _bt_first(), _bt_mkscankey(), and bt_index_check_internal().
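
A minimal sketch of the typical call, mirroring _bt_mkscankey() and _bt_first(): both output flags must always be supplied, even if the caller only needs one of them. The helper name is hypothetical.

#include "postgres.h"
#include "access/nbtree.h"

/* Hypothetical check: does this btree use the version-4 heap-TID key space? */
static bool
index_is_heapkeyspace(Relation indexRel)
{
    bool        heapkeyspace;
    bool        allequalimage;

    _bt_metaversion(indexRel, &heapkeyspace, &allequalimage);

    return heapkeyspace;
}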

◆ _bt_mkscankey()

BTScanInsert _bt_mkscankey ( Relation  rel,
IndexTuple  itup 
)

Definition at line 129 of file nbtutils.c.

130 {
 131  BTScanInsert key;
 132  ScanKey skey;
133  TupleDesc itupdesc;
134  int indnkeyatts;
135  int16 *indoption;
136  int tupnatts;
137  int i;
138 
139  itupdesc = RelationGetDescr(rel);
140  indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
141  indoption = rel->rd_indoption;
142  tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0;
143 
144  Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel));
145 
146  /*
147  * We'll execute search using scan key constructed on key columns.
148  * Truncated attributes and non-key attributes are omitted from the final
149  * scan key.
150  */
151  key = palloc(offsetof(BTScanInsertData, scankeys) +
152  sizeof(ScanKeyData) * indnkeyatts);
153  if (itup)
154  _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage);
155  else
156  {
157  /* Utility statement callers can set these fields themselves */
158  key->heapkeyspace = true;
159  key->allequalimage = false;
160  }
161  key->anynullkeys = false; /* initial assumption */
162  key->nextkey = false; /* usual case, required by btinsert */
163  key->backward = false; /* usual case, required by btinsert */
164  key->keysz = Min(indnkeyatts, tupnatts);
165  key->scantid = key->heapkeyspace && itup ?
166  BTreeTupleGetHeapTID(itup) : NULL;
167  skey = key->scankeys;
168  for (i = 0; i < indnkeyatts; i++)
169  {
170  FmgrInfo *procinfo;
171  Datum arg;
172  bool null;
173  int flags;
174 
175  /*
176  * We can use the cached (default) support procs since no cross-type
177  * comparison can be needed.
178  */
179  procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
180 
181  /*
182  * Key arguments built from truncated attributes (or when caller
183  * provides no tuple) are defensively represented as NULL values. They
184  * should never be used.
185  */
186  if (i < tupnatts)
187  arg = index_getattr(itup, i + 1, itupdesc, &null);
188  else
189  {
190  arg = (Datum) 0;
191  null = true;
192  }
193  flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
 194  ScanKeyEntryInitializeWithInfo(&skey[i],
 195  flags,
196  (AttrNumber) (i + 1),
 197  InvalidStrategy,
 198  InvalidOid,
199  rel->rd_indcollation[i],
200  procinfo,
201  arg);
202  /* Record if any key attribute is NULL (or truncated) */
203  if (null)
204  key->anynullkeys = true;
205  }
206 
207  /*
208  * In NULLS NOT DISTINCT mode, we pretend that there are no null keys, so
209  * that full uniqueness check is done.
210  */
211  if (rel->rd_index->indnullsnotdistinct)
212  key->anynullkeys = false;
213 
214  return key;
215 }
#define SK_BT_INDOPTION_SHIFT
Definition: nbtree.h:1116
int16 * rd_indoption
Definition: rel.h:211
Form_pg_index rd_index
Definition: rel.h:192

References _bt_metaversion(), arg, Assert, BTORDER_PROC, BTreeTupleGetHeapTID(), BTreeTupleGetNAtts, i, index_getattr(), index_getprocinfo(), IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, InvalidOid, InvalidStrategy, sort-test::key, Min, palloc(), RelationData::rd_indcollation, RelationData::rd_index, RelationData::rd_indoption, RelationGetDescr, ScanKeyEntryInitializeWithInfo(), SK_BT_INDOPTION_SHIFT, and SK_ISNULL.

Referenced by _bt_doinsert(), _bt_leafbuild(), _bt_pagedel(), bt_mkscankey_pivotsearch(), bt_rootdescend(), tuplesort_begin_cluster(), and tuplesort_begin_index_btree().
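
A hedged sketch (loosely modelled on amcheck's bt_rootdescend()) of pairing _bt_mkscankey() with _bt_search(): build an insertion scan key for an index tuple, descend to the leaf page that would contain it, then release everything. Passing NULL for heaprel is acceptable for a read-only descent, as _bt_pagedel() above also does.

#include "postgres.h"
#include "access/nbtree.h"

/* Hypothetical helper: descend to the leaf page where itup would belong */
static void
descend_to_leaf(Relation rel, IndexTuple itup)
{
    BTScanInsert key = _bt_mkscankey(rel, itup);
    Buffer      lbuf;
    BTStack     stack;

    stack = _bt_search(rel, NULL, key, &lbuf, BT_READ);

    _bt_relbuf(rel, lbuf);      /* release lock and pin on the leaf page */
    _bt_freestack(stack);
    pfree(key);
}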

◆ _bt_next()

bool _bt_next ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 1461 of file nbtsearch.c.

1462 {
1463  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1464 
 1465  Assert(BTScanPosIsValid(so->currPos));
 1466 
1467  /*
1468  * Advance to next tuple on current page; or if there's no more, try to
1469  * step to the next page with data.
1470  */
1471  if (ScanDirectionIsForward(dir))
1472  {
1473  if (++so->currPos.itemIndex > so->currPos.lastItem)
1474  {
1475  if (!_bt_steppage(scan, dir))
1476  return false;
1477  }
1478  }
1479  else
1480  {
1481  if (--so->currPos.itemIndex < so->currPos.firstItem)
1482  {
1483  if (!_bt_steppage(scan, dir))
1484  return false;
1485  }
1486  }
1487 
1488  _bt_returnitem(scan, so);
1489  return true;
1490 }
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2024
int lastItem
Definition: nbtree.h:985
int itemIndex
Definition: nbtree.h:986

References _bt_returnitem(), _bt_steppage(), Assert, BTScanPosIsValid, BTScanOpaqueData::currPos, BTScanPosData::firstItem, BTScanPosData::itemIndex, BTScanPosData::lastItem, IndexScanDescData::opaque, and ScanDirectionIsForward.

Referenced by btgetbitmap(), and btgettuple().
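
A hedged driver-loop sketch in the style of btgettuple()/btgetbitmap(): position the scan with _bt_first(), then keep calling _bt_next() until it reports that no further matching tuples exist in the given direction. The counting helper is hypothetical.

#include "postgres.h"
#include "access/nbtree.h"
#include "access/relscan.h"

/* Hypothetical helper: count all tuples the scan returns in one direction */
static int64
count_matches(IndexScanDesc scan, ScanDirection dir)
{
    int64       ntuples = 0;

    if (!_bt_first(scan, dir))
        return 0;               /* no match at all */

    do
    {
        ntuples++;
    } while (_bt_next(scan, dir));

    return ntuples;
}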

◆ _bt_oppodir_checkkeys()

bool _bt_oppodir_checkkeys ( IndexScanDesc  scan,
ScanDirection  dir,
IndexTuple  finaltup 
)

Definition at line 3627 of file nbtutils.c.

3629 {
3630  Relation rel = scan->indexRelation;
3631  TupleDesc tupdesc = RelationGetDescr(rel);
3632  BTScanOpaque so = (BTScanOpaque) scan->opaque;
3633  int nfinaltupatts = BTreeTupleGetNAtts(finaltup, rel);
3634  bool continuescan;
3635  ScanDirection flipped = -dir;
3636  int ikey = 0;
3637 
3638  Assert(so->numArrayKeys);
3639 
3640  _bt_check_compare(scan, flipped, finaltup, nfinaltupatts, tupdesc,
3641  false, false, false, &continuescan, &ikey);
3642 
3643  if (!continuescan && so->keyData[ikey].sk_strategy != BTEqualStrategyNumber)
3644  return false;
3645 
3646  return true;
3647 }

References _bt_check_compare(), Assert, BTEqualStrategyNumber, BTreeTupleGetNAtts, IndexScanDescData::indexRelation, BTScanOpaqueData::keyData, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, RelationGetDescr, and ScanKeyData::sk_strategy.

Referenced by _bt_advance_array_keys(), and _bt_readpage().

◆ _bt_pagedel()

void _bt_pagedel ( Relation  rel,
Buffer  leafbuf,
BTVacState vstate 
)

Definition at line 1802 of file nbtpage.c.

1803 {
1804  BlockNumber rightsib;
1805  bool rightsib_empty;
1806  Page page;
1807  BTPageOpaque opaque;
1808 
1809  /*
1810  * Save original leafbuf block number from caller. Only deleted blocks
1811  * that are <= scanblkno are added to bulk delete stat's pages_deleted
1812  * count.
1813  */
1814  BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
1815 
1816  /*
1817  * "stack" is a search stack leading (approximately) to the target page.
1818  * It is initially NULL, but when iterating, we keep it to avoid
1819  * duplicated search effort.
1820  *
1821  * Also, when "stack" is not NULL, we have already checked that the
1822  * current page is not the right half of an incomplete split, i.e. the
1823  * left sibling does not have its INCOMPLETE_SPLIT flag set, including
1824  * when the current target page is to the right of caller's initial page
1825  * (the scanblkno page).
1826  */
1827  BTStack stack = NULL;
1828 
1829  for (;;)
1830  {
1831  page = BufferGetPage(leafbuf);
1832  opaque = BTPageGetOpaque(page);
1833 
1834  /*
1835  * Internal pages are never deleted directly, only as part of deleting
1836  * the whole subtree all the way down to leaf level.
1837  *
1838  * Also check for deleted pages here. Caller never passes us a fully
1839  * deleted page. Only VACUUM can delete pages, so there can't have
1840  * been a concurrent deletion. Assume that we reached any deleted
1841  * page encountered here by following a sibling link, and that the
1842  * index is corrupt.
1843  */
1844  Assert(!P_ISDELETED(opaque));
1845  if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
1846  {
1847  /*
1848  * Pre-9.4 page deletion only marked internal pages as half-dead,
1849  * but now we only use that flag on leaf pages. The old algorithm
1850  * was never supposed to leave half-dead pages in the tree, it was
1851  * just a transient state, but it was nevertheless possible in
1852  * error scenarios. We don't know how to deal with them here. They
1853  * are harmless as far as searches are considered, but inserts
1854  * into the deleted keyspace could add out-of-order downlinks in
1855  * the upper levels. Log a notice, hopefully the admin will notice
1856  * and reindex.
1857  */
1858  if (P_ISHALFDEAD(opaque))
1859  ereport(LOG,
1860  (errcode(ERRCODE_INDEX_CORRUPTED),
1861  errmsg("index \"%s\" contains a half-dead internal page",
 1862  RelationGetRelationName(rel)),
 1863  errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
1864 
1865  if (P_ISDELETED(opaque))
1866  ereport(LOG,
1867  (errcode(ERRCODE_INDEX_CORRUPTED),
1868  errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
1869  BufferGetBlockNumber(leafbuf),
1870  scanblkno,
1871  RelationGetRelationName(rel))));
1872 
1873  _bt_relbuf(rel, leafbuf);
1874  return;
1875  }
1876 
1877  /*
1878  * We can never delete rightmost pages nor root pages. While at it,
1879  * check that page is empty, since it's possible that the leafbuf page
1880  * was empty a moment ago, but has since had some inserts.
1881  *
1882  * To keep the algorithm simple, we also never delete an incompletely
1883  * split page (they should be rare enough that this doesn't make any
1884  * meaningful difference to disk usage):
1885  *
1886  * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
1887  * left half of an incomplete split, but ensuring that it's not the
1888  * right half is more complicated. For that, we have to check that
1889  * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
1890  * _bt_leftsib_splitflag(). On the first iteration, we temporarily
1891  * release the lock on scanblkno/leafbuf, check the left sibling, and
1892  * construct a search stack to scanblkno. On subsequent iterations,
1893  * we know we stepped right from a page that passed these tests, so
1894  * it's OK.
1895  */
1896  if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
1897  P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
1898  P_INCOMPLETE_SPLIT(opaque))
1899  {
1900  /* Should never fail to delete a half-dead page */
1901  Assert(!P_ISHALFDEAD(opaque));
1902 
1903  _bt_relbuf(rel, leafbuf);
1904  return;
1905  }
1906 
1907  /*
1908  * First, remove downlink pointing to the page (or a parent of the
1909  * page, if we are going to delete a taller subtree), and mark the
1910  * leafbuf page half-dead
1911  */
1912  if (!P_ISHALFDEAD(opaque))
1913  {
1914  /*
1915  * We need an approximate pointer to the page's parent page. We
1916  * use a variant of the standard search mechanism to search for
1917  * the page's high key; this will give us a link to either the
1918  * current parent or someplace to its left (if there are multiple
1919  * equal high keys, which is possible with !heapkeyspace indexes).
1920  *
1921  * Also check if this is the right-half of an incomplete split
1922  * (see comment above).
1923  */
1924  if (!stack)
1925  {
1926  BTScanInsert itup_key;
1927  ItemId itemid;
1928  IndexTuple targetkey;
1929  BlockNumber leftsib,
1930  leafblkno;
1931  Buffer sleafbuf;
1932 
1933  itemid = PageGetItemId(page, P_HIKEY);
1934  targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
1935 
1936  leftsib = opaque->btpo_prev;
1937  leafblkno = BufferGetBlockNumber(leafbuf);
1938 
1939  /*
1940  * To avoid deadlocks, we'd better drop the leaf page lock
1941  * before going further.
1942  */
1943  _bt_unlockbuf(rel, leafbuf);
1944 
1945  /*
1946  * Check that the left sibling of leafbuf (if any) is not
1947  * marked with INCOMPLETE_SPLIT flag before proceeding
1948  */
1949  Assert(leafblkno == scanblkno);
1950  if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
1951  {
1952  ReleaseBuffer(leafbuf);
1953  return;
1954  }
1955 
1956  /*
1957  * We need an insertion scan key, so build one.
1958  *
1959  * _bt_search searches for the leaf page that contains any
1960  * matching non-pivot tuples, but we need it to "search" for
1961  * the high key pivot from the page that we're set to delete.
1962  * Compensate for the mismatch by having _bt_search locate the
1963  * last position < equal-to-untruncated-prefix non-pivots.
1964  */
1965  itup_key = _bt_mkscankey(rel, targetkey);
1966 
1967  /* Set up a BTLessStrategyNumber-like insertion scan key */
1968  itup_key->nextkey = false;
1969  itup_key->backward = true;
1970  stack = _bt_search(rel, NULL, itup_key, &sleafbuf, BT_READ);
1971  /* won't need a second lock or pin on leafbuf */
1972  _bt_relbuf(rel, sleafbuf);
1973 
1974  /*
1975  * Re-lock the leaf page, and start over to use our stack
1976  * within _bt_mark_page_halfdead. We must do it that way
1977  * because it's possible that leafbuf can no longer be
1978  * deleted. We need to recheck.
1979  *
1980  * Note: We can't simply hold on to the sleafbuf lock instead,
1981  * because it's barely possible that sleafbuf is not the same
1982  * page as leafbuf. This happens when leafbuf split after our
1983  * original lock was dropped, but before _bt_search finished
1984  * its descent. We rely on the assumption that we'll find
1985  * leafbuf isn't safe to delete anymore in this scenario.
1986  * (Page deletion can cope with the stack being to the left of
1987  * leafbuf, but not to the right of leafbuf.)
1988  */
1989  _bt_lockbuf(rel, leafbuf, BT_WRITE);
1990  continue;
1991  }
1992 
1993  /*
1994  * See if it's safe to delete the leaf page, and determine how
1995  * many parent/internal pages above the leaf level will be
1996  * deleted. If it's safe then _bt_mark_page_halfdead will also
1997  * perform the first phase of deletion, which includes marking the
1998  * leafbuf page half-dead.
1999  */
2000  Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
2001  if (!_bt_mark_page_halfdead(rel, vstate->info->heaprel, leafbuf,
2002  stack))
2003  {
2004  _bt_relbuf(rel, leafbuf);
2005  return;
2006  }
2007  }
2008 
2009  /*
2010  * Then unlink it from its siblings. Each call to
2011  * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
2012  * making it shallower. Iterate until the leafbuf page is deleted.
2013  */
2014  rightsib_empty = false;
2015  Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
2016  while (P_ISHALFDEAD(opaque))
2017  {
2018  /* Check for interrupts in _bt_unlink_halfdead_page */
2019  if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
2020  &rightsib_empty, vstate))
2021  {
2022  /*
2023  * _bt_unlink_halfdead_page should never fail, since we
2024  * established that deletion is generally safe in
2025  * _bt_mark_page_halfdead -- index must be corrupt.
2026  *
2027  * Note that _bt_unlink_halfdead_page already released the
2028  * lock and pin on leafbuf for us.
2029  */
2030  Assert(false);
2031  return;
2032  }
2033  }
2034 
2035  Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
2036 
2037  rightsib = opaque->btpo_next;
2038 
2039  _bt_relbuf(rel, leafbuf);
2040 
2041  /*
2042  * Check here, as calling loops will have locks held, preventing
2043  * interrupts from being processed.
2044  */
 2045  CHECK_FOR_INTERRUPTS();
 2046 
2047  /*
2048  * The page has now been deleted. If its right sibling is completely
2049  * empty, it's possible that the reason we haven't deleted it earlier
2050  * is that it was the rightmost child of the parent. Now that we
2051  * removed the downlink for this page, the right sibling might now be
2052  * the only child of the parent, and could be removed. It would be
2053  * picked up by the next vacuum anyway, but might as well try to
2054  * remove it now, so loop back to process the right sibling.
2055  *
2056  * Note: This relies on the assumption that _bt_getstackbuf() will be
2057  * able to reuse our original descent stack with a different child
2058  * block (provided that the child block is to the right of the
2059  * original leaf page reached by _bt_search()). It will even update
2060  * the descent stack each time we loop around, avoiding repeated work.
2061  */
2062  if (!rightsib_empty)
2063  break;
2064 
2065  leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2066  }
2067 }
#define LOG
Definition: elog.h:31
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:547
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
static bool _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
Definition: nbtpage.c:1695
static bool _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf, BTStack stack)
Definition: nbtpage.c:2088
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, bool *rightsib_empty, BTVacState *vstate)
Definition: nbtpage.c:2314
#define P_ISHALFDEAD(opaque)
Definition: nbtree.h:224
#define P_ISDELETED(opaque)
Definition: nbtree.h:222
#define P_ISROOT(opaque)
Definition: nbtree.h:221
IndexVacuumInfo * info
Definition: nbtree.h:332
Relation heaprel
Definition: genam.h:47

References _bt_getbuf(), _bt_leftsib_splitflag(), _bt_lockbuf(), _bt_mark_page_halfdead(), _bt_mkscankey(), _bt_relbuf(), _bt_search(), _bt_unlink_halfdead_page(), _bt_unlockbuf(), Assert, BTScanInsertData::backward, BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BufferGetBlockNumber(), BufferGetPage(), CHECK_FOR_INTERRUPTS, CopyIndexTuple(), ereport, errcode(), errhint(), errmsg(), errmsg_internal(), IndexVacuumInfo::heaprel, BTVacState::info, LOG, BTScanInsertData::nextkey, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, P_ISROOT, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), RelationGetRelationName, and ReleaseBuffer().

Referenced by btvacuumpage().

◆ _bt_pageinit()

void _bt_pageinit ( Page  page,
Size  size 
)

Definition at line 1129 of file nbtpage.c.

1130 {
1131  PageInit(page, size, sizeof(BTPageOpaqueData));
1132 }
void PageInit(Page page, Size pageSize, Size specialSize)
Definition: bufpage.c:42
static pg_noinline void Size size
Definition: slab.c:607

References PageInit(), and size.

Referenced by _bt_allocbuf(), _bt_blnewpage(), _bt_initmetapage(), _bt_restore_meta(), _bt_split(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), and btree_xlog_unlink_page().
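
A minimal sketch: _bt_pageinit() is just PageInit() with the btree special space reserved, so initializing a scratch page looks like this (the helper name is hypothetical).

#include "postgres.h"
#include "access/nbtree.h"

/* Hypothetical helper: allocate and zero a blank btree page in local memory */
static Page
make_blank_btree_page(void)
{
    Page        page = (Page) palloc(BLCKSZ);

    _bt_pageinit(page, BLCKSZ);     /* reserves sizeof(BTPageOpaqueData) */

    return page;
}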

◆ _bt_parallel_build_main()

void _bt_parallel_build_main ( dsm_segment seg,
shm_toc toc 
)

Definition at line 1743 of file nbtsort.c.

1744 {
1745  char *sharedquery;
1746  BTSpool *btspool;
1747  BTSpool *btspool2;
1748  BTShared *btshared;
1749  Sharedsort *sharedsort;
1750  Sharedsort *sharedsort2;
1751  Relation heapRel;
1752  Relation indexRel;
1753  LOCKMODE heapLockmode;
1754  LOCKMODE indexLockmode;
1755  WalUsage *walusage;
1756  BufferUsage *bufferusage;
1757  int sortmem;
1758 
1759 #ifdef BTREE_BUILD_STATS
 1760  if (log_btree_build_stats)
 1761  ResetUsage();
1762 #endif /* BTREE_BUILD_STATS */
1763 
1764  /*
1765  * The only possible status flag that can be set to the parallel worker is
1766  * PROC_IN_SAFE_IC.
1767  */
1768  Assert((MyProc->statusFlags == 0) ||
 1769  (MyProc->statusFlags == PROC_IN_SAFE_IC));
 1770 
1771  /* Set debug_query_string for individual workers first */
1772  sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
1773  debug_query_string = sharedquery;
1774 
1775  /* Report the query string from leader */
 1776  pgstat_report_activity(STATE_RUNNING, debug_query_string);
 1777 
1778  /* Look up nbtree shared state */
1779  btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
1780 
1781  /* Open relations using lock modes known to be obtained by index.c */
1782  if (!btshared->isconcurrent)
1783  {
1784  heapLockmode = ShareLock;
1785  indexLockmode = AccessExclusiveLock;
1786  }
1787  else
1788  {
1789  heapLockmode = ShareUpdateExclusiveLock;
1790  indexLockmode = RowExclusiveLock;
1791  }
1792 
1793  /* Track query ID */
1794  pgstat_report_query_id(btshared->queryid, false);
1795 
1796  /* Open relations within worker */
1797  heapRel = table_open(btshared->heaprelid, heapLockmode);
1798  indexRel = index_open(btshared->indexrelid, indexLockmode);
1799 
1800  /* Initialize worker's own spool */
1801  btspool = (BTSpool *) palloc0(sizeof(BTSpool));
1802  btspool->heap = heapRel;
1803  btspool->index = indexRel;
1804  btspool->isunique = btshared->isunique;
1805  btspool->nulls_not_distinct = btshared->nulls_not_distinct;
1806 
1807  /* Look up shared state private to tuplesort.c */
1808  sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
1809  tuplesort_attach_shared(sharedsort, seg);
1810  if (!btshared->isunique)
1811  {
1812  btspool2 = NULL;
1813  sharedsort2 = NULL;
1814  }
1815  else
1816  {
1817  /* Allocate memory for worker's own private secondary spool */
1818  btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
1819 
1820  /* Initialize worker's own secondary spool */
1821  btspool2->heap = btspool->heap;
1822  btspool2->index = btspool->index;
1823  btspool2->isunique = false;
1824  /* Look up shared state private to tuplesort.c */
1825  sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
1826  tuplesort_attach_shared(sharedsort2, seg);
1827  }
1828 
1829  /* Prepare to track buffer usage during parallel execution */
 1830  InstrStartParallelQuery();
 1831 
1832  /* Perform sorting of spool, and possibly a spool2 */
1833  sortmem = maintenance_work_mem / btshared->scantuplesortstates;
1834  _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
1835  sharedsort2, sortmem, false);
1836 
1837  /* Report WAL/buffer usage during parallel execution */
1838  bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
1839  walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
 1840  InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
 1841  &walusage[ParallelWorkerNumber]);
1842 
1843 #ifdef BTREE_BUILD_STATS
 1844  if (log_btree_build_stats)
 1845  {
1846  ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
1847  ResetUsage();
1848  }
1849 #endif /* BTREE_BUILD_STATS */
1850 
1851  index_close(indexRel, indexLockmode);
1852  table_close(heapRel, heapLockmode);
1853 }
int ParallelWorkerNumber
Definition: parallel.c:114
void pgstat_report_query_id(uint64 query_id, bool force)
void pgstat_report_activity(BackendState state, const char *cmd_str)
@ STATE_RUNNING
int maintenance_work_mem
Definition: globals.c:132
bool log_btree_build_stats
Definition: guc_tables.c:507
void index_close(Relation relation, LOCKMODE lockmode)
Definition: indexam.c:177
Relation index_open(Oid relationId, LOCKMODE lockmode)
Definition: indexam.c:133
void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition: instrument.c:208
void InstrStartParallelQuery(void)
Definition: instrument.c:200
int LOCKMODE
Definition: lockdefs.h:26
#define AccessExclusiveLock
Definition: lockdefs.h:43
#define ShareUpdateExclusiveLock
Definition: lockdefs.h:39
#define ShareLock
Definition: lockdefs.h:40
#define RowExclusiveLock
Definition: lockdefs.h:38
#define PARALLEL_KEY_BUFFER_USAGE
Definition: nbtsort.c:66
#define PARALLEL_KEY_TUPLESORT_SPOOL2
Definition: nbtsort.c:63
static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, BTShared *btshared, Sharedsort *sharedsort, Sharedsort *sharedsort2, int sortmem, bool progress)
Definition: nbtsort.c:1868
#define PARALLEL_KEY_BTREE_SHARED
Definition: nbtsort.c:61
#define PARALLEL_KEY_TUPLESORT
Definition: nbtsort.c:62
#define PARALLEL_KEY_QUERY_TEXT
Definition: nbtsort.c:64
#define PARALLEL_KEY_WAL_USAGE
Definition: nbtsort.c:65
const char * debug_query_string
Definition: postgres.c:87
void ShowUsage(const char *title)
Definition: postgres.c:5118
void ResetUsage(void)
Definition: postgres.c:5111
#define PROC_IN_SAFE_IC
Definition: proc.h:59
void * shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
Definition: shm_toc.c:232
PGPROC * MyProc
Definition: proc.c:66
bool isconcurrent
Definition: nbtsort.c:104
Oid heaprelid
Definition: nbtsort.c:100
bool isunique
Definition: nbtsort.c:102
int scantuplesortstates
Definition: nbtsort.c:105
uint64 queryid
Definition: nbtsort.c:108
Oid indexrelid
Definition: nbtsort.c:101
bool nulls_not_distinct
Definition: nbtsort.c:103
bool isunique
Definition: nbtsort.c:84
bool nulls_not_distinct
Definition: nbtsort.c:85
Relation heap
Definition: nbtsort.c:82
Relation index
Definition: nbtsort.c:83
uint8 statusFlags
Definition: proc.h:242
void table_close(Relation relation, LOCKMODE lockmode)
Definition: table.c:126
Relation table_open(Oid relationId, LOCKMODE lockmode)
Definition: table.c:40
void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg)
Definition: tuplesort.c:2961

References _bt_parallel_scan_and_sort(), AccessExclusiveLock, Assert, debug_query_string, BTSpool::heap, BTShared::heaprelid, BTSpool::index, index_close(), index_open(), BTShared::indexrelid, InstrEndParallelQuery(), InstrStartParallelQuery(), BTShared::isconcurrent, BTSpool::isunique, BTShared::isunique, log_btree_build_stats, maintenance_work_mem, MyProc, BTSpool::nulls_not_distinct, BTShared::nulls_not_distinct, palloc0(), PARALLEL_KEY_BTREE_SHARED, PARALLEL_KEY_BUFFER_USAGE, PARALLEL_KEY_QUERY_TEXT, PARALLEL_KEY_TUPLESORT, PARALLEL_KEY_TUPLESORT_SPOOL2, PARALLEL_KEY_WAL_USAGE, ParallelWorkerNumber, pgstat_report_activity(), pgstat_report_query_id(), PROC_IN_SAFE_IC, BTShared::queryid, ResetUsage(), RowExclusiveLock, BTShared::scantuplesortstates, ShareLock, ShareUpdateExclusiveLock, shm_toc_lookup(), ShowUsage(), STATE_RUNNING, PGPROC::statusFlags, table_close(), table_open(), and tuplesort_attach_shared().

◆ _bt_parallel_done()

void _bt_parallel_done ( IndexScanDesc  scan)

Definition at line 774 of file nbtree.c.

775 {
776  BTScanOpaque so = (BTScanOpaque) scan->opaque;
777  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
778  BTParallelScanDesc btscan;
779  bool status_changed = false;
780 
 781  Assert(!BTScanPosIsValid(so->currPos));
 782 
783  /* Do nothing, for non-parallel scans */
784  if (parallel_scan == NULL)
785  return;
786 
787  /*
788  * Should not mark parallel scan done when there's still a pending
789  * primitive index scan
790  */
791  if (so->needPrimScan)
792  return;
793 
794  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
795  parallel_scan->ps_offset);
796 
797  /*
798  * Mark the parallel scan as done, unless some other process did so
799  * already
800  */
801  SpinLockAcquire(&btscan->btps_mutex);
802  Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
803  if (btscan->btps_pageStatus != BTPARALLEL_DONE)
804  {
805  btscan->btps_pageStatus = BTPARALLEL_DONE;
806  status_changed = true;
807  }
808  SpinLockRelease(&btscan->btps_mutex);
809 
810  /* wake up all the workers associated with this parallel scan */
811  if (status_changed)
812  ConditionVariableBroadcast(&btscan->btps_cv);
813 }
#define OffsetToPointer(base, offset)
Definition: c.h:777
void ConditionVariableBroadcast(ConditionVariable *cv)
@ BTPARALLEL_NEED_PRIMSCAN
Definition: nbtree.c:54
@ BTPARALLEL_DONE
Definition: nbtree.c:57
struct BTParallelScanDescData * BTParallelScanDesc
Definition: nbtree.c:82
#define SpinLockRelease(lock)
Definition: spin.h:61
#define SpinLockAcquire(lock)
Definition: spin.h:59

References Assert, BTPARALLEL_DONE, BTPARALLEL_NEED_PRIMSCAN, BTScanPosIsValid, ConditionVariableBroadcast(), BTScanOpaqueData::currPos, BTScanOpaqueData::needPrimScan, OffsetToPointer, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, SpinLockAcquire, and SpinLockRelease.

Referenced by _bt_endpoint(), _bt_first(), _bt_parallel_seize(), _bt_readnextpage(), and _bt_start_prim_scan().

◆ _bt_parallel_primscan_schedule()

void _bt_parallel_primscan_schedule ( IndexScanDesc  scan,
BlockNumber  curr_page 
)

Definition at line 824 of file nbtree.c.

825 {
826  BTScanOpaque so = (BTScanOpaque) scan->opaque;
827  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
828  BTParallelScanDesc btscan;
829 
830  Assert(so->numArrayKeys);
831 
832  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
833  parallel_scan->ps_offset);
834 
835  SpinLockAcquire(&btscan->btps_mutex);
836  if (btscan->btps_lastCurrPage == curr_page &&
837  btscan->btps_pageStatus == BTPARALLEL_IDLE)
838  {
839  btscan->btps_nextScanPage = InvalidBlockNumber;
840  btscan->btps_lastCurrPage = InvalidBlockNumber;
841  btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
842 
843  /* Serialize scan's current array keys */
844  for (int i = 0; i < so->numArrayKeys; i++)
845  {
846  BTArrayKeyInfo *array = &so->arrayKeys[i];
847 
848  btscan->btps_arrElems[i] = array->cur_elem;
849  }
850  }
851  SpinLockRelease(&btscan->btps_mutex);
852 }
@ BTPARALLEL_IDLE
Definition: nbtree.c:56
BTArrayKeyInfo * arrayKeys
Definition: nbtree.h:1043

References BTScanOpaqueData::arrayKeys, Assert, BTPARALLEL_IDLE, BTPARALLEL_NEED_PRIMSCAN, BTArrayKeyInfo::cur_elem, i, InvalidBlockNumber, BTScanOpaqueData::numArrayKeys, OffsetToPointer, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, SpinLockAcquire, and SpinLockRelease.

Referenced by _bt_advance_array_keys().

◆ _bt_parallel_release()

void _bt_parallel_release ( IndexScanDesc  scan,
BlockNumber  next_scan_page,
BlockNumber  curr_page 
)

Definition at line 747 of file nbtree.c.

749 {
750  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
751  BTParallelScanDesc btscan;
752 
753  Assert(BlockNumberIsValid(next_scan_page));
754 
755  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
756  parallel_scan->ps_offset);
757 
758  SpinLockAcquire(&btscan->btps_mutex);
759  btscan->btps_nextScanPage = next_scan_page;
760  btscan->btps_lastCurrPage = curr_page;
 761  btscan->btps_pageStatus = BTPARALLEL_IDLE;
 762  SpinLockRelease(&btscan->btps_mutex);
 763  ConditionVariableSignal(&btscan->btps_cv);
 764 }
void ConditionVariableSignal(ConditionVariable *cv)
slock_t btps_mutex
Definition: nbtree.c:72
BTPS_State btps_pageStatus
Definition: nbtree.c:69
BlockNumber btps_lastCurrPage
Definition: nbtree.c:67
ConditionVariable btps_cv
Definition: nbtree.c:73
BlockNumber btps_nextScanPage
Definition: nbtree.c:66

References Assert, BlockNumberIsValid(), BTPARALLEL_IDLE, BTParallelScanDescData::btps_cv, BTParallelScanDescData::btps_lastCurrPage, BTParallelScanDescData::btps_mutex, BTParallelScanDescData::btps_nextScanPage, BTParallelScanDescData::btps_pageStatus, ConditionVariableSignal(), OffsetToPointer, IndexScanDescData::parallel_scan, ParallelIndexScanDescData::ps_offset, SpinLockAcquire, and SpinLockRelease.

Referenced by _bt_readnextpage(), and _bt_readpage().

◆ _bt_parallel_seize()

bool _bt_parallel_seize ( IndexScanDesc  scan,
BlockNumber next_scan_page,
BlockNumber last_curr_page,
bool  first 
)

Definition at line 605 of file nbtree.c.

607 {
608  BTScanOpaque so = (BTScanOpaque) scan->opaque;
609  bool exit_loop = false,
610  status = true,
611  endscan = false;
612  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
613  BTParallelScanDesc btscan;
614 
615  *next_scan_page = InvalidBlockNumber;
616  *last_curr_page = InvalidBlockNumber;
617 
618  /*
619  * Reset so->currPos, and initialize moreLeft/moreRight such that the next
620  * call to _bt_readnextpage treats this backend similarly to a serial
621  * backend that steps from *last_curr_page to *next_scan_page (unless this
622  * backend's so->currPos is initialized by _bt_readfirstpage before then).
623  */
625  so->currPos.moreLeft = so->currPos.moreRight = true;
626 
627  if (first)
628  {
629  /*
630  * Initialize array related state when called from _bt_first, assuming
631  * that this will be the first primitive index scan for the scan
632  */
633  so->needPrimScan = false;
634  so->scanBehind = false;
635  so->oppositeDirCheck = false;
636  }
637  else
638  {
639  /*
640  * Don't attempt to seize the scan when it requires another primitive
641  * index scan, since caller's backend cannot start it right now
642  */
643  if (so->needPrimScan)
644  return false;
645  }
646 
647  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
648  parallel_scan->ps_offset);
649 
650  while (1)
651  {
652  SpinLockAcquire(&btscan->btps_mutex);
653 
654  if (btscan->btps_pageStatus == BTPARALLEL_DONE)
655  {
656  /* We're done with this parallel index scan */
657  status = false;
658  }
659  else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
660  btscan->btps_nextScanPage == P_NONE)
661  {
662  /* End this parallel index scan */
663  status = false;
664  endscan = true;
665  }
666  else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
667  {
668  Assert(so->numArrayKeys);
669 
670  if (first)
671  {
672  /* Can start scheduled primitive scan right away, so do so */
673  btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
674  for (int i = 0; i < so->numArrayKeys; i++)
675  {
676  BTArrayKeyInfo *array = &so->arrayKeys[i];
677  ScanKey skey = &so->keyData[array->scan_key];
678 
679  array->cur_elem = btscan->btps_arrElems[i];
680  skey->sk_argument = array->elem_values[array->cur_elem];
681  }
682  exit_loop = true;
683  }
684  else
685  {
686  /*
687  * Don't attempt to seize the scan when it requires another
688  * primitive index scan, since caller's backend cannot start
689  * it right now
690  */
691  status = false;
692  }
693 
694  /*
695  * Either way, update backend local state to indicate that a
696  * pending primitive scan is required
697  */
698  so->needPrimScan = true;
699  so->scanBehind = false;
700  so->oppositeDirCheck = false;
701  }
702  else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
703  {
704  /*
705  * We have successfully seized control of the scan for the purpose
706  * of advancing it to a new page!
707  */
708  btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
709  Assert(btscan->btps_nextScanPage != P_NONE);
710  *next_scan_page = btscan->btps_nextScanPage;
711  *last_curr_page = btscan->btps_lastCurrPage;
712  exit_loop = true;
713  }
714  SpinLockRelease(&btscan->btps_mutex);
715  if (exit_loop || !status)
716  break;
717  ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
718  }
 719  ConditionVariableCancelSleep();
 720 
721  /* When the scan has reached the rightmost (or leftmost) page, end it */
722  if (endscan)
723  _bt_parallel_done(scan);
724 
725  return status;
726 }
bool ConditionVariableCancelSleep(void)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
@ BTPARALLEL_ADVANCING
Definition: nbtree.c:55
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:1016
Datum * elem_values
Definition: nbtree.h:1028
bool moreRight
Definition: nbtree.h:975
bool moreLeft
Definition: nbtree.h:974

References _bt_parallel_done(), BTScanOpaqueData::arrayKeys, Assert, BTPARALLEL_ADVANCING, BTPARALLEL_DONE, BTPARALLEL_IDLE, BTPARALLEL_NEED_PRIMSCAN, BTScanPosInvalidate, ConditionVariableCancelSleep(), ConditionVariableSleep(), BTArrayKeyInfo::cur_elem, BTScanOpaqueData::currPos, BTArrayKeyInfo::elem_values, i, InvalidBlockNumber, BTScanOpaqueData::keyData, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, OffsetToPointer, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, P_NONE, IndexScanDescData::parallel_scan, BTArrayKeyInfo::scan_key, BTScanOpaqueData::scanBehind, ScanKeyData::sk_argument, SpinLockAcquire, and SpinLockRelease.

Referenced by _bt_first(), and _bt_readnextpage().

◆ _bt_pendingfsm_finalize()

void _bt_pendingfsm_finalize ( Relation  rel,
BTVacState vstate 
)

Definition at line 2995 of file nbtpage.c.

2996 {
2997  IndexBulkDeleteResult *stats = vstate->stats;
2998  Relation heaprel = vstate->info->heaprel;
2999 
3000  Assert(stats->pages_newly_deleted >= vstate->npendingpages);
3001  Assert(heaprel != NULL);
3002 
3003  if (vstate->npendingpages == 0)
3004  {
3005  /* Just free memory when nothing to do */
3006  if (vstate->pendingpages)
3007  pfree(vstate->pendingpages);
3008 
3009  return;
3010  }
3011 
3012 #ifdef DEBUG_BTREE_PENDING_FSM
3013 
3014  /*
3015  * Debugging aid: Sleep for 5 seconds to greatly increase the chances of
3016  * placing pending pages in the FSM. Note that the optimization will
3017  * never be effective without some other backend concurrently consuming an
3018  * XID.
3019  */
3020  pg_usleep(5000000L);
3021 #endif
3022 
3023  /*
3024  * Recompute VACUUM XID boundaries.
3025  *
3026  * We don't actually care about the oldest non-removable XID. Computing
3027  * the oldest such XID has a useful side-effect that we rely on: it
3028  * forcibly updates the XID horizon state for this backend. This step is
3029  * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize
3030  * that it is now safe to recycle newly deleted pages without this step.
3031  */
 3032  (void) GetOldestNonRemovableTransactionId(heaprel);
 3033 
3034  for (int i = 0; i < vstate->npendingpages; i++)
3035  {
3036  BlockNumber target = vstate->pendingpages[i].target;
3037  FullTransactionId safexid = vstate->pendingpages[i].safexid;
3038 
3039  /*
3040  * Do the equivalent of checking BTPageIsRecyclable(), but without
3041  * accessing the page a second time.
3042  *
3043  * Give up upon finding the first non-recyclable page -- all later pages
3044  * must be non-recyclable too, since _bt_pendingfsm_add() adds pages
3045  * to the array in safexid order.
3046  */
3047  if (!GlobalVisCheckRemovableFullXid(heaprel, safexid))
3048  break;
3049 
3050  RecordFreeIndexPage(rel, target);
3051  stats->pages_free++;
3052  }
3053 
3054  pfree(vstate->pendingpages);
3055 }
void RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
Definition: indexfsm.c:52
TransactionId GetOldestNonRemovableTransactionId(Relation rel)
Definition: procarray.c:2005
bool GlobalVisCheckRemovableFullXid(Relation rel, FullTransactionId fxid)
Definition: procarray.c:4290
void pg_usleep(long microsec)
Definition: signal.c:53
FullTransactionId safexid
Definition: nbtree.h:327
BlockNumber target
Definition: nbtree.h:326
IndexBulkDeleteResult * stats
Definition: nbtree.h:333
BTPendingFSM * pendingpages
Definition: nbtree.h:344
int npendingpages
Definition: nbtree.h:345
BlockNumber pages_newly_deleted
Definition: genam.h:81
BlockNumber pages_free
Definition: genam.h:83

References Assert, GetOldestNonRemovableTransactionId(), GlobalVisCheckRemovableFullXid(), IndexVacuumInfo::heaprel, i, BTVacState::info, BTVacState::npendingpages, IndexBulkDeleteResult::pages_free, IndexBulkDeleteResult::pages_newly_deleted, BTVacState::pendingpages, pfree(), pg_usleep(), RecordFreeIndexPage(), BTPendingFSM::safexid, BTVacState::stats, and BTPendingFSM::target.

Referenced by btvacuumscan().

◆ _bt_pendingfsm_init()

void _bt_pendingfsm_init ( Relation  rel,
BTVacState vstate,
bool  cleanuponly 
)

Definition at line 2954 of file nbtpage.c.

2955 {
2956  int64 maxbufsize;
2957 
2958  /*
2959  * Don't bother with optimization in cleanup-only case -- we don't expect
2960  * any newly deleted pages. Besides, cleanup-only calls to btvacuumscan()
2961  * can only take place because this optimization didn't work out during
2962  * the last VACUUM.
2963  */
2964  if (cleanuponly)
2965  return;
2966 
2967  /*
2968  * Cap maximum size of array so that we always respect work_mem. Avoid
2969  * int overflow here.
2970  */
2971  vstate->bufsize = 256;
2972  maxbufsize = (work_mem * 1024L) / sizeof(BTPendingFSM);
2973  maxbufsize = Min(maxbufsize, INT_MAX);
2974  maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM));
2975  /* Stay sane with small work_mem */
2976  maxbufsize = Max(maxbufsize, vstate->bufsize);
2977  vstate->maxbufsize = maxbufsize;
2978 
2979  /* Allocate buffer, indicate that there are currently 0 pending pages */
2980  vstate->pendingpages = palloc(sizeof(BTPendingFSM) * vstate->bufsize);
2981  vstate->npendingpages = 0;
2982 }
#define MaxAllocSize
Definition: fe_memutils.h:22
int work_mem
Definition: globals.c:130
int bufsize
Definition: nbtree.h:342
int maxbufsize
Definition: nbtree.h:343

References BTVacState::bufsize, Max, MaxAllocSize, BTVacState::maxbufsize, Min, BTVacState::npendingpages, palloc(), BTVacState::pendingpages, and work_mem.

Referenced by btvacuumscan().
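
The two pending-FSM routines bracket one VACUUM pass. A minimal sketch of the lifecycle, as a btvacuumscan()-style caller might drive it (the setup of rel, vstate, cleanuponly, target and safexid is assumed and not shown in this excerpt):

    _bt_pendingfsm_init(rel, &vstate, cleanuponly);

    /* ... page-at-a-time scan; for each page this VACUUM newly deleted ... */
    _bt_pendingfsm_add(&vstate, target, safexid);

    /* after the physical scan: place now-recyclable pages in the FSM */
    _bt_pendingfsm_finalize(rel, &vstate);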

◆ _bt_preprocess_keys()

void _bt_preprocess_keys ( IndexScanDesc  scan)

Definition at line 2530 of file nbtutils.c.

2531 {
2532  BTScanOpaque so = (BTScanOpaque) scan->opaque;
2533  int numberOfKeys = scan->numberOfKeys;
2534  int16 *indoption = scan->indexRelation->rd_indoption;
2535  int new_numberOfKeys;
2536  int numberOfEqualCols;
2537  ScanKey inkeys;
2538  BTScanKeyPreproc xform[BTMaxStrategyNumber];
2539  bool test_result;
2540  AttrNumber attno;
2541  ScanKey arrayKeyData;
2542  int *keyDataMap = NULL;
2543  int arrayidx = 0;
2544 
2545  if (so->numberOfKeys > 0)
2546  {
2547  /*
2548  * Only need to do preprocessing once per btrescan, at most. All
2549  * calls after the first are handled as no-ops.
2550  *
2551  * If there are array scan keys in so->keyData[], then the now-current
2552  * array elements must already be present in each array's scan key.
2553  * Verify that that happened using an assertion.
2554  */
2555  Assert(_bt_verify_keys_with_arraykeys(scan));
2556  return;
2557  }
2558 
2559  /* initialize result variables */
2560  so->qual_ok = true;
2561  so->numberOfKeys = 0;
2562 
2563  if (numberOfKeys < 1)
2564  return; /* done if qual-less scan */
2565 
2566  /* If any keys are SK_SEARCHARRAY type, set up array-key info */
2567  arrayKeyData = _bt_preprocess_array_keys(scan, &numberOfKeys);
2568  if (!so->qual_ok)
2569  {
2570  /* unmatchable array, so give up */
2571  return;
2572  }
2573 
2574  /*
2575  * Treat arrayKeyData[] (a partially preprocessed copy of scan->keyData[])
2576  * as our input if _bt_preprocess_array_keys just allocated it, else just
2577  * use scan->keyData[]
2578  */
2579  if (arrayKeyData)
2580  {
2581  inkeys = arrayKeyData;
2582 
2583  /* Also maintain keyDataMap for remapping so->orderProc[] later */
2584  keyDataMap = MemoryContextAlloc(so->arrayContext,
2585  numberOfKeys * sizeof(int));
2586  }
2587  else
2588  inkeys = scan->keyData;
2589 
2590  /* we check that input keys are correctly ordered */
2591  if (inkeys[0].sk_attno < 1)
2592  elog(ERROR, "btree index keys must be ordered by attribute");
2593 
2594  /* We can short-circuit most of the work if there's just one key */
2595  if (numberOfKeys == 1)
2596  {
2597  /* Apply indoption to scankey (might change sk_strategy!) */
2598  if (!_bt_fix_scankey_strategy(&inkeys[0], indoption))
2599  so->qual_ok = false;
2600  memcpy(&so->keyData[0], &inkeys[0], sizeof(ScanKeyData));
2601  so->numberOfKeys = 1;
2602  /* We can mark the qual as required if it's for first index col */
2603  if (inkeys[0].sk_attno == 1)
2604  _bt_mark_scankey_required(&so->keyData[0]);
2605  if (arrayKeyData)
2606  {
2607  /*
2608  * Don't call _bt_preprocess_array_keys_final in this fast path
2609  * (we'll miss out on the single value array transformation, but
2610  * that's not nearly as important when there's only one scan key)
2611  */
2614  (so->arrayKeys[0].scan_key == 0 &&
2615  OidIsValid(so->orderProcs[0].fn_oid)));
2616  }
2617 
2618  return;
2619  }
2620 
2621  /*
2622  * Otherwise, do the full set of pushups.
2623  */
2624  new_numberOfKeys = 0;
2625  numberOfEqualCols = 0;
2626 
2627  /*
2628  * Initialize for processing of keys for attr 1.
2629  *
2630  * xform[i] points to the currently best scan key of strategy type i+1; it
2631  * is NULL if we haven't yet found such a key for this attr.
2632  */
2633  attno = 1;
2634  memset(xform, 0, sizeof(xform));
2635 
2636  /*
2637  * Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to
2638  * handle after-last-key processing. Actual exit from the loop is at the
2639  * "break" statement below.
2640  */
2641  for (int i = 0;; i++)
2642  {
2643  ScanKey inkey = inkeys + i;
2644  int j;
2645 
2646  if (i < numberOfKeys)
2647  {
2648  /* Apply indoption to scankey (might change sk_strategy!) */
2649  if (!_bt_fix_scankey_strategy(inkey, indoption))
2650  {
2651  /* NULL can't be matched, so give up */
2652  so->qual_ok = false;
2653  return;
2654  }
2655  }
2656 
2657  /*
2658  * If we are at the end of the keys for a particular attr, finish up
2659  * processing and emit the cleaned-up keys.
2660  */
2661  if (i == numberOfKeys || inkey->sk_attno != attno)
2662  {
2663  int priorNumberOfEqualCols = numberOfEqualCols;
2664 
2665  /* check input keys are correctly ordered */
2666  if (i < numberOfKeys && inkey->sk_attno < attno)
2667  elog(ERROR, "btree index keys must be ordered by attribute");
2668 
2669  /*
2670  * If = has been specified, all other keys can be eliminated as
2671  * redundant. Note that this is no less true if the = key is
2672  * SEARCHARRAY; the only real difference is that the inequality
2673  * key _becomes_ redundant by making _bt_compare_scankey_args
2674  * eliminate the subset of elements that won't need to be matched.
2675  *
2676  * If we have a case like "key = 1 AND key > 2", we set qual_ok to
2677  * false and abandon further processing. We'll do the same thing
2678  * given a case like "key IN (0, 1) AND key > 2".
2679  *
2680  * We also have to deal with the case of "key IS NULL", which is
2681  * unsatisfiable in combination with any other index condition. By
2682  * the time we get here, that's been classified as an equality
2683  * check, and we've rejected any combination of it with a regular
2684  * equality condition; but not with other types of conditions.
2685  */
2686  if (xform[BTEqualStrategyNumber - 1].inkey)
2687  {
2688  ScanKey eq = xform[BTEqualStrategyNumber - 1].inkey;
2689  BTArrayKeyInfo *array = NULL;
2690  FmgrInfo *orderproc = NULL;
2691 
2692  if (arrayKeyData && (eq->sk_flags & SK_SEARCHARRAY))
2693  {
2694  int eq_in_ikey,
2695  eq_arrayidx;
2696 
2697  eq_in_ikey = xform[BTEqualStrategyNumber - 1].inkeyi;
2698  eq_arrayidx = xform[BTEqualStrategyNumber - 1].arrayidx;
2699  array = &so->arrayKeys[eq_arrayidx - 1];
2700  orderproc = so->orderProcs + eq_in_ikey;
2701 
2702  Assert(array->scan_key == eq_in_ikey);
2703  Assert(OidIsValid(orderproc->fn_oid));
2704  }
2705 
2706  for (j = BTMaxStrategyNumber; --j >= 0;)
2707  {
2708  ScanKey chk = xform[j].inkey;
2709 
2710  if (!chk || j == (BTEqualStrategyNumber - 1))
2711  continue;
2712 
2713  if (eq->sk_flags & SK_SEARCHNULL)
2714  {
2715  /* IS NULL is contradictory to anything else */
2716  so->qual_ok = false;
2717  return;
2718  }
2719 
2720  if (_bt_compare_scankey_args(scan, chk, eq, chk,
2721  array, orderproc,
2722  &test_result))
2723  {
2724  if (!test_result)
2725  {
2726  /* keys proven mutually contradictory */
2727  so->qual_ok = false;
2728  return;
2729  }
2730  /* else discard the redundant non-equality key */
2731  Assert(!array || array->num_elems > 0);
2732  xform[j].inkey = NULL;
2733  xform[j].inkeyi = -1;
2734  }
2735  /* else, cannot determine redundancy, keep both keys */
2736  }
2737  /* track number of attrs for which we have "=" keys */
2738  numberOfEqualCols++;
2739  }
2740 
2741  /* try to keep only one of <, <= */
2742  if (xform[BTLessStrategyNumber - 1].inkey &&
2743  xform[BTLessEqualStrategyNumber - 1].inkey)
2744  {
2745  ScanKey lt = xform[BTLessStrategyNumber - 1].inkey;
2746  ScanKey le = xform[BTLessEqualStrategyNumber - 1].inkey;
2747 
2748  if (_bt_compare_scankey_args(scan, le, lt, le, NULL, NULL,
2749  &test_result))
2750  {
2751  if (test_result)
2752  xform[BTLessEqualStrategyNumber - 1].inkey = NULL;
2753  else
2754  xform[BTLessStrategyNumber - 1].inkey = NULL;
2755  }
2756  }
2757 
2758  /* try to keep only one of >, >= */
2759  if (xform[BTGreaterStrategyNumber - 1].inkey &&
2760  xform[BTGreaterEqualStrategyNumber - 1].inkey)
2761  {
2762  ScanKey gt = xform[BTGreaterStrategyNumber - 1].inkey;
2763  ScanKey ge = xform[BTGreaterEqualStrategyNumber - 1].inkey;
2764 
2765  if (_bt_compare_scankey_args(scan, ge, gt, ge, NULL, NULL,
2766  &test_result))
2767  {
2768  if (test_result)
2769  xform[BTGreaterEqualStrategyNumber - 1].inkey = NULL;
2770  else
2771  xform[BTGreaterStrategyNumber - 1].inkey = NULL;
2772  }
2773  }
2774 
2775  /*
2776  * Emit the cleaned-up keys into the so->keyData[] array, and then
2777  * mark them if they are required. They are required (possibly
2778  * only in one direction) if all attrs before this one had "=".
2779  */
2780  for (j = BTMaxStrategyNumber; --j >= 0;)
2781  {
2782  if (xform[j].inkey)
2783  {
2784  ScanKey outkey = &so->keyData[new_numberOfKeys++];
2785 
2786  memcpy(outkey, xform[j].inkey, sizeof(ScanKeyData));
2787  if (arrayKeyData)
2788  keyDataMap[new_numberOfKeys - 1] = xform[j].inkeyi;
2789  if (priorNumberOfEqualCols == attno - 1)
2790  _bt_mark_scankey_required(outkey);
2791  }
2792  }
2793 
2794  /*
2795  * Exit loop here if done.
2796  */
2797  if (i == numberOfKeys)
2798  break;
2799 
2800  /* Re-initialize for new attno */
2801  attno = inkey->sk_attno;
2802  memset(xform, 0, sizeof(xform));
2803  }
2804 
2805  /* check strategy this key's operator corresponds to */
2806  j = inkey->sk_strategy - 1;
2807 
2808  /* if row comparison, push it directly to the output array */
2809  if (inkey->sk_flags & SK_ROW_HEADER)
2810  {
2811  ScanKey outkey = &so->keyData[new_numberOfKeys++];
2812 
2813  memcpy(outkey, inkey, sizeof(ScanKeyData));
2814  if (arrayKeyData)
2815  keyDataMap[new_numberOfKeys - 1] = i;
2816  if (numberOfEqualCols == attno - 1)
2817  _bt_mark_scankey_required(outkey);
2818 
2819  /*
2820  * We don't support RowCompare using equality; such a qual would
2821  * mess up the numberOfEqualCols tracking.
2822  */
2823  Assert(j != (BTEqualStrategyNumber - 1));
2824  continue;
2825  }
2826 
2827  if (inkey->sk_strategy == BTEqualStrategyNumber &&
2828  (inkey->sk_flags & SK_SEARCHARRAY))
2829  {
2830  /* must track how input scan keys map to arrays */
2831  Assert(arrayKeyData);
2832  arrayidx++;
2833  }
2834 
2835  /*
2836  * have we seen a scan key for this same attribute and using this same
2837  * operator strategy before now?
2838  */
2839  if (xform[j].inkey == NULL)
2840  {
2841  /* nope, so this scan key wins by default (at least for now) */
2842  xform[j].inkey = inkey;
2843  xform[j].inkeyi = i;
2844  xform[j].arrayidx = arrayidx;
2845  }
2846  else
2847  {
2848  FmgrInfo *orderproc = NULL;
2849  BTArrayKeyInfo *array = NULL;
2850 
2851  /*
2852  * Seen one of these before, so keep only the more restrictive key
2853  * if possible
2854  */
2855  if (j == (BTEqualStrategyNumber - 1) && arrayKeyData)
2856  {
2857  /*
2858  * Have to set up array keys
2859  */
2860  if (inkey->sk_flags & SK_SEARCHARRAY)
2861  {
2862  array = &so->arrayKeys[arrayidx - 1];
2863  orderproc = so->orderProcs + i;
2864 
2865  Assert(array->scan_key == i);
2866  Assert(OidIsValid(orderproc->fn_oid));
2867  }
2868  else if (xform[j].inkey->sk_flags & SK_SEARCHARRAY)
2869  {
2870  array = &so->arrayKeys[xform[j].arrayidx - 1];
2871  orderproc = so->orderProcs + xform[j].inkeyi;
2872 
2873  Assert(array->scan_key == xform[j].inkeyi);
2874  Assert(OidIsValid(orderproc->fn_oid));
2875  }
2876 
2877  /*
2878  * Both scan keys might have arrays, in which case we'll
2879  * arbitrarily pass only one of the arrays. That won't
2880  * matter, since _bt_compare_scankey_args is aware that two
2881  * SEARCHARRAY scan keys mean that _bt_preprocess_array_keys
2882  * failed to eliminate redundant arrays through array merging.
2883  * _bt_compare_scankey_args just returns false when it sees
2884  * this; it won't even try to examine either array.
2885  */
2886  }
2887 
2888  if (_bt_compare_scankey_args(scan, inkey, inkey, xform[j].inkey,
2889  array, orderproc, &test_result))
2890  {
2891  /* Have all we need to determine redundancy */
2892  if (test_result)
2893  {
2894  Assert(!array || array->num_elems > 0);
2895 
2896  /*
2897  * New key is more restrictive, and so replaces old key...
2898  */
2899  if (j != (BTEqualStrategyNumber - 1) ||
2900  !(xform[j].inkey->sk_flags & SK_SEARCHARRAY))
2901  {
2902  xform[j].inkey = inkey;
2903  xform[j].inkeyi = i;
2904  xform[j].arrayidx = arrayidx;
2905  }
2906  else
2907  {
2908  /*
2909  * ...unless we have to keep the old key because it's
2910  * an array that rendered the new key redundant. We
2911  * need to make sure that we don't throw away an array
2912  * scan key. _bt_preprocess_array_keys_final expects
2913  * us to keep all of the arrays that weren't already
2914  * eliminated by _bt_preprocess_array_keys earlier on.
2915  */
2916  Assert(!(inkey->sk_flags & SK_SEARCHARRAY));
2917  }
2918  }
2919  else if (j == (BTEqualStrategyNumber - 1))
2920  {
2921  /* key == a && key == b, but a != b */
2922  so->qual_ok = false;
2923  return;
2924  }
2925  /* else old key is more restrictive, keep it */
2926  }
2927  else
2928  {
2929  /*
2930  * We can't determine which key is more restrictive. Push
2931  * xform[j] directly to the output array, then set xform[j] to
2932  * the new scan key.
2933  *
2934  * Note: We do things this way around so that our arrays are
2935  * always in the same order as their corresponding scan keys,
2936  * even with incomplete opfamilies. _bt_advance_array_keys
2937  * depends on this.
2938  */
2939  ScanKey outkey = &so->keyData[new_numberOfKeys++];
2940 
2941  memcpy(outkey, xform[j].inkey, sizeof(ScanKeyData));
2942  if (arrayKeyData)
2943  keyDataMap[new_numberOfKeys - 1] = xform[j].inkeyi;
2944  if (numberOfEqualCols == attno - 1)
2945  _bt_mark_scankey_required(outkey);
2946  xform[j].inkey = inkey;
2947  xform[j].inkeyi = i;
2948  xform[j].arrayidx = arrayidx;
2949  }
2950  }
2951  }
2952 
2953  so->numberOfKeys = new_numberOfKeys;
2954 
2955  /*
2956  * Now that we've built a temporary mapping from so->keyData[] (output
2957  * scan keys) to arrayKeyData[] (our input scan keys), fix array->scan_key
2958  * references. Also consolidate the so->orderProcs[] array such that it
2959  * can be subscripted using so->keyData[]-wise offsets.
2960  */
2961  if (arrayKeyData)
2962  _bt_preprocess_array_keys_final(scan, keyDataMap);
2963 
2964  /* Could pfree arrayKeyData/keyDataMap now, but not worth the cycles */
2965 }
static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
Definition: nbtutils.c:3300
static void _bt_mark_scankey_required(ScanKey skey)
Definition: nbtutils.c:3402
static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys)
Definition: nbtutils.c:270
static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap)
Definition: nbtutils.c:560
static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, ScanKey leftarg, ScanKey rightarg, BTArrayKeyInfo *array, FmgrInfo *orderproc, bool *result)
Definition: nbtutils.c:3093
#define SK_SEARCHARRAY
Definition: skey.h:120
#define SK_SEARCHNULL
Definition: skey.h:121
#define BTMaxStrategyNumber
Definition: stratnum.h:35
FmgrInfo * orderProcs
Definition: nbtree.h:1044
MemoryContext arrayContext
Definition: nbtree.h:1045
Oid fn_oid
Definition: fmgr.h:59
struct ScanKeyData * keyData
Definition: relscan.h:145

References _bt_compare_scankey_args(), _bt_fix_scankey_strategy(), _bt_mark_scankey_required(), _bt_preprocess_array_keys(), _bt_preprocess_array_keys_final(), BTScanOpaqueData::arrayContext, BTScanOpaqueData::arrayKeys, Assert, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTMaxStrategyNumber, elog, ERROR, FmgrInfo::fn_oid, i, if(), IndexScanDescData::indexRelation, j, BTScanOpaqueData::keyData, IndexScanDescData::keyData, MemoryContextAlloc(), BTArrayKeyInfo::num_elems, BTScanOpaqueData::numberOfKeys, IndexScanDescData::numberOfKeys, OidIsValid, IndexScanDescData::opaque, BTScanOpaqueData::orderProcs, BTScanOpaqueData::qual_ok, RelationData::rd_indoption, BTArrayKeyInfo::scan_key, ScanKeyData::sk_attno, ScanKeyData::sk_flags, SK_ROW_HEADER, SK_SEARCHARRAY, SK_SEARCHNULL, and ScanKeyData::sk_strategy.

Referenced by _bt_first().
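
An illustrative example (not taken from the source): given the qual a = 5 AND a > 3 AND b < 10 on an index over (a, b), preprocessing discards a > 3 as redundant once the a = 5 key is examined, emits a = 5 and b < 10 into so->keyData[], and marks both as required (possibly only in one scan direction) because every attribute before b has an equality key. Given a = 5 AND a > 7 instead, _bt_compare_scankey_args() proves the two keys mutually contradictory, qual_ok is set to false, and _bt_first() can return without descending the tree.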

◆ _bt_relandgetbuf()

Buffer _bt_relandgetbuf ( Relation  rel,
Buffer  obuf,
BlockNumber  blkno,
int  access 
)

Definition at line 1003 of file nbtpage.c.

1004 {
1005  Buffer buf;
1006 
1007  Assert(BlockNumberIsValid(blkno));
1008  if (BufferIsValid(obuf))
1009  _bt_unlockbuf(rel, obuf);
1010  buf = ReleaseAndReadBuffer(obuf, rel, blkno);
1011  _bt_lockbuf(rel, buf, access);
1012 
1013  _bt_checkpage(rel, buf);
1014  return buf;
1015 }
Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum)
Definition: bufmgr.c:2594

References _bt_checkpage(), _bt_lockbuf(), _bt_unlockbuf(), Assert, BlockNumberIsValid(), buf, BufferIsValid(), and ReleaseAndReadBuffer().

Referenced by _bt_check_unique(), _bt_get_endpoint(), _bt_getroot(), _bt_gettrueroot(), _bt_lock_and_validate_left(), _bt_moveright(), _bt_search(), and _bt_stepright().

◆ _bt_relbuf()

◆ _bt_search()

BTStack _bt_search ( Relation  rel,
Relation  heaprel,
BTScanInsert  key,
Buffer bufP,
int  access 
)

Definition at line 102 of file nbtsearch.c.

104 {
105  BTStack stack_in = NULL;
106  int page_access = BT_READ;
107 
108  /* heaprel must be set whenever _bt_allocbuf is reachable */
109  Assert(access == BT_READ || access == BT_WRITE);
110  Assert(access == BT_READ || heaprel != NULL);
111 
112  /* Get the root page to start with */
113  *bufP = _bt_getroot(rel, heaprel, access);
114 
115  /* If index is empty and access = BT_READ, no root page is created. */
116  if (!BufferIsValid(*bufP))
117  return (BTStack) NULL;
118 
119  /* Loop iterates once per level descended in the tree */
120  for (;;)
121  {
122  Page page;
123  BTPageOpaque opaque;
124  OffsetNumber offnum;
125  ItemId itemid;
126  IndexTuple itup;
127  BlockNumber child;
128  BTStack new_stack;
129 
130  /*
131  * Race -- the page we just grabbed may have split since we read its
132  * downlink in its parent page (or the metapage). If it has, we may
133  * need to move right to its new sibling. Do that.
134  *
135  * In write-mode, allow _bt_moveright to finish any incomplete splits
136  * along the way. Strictly speaking, we'd only need to finish an
137  * incomplete split on the leaf page we're about to insert to, not on
138  * any of the upper levels (internal pages with incomplete splits are
139  * also taken care of in _bt_getstackbuf). But this is a good
140  * opportunity to finish splits of internal pages too.
141  */
142  *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
143  stack_in, page_access);
144 
145  /* if this is a leaf page, we're done */
146  page = BufferGetPage(*bufP);
147  opaque = BTPageGetOpaque(page);
148  if (P_ISLEAF(opaque))
149  break;
150 
151  /*
152  * Find the appropriate pivot tuple on this page. Its downlink points
153  * to the child page that we're about to descend to.
154  */
155  offnum = _bt_binsrch(rel, key, *bufP);
156  itemid = PageGetItemId(page, offnum);
157  itup = (IndexTuple) PageGetItem(page, itemid);
158  Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
159  child = BTreeTupleGetDownLink(itup);
160 
161  /*
162  * We need to save the location of the pivot tuple we chose in a new
163  * stack entry for this page/level. If caller ends up splitting a
164  * page one level down, it usually ends up inserting a new pivot
165  * tuple/downlink immediately after the location recorded here.
166  */
167  new_stack = (BTStack) palloc(sizeof(BTStackData));
168  new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
169  new_stack->bts_offset = offnum;
170  new_stack->bts_parent = stack_in;
171 
172  /*
173  * Page level 1 is the lowest non-leaf level, just above the leaves. So,
174  * if we're at level 1 and were asked to lock the leaf page in write
175  * mode, lock the next page in write mode, because it must be a leaf.
176  */
177  if (opaque->btpo_level == 1 && access == BT_WRITE)
178  page_access = BT_WRITE;
179 
180  /* drop the read lock on the page, then acquire one on its child */
181  *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
182 
183  /* okay, all set to move down a level */
184  stack_in = new_stack;
185  }
186 
187  /*
188  * If we're asked to lock leaf in write mode, but didn't manage to, then
189  * relock. This should only happen when the root page is a leaf page (and
190  * the only page in the index other than the metapage).
191  */
192  if (access == BT_WRITE && page_access == BT_READ)
193  {
194  /* trade in our read lock for a write lock */
195  _bt_unlockbuf(rel, *bufP);
196  _bt_lockbuf(rel, *bufP, BT_WRITE);
197 
198  /*
199  * Race -- the leaf page may have split after we dropped the read lock
200  * but before we acquired a write lock. If it has, we may need to
201  * move right to its new sibling. Do that.
202  */
203  *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
204  }
205 
206  return stack_in;
207 }
BTStackData * BTStack
Definition: nbtree.h:739
static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access)
Definition: nbtsearch.c:241

References _bt_binsrch(), _bt_getroot(), _bt_lockbuf(), _bt_moveright(), _bt_relandgetbuf(), _bt_unlockbuf(), Assert, BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTreeTupleGetDownLink(), BTreeTupleIsPivot(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), sort-test::key, P_ISLEAF, PageGetItem(), PageGetItemId(), and palloc().

Referenced by _bt_first(), _bt_pagedel(), _bt_search_insert(), and bt_rootdescend().
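
A hedged sketch of a typical caller, loosely following the pattern of the functions listed above (rel, heaprel and itup are assumed; error handling omitted):

    BTScanInsert itup_key = _bt_mkscankey(rel, itup);  /* insertion-style scan key */
    Buffer      buf;
    BTStack     stack;

    stack = _bt_search(rel, heaprel, itup_key, &buf, BT_READ);
    /* buf is now read-locked on the leaf page itup_key belongs on
     * (check BufferIsValid(buf): an empty index yields no root in BT_READ mode) */

    /* ... _bt_binsrch()/_bt_readpage()-style work against the leaf ... */

    _bt_relbuf(rel, buf);       /* drop lock and pin */
    _bt_freestack(stack);       /* stack may be NULL for a single-page index */
    pfree(itup_key);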

◆ _bt_set_cleanup_info()

void _bt_set_cleanup_info ( Relation  rel,
BlockNumber  num_delpages 
)

Definition at line 232 of file nbtpage.c.

233 {
234  Buffer metabuf;
235  Page metapg;
236  BTMetaPageData *metad;
237 
238  /*
239  * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
240  * field started out as a TransactionId field called btm_oldest_btpo_xact.
241  * Both "versions" are just uint32 fields. It was convenient to repurpose
242  * the field when we began to use 64-bit XIDs in deleted pages.
243  *
244  * It's possible that a pg_upgrade'd database will contain an XID value in
245  * what is now recognized as the metapage's btm_last_cleanup_num_delpages
246  * field. _bt_vacuum_needs_cleanup() may even believe that this value
247  * indicates that there are lots of pages that it needs to recycle, when
248  * in reality there are only one or two. The worst that can happen is
249  * that there will be a call to btvacuumscan a little earlier, which will
250  * set btm_last_cleanup_num_delpages to a sane value when we're called.
251  *
252  * Note also that the metapage's btm_last_cleanup_num_heap_tuples field is
253  * no longer used as of PostgreSQL 14. We set it to -1.0 on rewrite, just
254  * to be consistent.
255  */
256  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
257  metapg = BufferGetPage(metabuf);
258  metad = BTPageGetMeta(metapg);
259 
260  /* Don't miss chance to upgrade index/metapage when BTREE_MIN_VERSION */
261  if (metad->btm_version >= BTREE_NOVAC_VERSION &&
262  metad->btm_last_cleanup_num_delpages == num_delpages)
263  {
264  /* Usually means index continues to have num_delpages of 0 */
265  _bt_relbuf(rel, metabuf);
266  return;
267  }
268 
269  /* trade in our read lock for a write lock */
270  _bt_unlockbuf(rel, metabuf);
271  _bt_lockbuf(rel, metabuf, BT_WRITE);
272 
273  START_CRIT_SECTION();
274 
275  /* upgrade meta-page if needed */
276  if (metad->btm_version < BTREE_NOVAC_VERSION)
277  _bt_upgrademetapage(metapg);
278 
279  /* update cleanup-related information */
280  metad->btm_last_cleanup_num_delpages = num_delpages;
281  metad->btm_last_cleanup_num_heap_tuples = -1.0;
282  MarkBufferDirty(metabuf);
283 
284  /* write wal record if needed */
285  if (RelationNeedsWAL(rel))
286  {
287  xl_btree_metadata md;
288  XLogRecPtr recptr;
289 
290  XLogBeginInsert();
291  XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
292 
293  Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
294  md.version = metad->btm_version;
295  md.root = metad->btm_root;
296  md.level = metad->btm_level;
297  md.fastroot = metad->btm_fastroot;
298  md.fastlevel = metad->btm_fastlevel;
299  md.last_cleanup_num_delpages = num_delpages;
300  md.allequalimage = metad->btm_allequalimage;
301 
302  XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));
303 
304  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
305 
306  PageSetLSN(metapg, recptr);
307  }
308 
309  END_CRIT_SECTION();
310 
311  _bt_relbuf(rel, metabuf);
312 }
#define XLOG_BTREE_META_CLEANUP
Definition: nbtxlog.h:41

References _bt_getbuf(), _bt_lockbuf(), _bt_relbuf(), _bt_unlockbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert, BT_READ, BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BufferGetPage(), END_CRIT_SECTION, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, xl_btree_metadata::last_cleanup_num_delpages, xl_btree_metadata::level, MarkBufferDirty(), PageSetLSN(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationNeedsWAL, xl_btree_metadata::root, START_CRIT_SECTION, xl_btree_metadata::version, XLOG_BTREE_META_CLEANUP, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), and XLogRegisterBuffer().

Referenced by btvacuumcleanup().

◆ _bt_start_array_keys()

void _bt_start_array_keys ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 1352 of file nbtutils.c.

1353 {
1354  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1355  int i;
1356 
1357  Assert(so->numArrayKeys);
1358  Assert(so->qual_ok);
1359 
1360  for (i = 0; i < so->numArrayKeys; i++)
1361  {
1362  BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
1363  ScanKey skey = &so->keyData[curArrayKey->scan_key];
1364 
1365  Assert(curArrayKey->num_elems > 0);
1366  Assert(skey->sk_flags & SK_SEARCHARRAY);
1367 
1368  if (ScanDirectionIsBackward(dir))
1369  curArrayKey->cur_elem = curArrayKey->num_elems - 1;
1370  else
1371  curArrayKey->cur_elem = 0;
1372  skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem];
1373  }
1374  so->scanBehind = so->oppositeDirCheck = false; /* reset */
1375 }

References BTScanOpaqueData::arrayKeys, Assert, BTArrayKeyInfo::cur_elem, BTArrayKeyInfo::elem_values, i, BTScanOpaqueData::keyData, BTArrayKeyInfo::num_elems, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, BTScanOpaqueData::qual_ok, BTArrayKeyInfo::scan_key, BTScanOpaqueData::scanBehind, ScanDirectionIsBackward, ScanKeyData::sk_argument, ScanKeyData::sk_flags, and SK_SEARCHARRAY.

Referenced by _bt_advance_array_keys_increment(), _bt_first(), and btrestrpos().

◆ _bt_start_prim_scan()

bool _bt_start_prim_scan ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 1682 of file nbtutils.c.

1683 {
1684  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1685 
1686  Assert(so->numArrayKeys);
1687 
1688  so->scanBehind = so->oppositeDirCheck = false; /* reset */
1689 
1690  /*
1691  * Array keys are advanced within _bt_checkkeys when the scan reaches the
1692  * leaf level (more precisely, they're advanced when the scan reaches the
1693  * end of each distinct set of array elements). This process avoids
1694  * repeat access to leaf pages (across multiple primitive index scans) by
1695  * advancing the scan's array keys when it allows the primitive index scan
1696  * to find nearby matching tuples (or when it eliminates ranges of array
1697  * key space that can't possibly be satisfied by any index tuple).
1698  *
1699  * _bt_checkkeys sets a simple flag variable to schedule another primitive
1700  * index scan. The flag tells us what to do.
1701  *
1702  * We cannot rely on _bt_first always reaching _bt_checkkeys. There are
1703  * various cases where that won't happen. For example, if the index is
1704  * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys.
1705  * We also don't expect a call to _bt_checkkeys during searches for a
1706  * non-existent value that happens to be lower/higher than any existing
1707  * value in the index.
1708  *
1709  * We don't require special handling for these cases -- we don't need to
1710  * be explicitly instructed to _not_ perform another primitive index scan.
1711  * It's up to code under the control of _bt_first to always set the flag
1712  * when another primitive index scan will be required.
1713  *
1714  * This works correctly, even with the tricky cases listed above, which
1715  * all involve access to leaf pages "near the boundaries of the key space"
1716  * (whether it's from a leftmost/rightmost page, or an imaginary empty
1717  * leaf root page). If _bt_checkkeys cannot be reached by a primitive
1718  * index scan for one set of array keys, then it also won't be reached for
1719  * any later set ("later" in terms of the direction that we scan the index
1720  * and advance the arrays). The array keys won't have advanced in these
1721  * cases, but that's the correct behavior (even _bt_advance_array_keys
1722  * won't always advance the arrays at the point they become "exhausted").
1723  */
1724  if (so->needPrimScan)
1725  {
1726  Assert(_bt_verify_arrays_bt_first(scan, dir));
1727 
1728  /*
1729  * Flag was set -- must call _bt_first again, which will reset the
1730  * scan's needPrimScan flag
1731  */
1732  return true;
1733  }
1734 
1735  /* The top-level index scan ran out of tuples in this scan direction */
1736  if (scan->parallel_scan != NULL)
1737  _bt_parallel_done(scan);
1738 
1739  return false;
1740 }

References _bt_parallel_done(), Assert, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, IndexScanDescData::parallel_scan, and BTScanOpaqueData::scanBehind.
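
A simplified sketch of the calling convention, roughly modeled on the btgettuple()-style loop that drives primitive index scans (scan, dir, so and res are assumed; details such as kill-tuple handling are omitted):

    do
    {
        if (!BTScanPosIsValid(so->currPos))
            res = _bt_first(scan, dir);     /* start another primitive scan */
        else
            res = _bt_next(scan, dir);

        if (res)
            break;                          /* got a tuple */

        /* ran out of tuples; see if the array keys schedule another scan */
    } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));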

◆ _bt_start_vacuum()

BTCycleId _bt_start_vacuum ( Relation  rel)

Definition at line 4429 of file nbtutils.c.

4430 {
4431  BTCycleId result;
4432  int i;
4433  BTOneVacInfo *vac;
4434 
4435  LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
4436 
4437  /*
4438  * Assign the next cycle ID, being careful to avoid zero as well as the
4439  * reserved high values.
4440  */
4441  result = ++(btvacinfo->cycle_ctr);
4442  if (result == 0 || result > MAX_BT_CYCLE_ID)
4443  result = btvacinfo->cycle_ctr = 1;
4444 
4445  /* Let's just make sure there's no entry already for this index */
4446  for (i = 0; i < btvacinfo->num_vacuums; i++)
4447  {
4448  vac = &btvacinfo->vacuums[i];
4449  if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
4450  vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
4451  {
4452  /*
4453  * Unlike most places in the backend, we have to explicitly
4454  * release our LWLock before throwing an error. This is because
4455  * we expect _bt_end_vacuum() to be called before transaction
4456  * abort cleanup can run to release LWLocks.
4457  */
4458  LWLockRelease(BtreeVacuumLock);
4459  elog(ERROR, "multiple active vacuums for index \"%s\"",
4460  RelationGetRelationName(rel));
4461  }
4462  }
4463 
4464  /* OK, add an entry */
4465  if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
4466  {
4467  LWLockRelease(BtreeVacuumLock);
4468  elog(ERROR, "out of btvacinfo slots");
4469  }
4470  vac = &btvacinfo->vacuums[btvacinfo->num_vacuums];
4471  vac->relid = rel->rd_lockInfo.lockRelId;
4472  vac->cycleid = result;
4473  btvacinfo->num_vacuums++;
4474 
4475  LWLockRelease(BtreeVacuumLock);
4476  return result;
4477 }
#define MAX_BT_CYCLE_ID
Definition: nbtree.h:93
uint16 BTCycleId
Definition: nbtree.h:29
BTCycleId cycleid
Definition: nbtutils.c:4371
BTCycleId cycle_ctr
Definition: nbtutils.c:4376
int max_vacuums
Definition: nbtutils.c:4378

References btvacinfo, BTVacInfo::cycle_ctr, BTOneVacInfo::cycleid, LockRelId::dbId, elog, ERROR, i, LockInfoData::lockRelId, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), MAX_BT_CYCLE_ID, BTVacInfo::max_vacuums, BTVacInfo::num_vacuums, RelationData::rd_lockInfo, RelationGetRelationName, BTOneVacInfo::relid, LockRelId::relId, and BTVacInfo::vacuums.

Referenced by btbulkdelete().
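
A hedged sketch of how btbulkdelete() pairs this with _bt_end_vacuum(), so the shared-memory entry is removed even on error (info, stats, callback and callback_state are assumed from the caller):

    BTCycleId   cycleid;

    /* establish the vacuum cycle ID to use for this scan */
    PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    {
        cycleid = _bt_start_vacuum(rel);
        btvacuumscan(info, stats, callback, callback_state, cycleid);
    }
    PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    _bt_end_vacuum(rel);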

◆ _bt_swap_posting()

IndexTuple _bt_swap_posting ( IndexTuple  newitem,
IndexTuple  oposting,
int  postingoff 
)

Definition at line 1022 of file nbtdedup.c.

1023 {
1024  int nhtids;
1025  char *replacepos;
1026  char *replaceposright;
1027  Size nmovebytes;
1028  IndexTuple nposting;
1029 
1030  nhtids = BTreeTupleGetNPosting(oposting);
1031  Assert(_bt_posting_valid(oposting));
1032 
1033  /*
1034  * The postingoff argument originated as a _bt_binsrch_posting() return
1035  * value. It will be 0 in the event of corruption that makes a leaf page
1036  * contain a non-pivot tuple that's somehow identical to newitem (no two
1037  * non-pivot tuples should ever have the same TID). This has been known
1038  * to happen in the field from time to time.
1039  *
1040  * Perform a basic sanity check to catch this case now.
1041  */
1042  if (!(postingoff > 0 && postingoff < nhtids))
1043  elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
1044  nhtids, postingoff);
1045 
1046  /*
1047  * Move item pointers in posting list to make a gap for the new item's
1048  * heap TID. We shift TIDs one place to the right, losing original
1049  * rightmost TID. (nmovebytes must not include TIDs to the left of
1050  * postingoff, nor the existing rightmost/max TID that gets overwritten.)
1051  */
1052  nposting = CopyIndexTuple(oposting);
1053  replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
1054  replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
1055  nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
1056  memmove(replaceposright, replacepos, nmovebytes);
1057 
1058  /* Fill the gap at postingoff with TID of new item (original new TID) */
1059  Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
1060  ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
1061 
1062  /* Now copy oposting's rightmost/max TID into new item (final new TID) */
1063  ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);
1064 
1065  Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
1066  BTreeTupleGetHeapTID(newitem)) < 0);
1067  Assert(_bt_posting_valid(nposting));
1068 
1069  return nposting;
1070 }

References Assert, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), CopyIndexTuple(), elog, ERROR, ItemPointerCompare(), ItemPointerCopy(), and IndexTupleData::t_tid.

Referenced by _bt_insertonpg(), btree_xlog_insert(), and btree_xlog_split().
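
A small illustrative example (values invented): if oposting holds heap TIDs (1,1), (1,3) and (1,5) and newitem's TID is (1,4), _bt_binsrch_posting() would report postingoff = 2. The returned nposting then holds (1,1), (1,3), (1,4), while newitem's TID is rewritten to the displaced maximum (1,5); the caller (for example _bt_insertonpg()) then replaces oposting with nposting and inserts newitem immediately to its right.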

◆ _bt_truncate()

IndexTuple _bt_truncate ( Relation  rel,
IndexTuple  lastleft,
IndexTuple  firstright,
BTScanInsert  itup_key 
)

Definition at line 4658 of file nbtutils.c.

4660 {
4661  TupleDesc itupdesc = RelationGetDescr(rel);
4662  int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
4663  int keepnatts;
4664  IndexTuple pivot;
4665  IndexTuple tidpivot;
4666  ItemPointer pivotheaptid;
4667  Size newsize;
4668 
4669  /*
4670  * We should only ever truncate non-pivot tuples from leaf pages. It's
4671  * never okay to truncate when splitting an internal page.
4672  */
4673  Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright));
4674 
4675  /* Determine how many attributes must be kept in truncated tuple */
4676  keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key);
4677 
4678 #ifdef DEBUG_NO_TRUNCATE
4679  /* Force truncation to be ineffective for testing purposes */
4680  keepnatts = nkeyatts + 1;
4681 #endif
4682 
4683  pivot = index_truncate_tuple(itupdesc, firstright,
4684  Min(keepnatts, nkeyatts));
4685 
4686  if (BTreeTupleIsPosting(pivot))
4687  {
4688  /*
4689  * index_truncate_tuple() just returns a straight copy of firstright
4690  * when it has no attributes to truncate. When that happens, we may
4691  * need to truncate away a posting list here instead.
4692  */
4693  Assert(keepnatts == nkeyatts || keepnatts == nkeyatts + 1);
4694  Assert(IndexRelationGetNumberOfAttributes(rel) == nkeyatts);
4695  pivot->t_info &= ~INDEX_SIZE_MASK;
4696  pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright));
4697  }
4698 
4699  /*
4700  * If there is a distinguishing key attribute within pivot tuple, we're
4701  * done
4702  */
4703  if (keepnatts <= nkeyatts)
4704  {
4705  BTreeTupleSetNAtts(pivot, keepnatts, false);
4706  return pivot;
4707  }
4708 
4709  /*
4710  * We have to store a heap TID in the new pivot tuple, since no non-TID
4711  * key attribute value in firstright distinguishes the right side of the
4712  * split from the left side. nbtree conceptualizes this case as an
4713  * inability to truncate away any key attributes, since heap TID is
4714  * treated as just another key attribute (despite lacking a pg_attribute
4715  * entry).
4716  *
4717  * Use enlarged space that holds a copy of pivot. We need the extra space
4718  * to store a heap TID at the end (using the special pivot tuple
4719  * representation). Note that the original pivot already has firstright's
4720  * possible posting list/non-key attribute values removed at this point.
4721  */
4722  newsize = MAXALIGN(IndexTupleSize(pivot)) + MAXALIGN(sizeof(ItemPointerData));
4723  tidpivot = palloc0(newsize);
4724  memcpy(tidpivot, pivot, MAXALIGN(IndexTupleSize(pivot)));
4725  /* Cannot leak memory here */
4726  pfree(pivot);
4727 
4728  /*
4729  * Store all of firstright's key attribute values plus a tiebreaker heap
4730  * TID value in enlarged pivot tuple
4731  */
4732  tidpivot->t_info &= ~INDEX_SIZE_MASK;
4733  tidpivot->t_info |= newsize;
4734  BTreeTupleSetNAtts(tidpivot, nkeyatts, true);
4735  pivotheaptid = BTreeTupleGetHeapTID(tidpivot);
4736 
4737  /*
4738  * Lehman & Yao use lastleft as the leaf high key in all cases, but don't
4739  * consider suffix truncation. It seems like a good idea to follow that
4740  * example in cases where no truncation takes place -- use lastleft's heap
4741  * TID. (This is also the closest value to negative infinity that's
4742  * legally usable.)
4743  */
4744  ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid);
4745 
4746  /*
4747  * We're done. Assert() that heap TID invariants hold before returning.
4748  *
4749  * Lehman and Yao require that the downlink to the right page, which is to
4750  * be inserted into the parent page in the second phase of a page split be
4751  * a strict lower bound on items on the right page, and a non-strict upper
4752  * bound for items on the left page. Assert that heap TIDs follow these
4753  * invariants, since a heap TID value is apparently needed as a
4754  * tiebreaker.
4755  */
4756 #ifndef DEBUG_NO_TRUNCATE
4757  Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft),
4758  BTreeTupleGetHeapTID(firstright)) < 0);
4759  Assert(ItemPointerCompare(pivotheaptid,
4760  BTreeTupleGetHeapTID(lastleft)) >= 0);
4761  Assert(ItemPointerCompare(pivotheaptid,
4762  BTreeTupleGetHeapTID(firstright)) < 0);
4763 #else
4764 
4765  /*
4766  * Those invariants aren't guaranteed to hold for lastleft + firstright
4767  * heap TID attribute values when they're considered here only because
4768  * DEBUG_NO_TRUNCATE is defined (a heap TID is probably not actually
4769  * needed as a tiebreaker). DEBUG_NO_TRUNCATE must therefore use a heap
4770  * TID value that always works as a strict lower bound for items to the
4771  * right. In particular, it must avoid using firstright's leading key
4772  * attribute values along with lastleft's heap TID value when lastleft's
4773  * TID happens to be greater than firstright's TID.
4774  */
4775  ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid);
4776 
4777  /*
4778  * Pivot heap TID should never be fully equal to firstright. Note that
4779  * the pivot heap TID will still end up equal to lastleft's heap TID when
4780  * that's the only usable value.
4781  */
4782  ItemPointerSetOffsetNumber(pivotheaptid,
4783  OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid)));
4784  Assert(ItemPointerCompare(pivotheaptid,
4785  BTreeTupleGetHeapTID(firstright)) < 0);
4786 #endif
4787 
4788  return tidpivot;
4789 }
IndexTuple index_truncate_tuple(TupleDesc sourceDescriptor, IndexTuple source, int leavenatts)
Definition: indextuple.c:576
static void ItemPointerSetOffsetNumber(ItemPointerData *pointer, OffsetNumber offsetNumber)
Definition: itemptr.h:158
static void BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid)
Definition: nbtree.h:595
static int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
Definition: nbtutils.c:4803

References _bt_keep_natts(), Assert, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTreeTupleSetNAtts(), INDEX_SIZE_MASK, index_truncate_tuple(), IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, IndexTupleSize, ItemPointerCompare(), ItemPointerCopy(), ItemPointerGetOffsetNumber(), ItemPointerSetOffsetNumber(), MAXALIGN, Min, OffsetNumberPrev, palloc0(), pfree(), RelationGetDescr, and IndexTupleData::t_info.

Referenced by _bt_buildadd(), and _bt_split().
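
A worked example (values invented): when splitting a leaf of an index on (last_name, first_name) where lastleft is ('Smith', 'Alice') and firstright is ('Stone', 'Bob'), _bt_keep_natts() returns 1 because the first attribute already distinguishes the two sides, so the new pivot is simply ('Stone') with a single untruncated attribute. If lastleft and firstright were equal on both attributes, keepnatts would come back as nkeyatts + 1 and the enlarged pivot would instead carry lastleft's maximum heap TID as the tiebreaker.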

◆ _bt_unlockbuf()

void _bt_unlockbuf ( Relation  rel,
Buffer  buf 
)

Definition at line 1070 of file nbtpage.c.

1071 {
1072  /*
1073  * Buffer is pinned and locked, which means that it is expected to be
1074  * defined and addressable. Check that proactively.
1075  */
1076  VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1077 
1078  /* LockBuffer() asserts that pin is held by this backend */
1079  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1080 
1081  if (!RelationUsesLocalBuffers(rel))
1082  VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
1083 }
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:189
#define VALGRIND_CHECK_MEM_IS_DEFINED(addr, size)
Definition: memdebug.h:23
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition: memdebug.h:27

References buf, BUFFER_LOCK_UNLOCK, BufferGetPage(), LockBuffer(), RelationUsesLocalBuffers, VALGRIND_CHECK_MEM_IS_DEFINED, and VALGRIND_MAKE_MEM_NOACCESS.

Referenced by _bt_drop_lock_and_maybe_pin(), _bt_getroot(), _bt_killitems(), _bt_moveright(), _bt_pagedel(), _bt_readfirstpage(), _bt_relandgetbuf(), _bt_relbuf(), _bt_search(), _bt_set_cleanup_info(), and _bt_unlink_halfdead_page().

◆ _bt_update_posting()

void _bt_update_posting ( BTVacuumPosting  vacposting)

Definition at line 924 of file nbtdedup.c.

925 {
926  IndexTuple origtuple = vacposting->itup;
927  uint32 keysize,
928  newsize;
929  IndexTuple itup;
930  int nhtids;
931  int ui,
932  d;
933  ItemPointer htids;
934 
935  nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
936 
937  Assert(_bt_posting_valid(origtuple));
938  Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
939 
940  /*
941  * Determine final size of new tuple.
942  *
943  * This calculation needs to match the code used within _bt_form_posting()
944  * for new posting list tuples. We avoid calling _bt_form_posting() here
945  * to save ourselves a second memory allocation for a htids workspace.
946  */
947  keysize = BTreeTupleGetPostingOffset(origtuple);
948  if (nhtids > 1)
949  newsize = MAXALIGN(keysize +
950  nhtids * sizeof(ItemPointerData));
951  else
952  newsize = keysize;
953 
954  Assert(newsize <= INDEX_SIZE_MASK);
955  Assert(newsize == MAXALIGN(newsize));
956 
957  /* Allocate memory using palloc0() (matches index_form_tuple()) */
958  itup = palloc0(newsize);
959  memcpy(itup, origtuple, keysize);
960  itup->t_info &= ~INDEX_SIZE_MASK;
961  itup->t_info |= newsize;
962 
963  if (nhtids > 1)
964  {
965  /* Form posting list tuple */
966  BTreeTupleSetPosting(itup, nhtids, keysize);
967  htids = BTreeTupleGetPosting(itup);
968  }
969  else
970  {
971  /* Form standard non-pivot tuple */
972  itup->t_info &= ~INDEX_ALT_TID_MASK;
973  htids = &itup->t_tid;
974  }
975 
976  ui = 0;
977  d = 0;
978  for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
979  {
980  if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
981  {
982  d++;
983  continue;
984  }
985  htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
986  }
987  Assert(ui == nhtids);
988  Assert(d == vacposting->ndeletedtids);
989  Assert(nhtids == 1 || _bt_posting_valid(itup));
990  Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));
991 
992  /* vacposting arg's itup will now point to updated version */
993  vacposting->itup = itup;
994 }

References Assert, BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleGetPostingN(), BTreeTupleGetPostingOffset(), BTreeTupleSetPosting(), BTVacuumPostingData::deletetids, i, INDEX_ALT_TID_MASK, INDEX_SIZE_MASK, ItemPointerIsValid(), BTVacuumPostingData::itup, MAXALIGN, BTVacuumPostingData::ndeletedtids, palloc0(), IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_delitems_update(), and btree_xlog_updates().
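
For example (illustrative only): if origtuple's posting list holds four heap TIDs at positions 0..3 and vacposting->deletetids[] contains {1, 3}, the rebuilt tuple keeps only the TIDs at positions 0 and 2. With two or more survivors the result is still a posting list tuple; with exactly one survivor it is rebuilt as a plain non-pivot tuple whose t_tid holds the single remaining TID.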

◆ _bt_upgradelockbufcleanup()

void _bt_upgradelockbufcleanup ( Relation  rel,
Buffer  buf 
)

Definition at line 1109 of file nbtpage.c.

1110 {
1111  /*
1112  * Buffer is pinned and locked, which means that it is expected to be
1113  * defined and addressable. Check that proactively.
1114  */
1115  VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1116 
1117  /* LockBuffer() asserts that pin is held by this backend */
1118  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1119  LockBufferForCleanup(buf);
1120 }
void LockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:5238

References buf, BUFFER_LOCK_UNLOCK, BufferGetPage(), LockBuffer(), LockBufferForCleanup(), and VALGRIND_CHECK_MEM_IS_DEFINED.

Referenced by btvacuumpage().

◆ _bt_upgrademetapage()

void _bt_upgrademetapage ( Page  page)

Definition at line 107 of file nbtpage.c.

108 {
109  BTMetaPageData *metad;
110  BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;
111 
112  metad = BTPageGetMeta(page);
113  metaopaque = BTPageGetOpaque(page);
114 
115  /* It must be really a meta page of upgradable version */
116  Assert(metaopaque->btpo_flags & BTP_META);
117  Assert(metad->btm_version >= BTREE_MIN_VERSION);
118  Assert(metad->btm_version < BTREE_NOVAC_VERSION);
119 
120  /* Set version number and fill extra fields added into version 3 */
121  metad->btm_version = BTREE_NOVAC_VERSION;
122  metad->btm_last_cleanup_num_delpages = 0;
123  metad->btm_last_cleanup_num_heap_tuples = -1.0;
124  /* Only a REINDEX can set this field */
125  Assert(!metad->btm_allequalimage);
126  metad->btm_allequalimage = false;
127 
128  /* Adjust pd_lower (see _bt_initmetapage() for details) */
129  ((PageHeader) page)->pd_lower =
130  ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
131 }

References Assert, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_version, BTP_META, BTPageGetMeta, BTPageGetOpaque, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, and PG_USED_FOR_ASSERTS_ONLY.

Referenced by _bt_getroot(), _bt_insertonpg(), _bt_newlevel(), _bt_set_cleanup_info(), and _bt_unlink_halfdead_page().

◆ _bt_vacuum_cycleid()

BTCycleId _bt_vacuum_cycleid ( Relation  rel)

Definition at line 4395 of file nbtutils.c.

4396 {
4397  BTCycleId result = 0;
4398  int i;
4399 
4400  /* Share lock is enough since this is a read-only operation */
4401  LWLockAcquire(BtreeVacuumLock, LW_SHARED);
4402 
4403  for (i = 0; i < btvacinfo->num_vacuums; i++)
4404  {
4405  BTOneVacInfo *vac = &btvacinfo->vacuums[i];
4406 
4407  if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
4408  vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
4409  {
4410  result = vac->cycleid;
4411  break;
4412  }
4413  }
4414 
4415  LWLockRelease(BtreeVacuumLock);
4416  return result;
4417 }
@ LW_SHARED
Definition: lwlock.h:115

References btvacinfo, BTOneVacInfo::cycleid, LockRelId::dbId, i, LockInfoData::lockRelId, LW_SHARED, LWLockAcquire(), LWLockRelease(), BTVacInfo::num_vacuums, RelationData::rd_lockInfo, BTOneVacInfo::relid, LockRelId::relId, and BTVacInfo::vacuums.

Referenced by _bt_split().

◆ _bt_vacuum_needs_cleanup()

bool _bt_vacuum_needs_cleanup ( Relation  rel)

Definition at line 179 of file nbtpage.c.

180 {
181  Buffer metabuf;
182  Page metapg;
183  BTMetaPageData *metad;
184  uint32 btm_version;
185  BlockNumber prev_num_delpages;
186 
187  /*
188  * Copy details from metapage to local variables quickly.
189  *
190  * Note that we deliberately avoid using cached version of metapage here.
191  */
192  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
193  metapg = BufferGetPage(metabuf);
194  metad = BTPageGetMeta(metapg);
195  btm_version = metad->btm_version;
196 
197  if (btm_version < BTREE_NOVAC_VERSION)
198  {
199  /*
200  * Metapage needs to be dynamically upgraded to store fields that are
201  * only present when btm_version >= BTREE_NOVAC_VERSION
202  */
203  _bt_relbuf(rel, metabuf);
204  return true;
205  }
206 
207  prev_num_delpages = metad->btm_last_cleanup_num_delpages;
208  _bt_relbuf(rel, metabuf);
209 
210  /*
211  * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
212  * total size of the index. We can reasonably expect (though are not
213  * guaranteed) to be able to recycle this many pages if we decide to do a
214  * btvacuumscan call during the ongoing btvacuumcleanup. For further
215  * details see the nbtree/README section on placing deleted pages in the
216  * FSM.
217  */
218  if (prev_num_delpages > 0 &&
219  prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20)
220  return true;
221 
222  return false;
223 }
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:273

References _bt_getbuf(), _bt_relbuf(), BT_READ, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_version, BTPageGetMeta, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BufferGetPage(), and RelationGetNumberOfBlocks.

Referenced by btvacuumcleanup().
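
For example (numbers invented): with a 10,000-block index the threshold is 10,000 / 20 = 500 pages, so a btm_last_cleanup_num_delpages of 600 left behind by the previous VACUUM triggers a cleanup-only btvacuumscan(), while a value of 300 does not.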

◆ btadjustmembers()

void btadjustmembers ( Oid  opfamilyoid,
Oid  opclassoid,
List operators,
List functions 
)

Definition at line 293 of file nbtvalidate.c.

297 {
298  Oid opcintype;
299  ListCell *lc;
300 
301  /*
302  * Btree operators and comparison support functions are always "loose"
303  * members of the opfamily if they are cross-type. If they are not
304  * cross-type, we prefer to tie them to the appropriate opclass ... but if
305  * the user hasn't created one, we can't do that, and must fall back to
306  * using the opfamily dependency. (We mustn't force creation of an
307  * opclass in such a case, as leaving an incomplete opclass lying around
308  * would be bad. Throwing an error is another undesirable alternative.)
309  *
310  * This behavior results in a bit of a dump/reload hazard, in that the
311  * order of restoring objects could affect what dependencies we end up
312  * with. pg_dump's existing behavior will preserve the dependency choices
313  * in most cases, but not if a cross-type operator has been bound tightly
314  * into an opclass. That's a mistake anyway, so silently "fixing" it
315  * isn't awful.
316  *
317  * Optional support functions are always "loose" family members.
318  *
319  * To avoid repeated lookups, we remember the most recently used opclass's
320  * input type.
321  */
322  if (OidIsValid(opclassoid))
323  {
324  /* During CREATE OPERATOR CLASS, need CCI to see the pg_opclass row */
325  CommandCounterIncrement();
326  opcintype = get_opclass_input_type(opclassoid);
327  }
328  else
329  opcintype = InvalidOid;
330 
331  /*
332  * We handle operators and support functions almost identically, so rather
333  * than duplicate this code block, just join the lists.
334  */
335  foreach(lc, list_concat_copy(operators, functions))
336  {
337  OpFamilyMember *op = (OpFamilyMember *) lfirst(lc);
338 
339  if (op->is_func && op->number != BTORDER_PROC)
340  {
341  /* Optional support proc, so always a soft family dependency */
342  op->ref_is_hard = false;
343  op->ref_is_family = true;
344  op->refobjid = opfamilyoid;
345  }
346  else if (op->lefttype != op->righttype)
347  {
348  /* Cross-type, so always a soft family dependency */
349  op->ref_is_hard = false;
350  op->ref_is_family = true;
351  op->refobjid = opfamilyoid;
352  }
353  else
354  {
355  /* Not cross-type; is there a suitable opclass? */
356  if (op->lefttype != opcintype)
357  {
358  /* Avoid repeating this expensive lookup, even if it fails */
359  opcintype = op->lefttype;
360  opclassoid = opclass_for_family_datatype(BTREE_AM_OID,
361  opfamilyoid,
362  opcintype);
363  }
364  if (OidIsValid(opclassoid))
365  {
366  /* Hard dependency on opclass */
367  op->ref_is_hard = true;
368  op->ref_is_family = false;
369  op->refobjid = opclassoid;
370  }
371  else
372  {
373  /* We're stuck, so make a soft dependency on the opfamily */
374  op->ref_is_hard = false;
375  op->ref_is_family = true;
376  op->refobjid = opfamilyoid;
377  }
378  }
379  }
380 }
Oid opclass_for_family_datatype(Oid amoid, Oid opfamilyoid, Oid datatypeoid)
Definition: amvalidate.c:236
List * list_concat_copy(const List *list1, const List *list2)
Definition: list.c:598
Oid get_opclass_input_type(Oid opclass)
Definition: lsyscache.c:1212
#define lfirst(lc)
Definition: pg_list.h:172
Oid refobjid
Definition: amapi.h:90
Oid lefttype
Definition: amapi.h:85
bool ref_is_family
Definition: amapi.h:89
Oid righttype
Definition: amapi.h:86
int number
Definition: amapi.h:84
bool is_func
Definition: amapi.h:82
bool ref_is_hard
Definition: amapi.h:88
void CommandCounterIncrement(void)
Definition: xact.c:1099

References BTORDER_PROC, CommandCounterIncrement(), functions, get_opclass_input_type(), InvalidOid, OpFamilyMember::is_func, OpFamilyMember::lefttype, lfirst, list_concat_copy(), OpFamilyMember::number, OidIsValid, opclass_for_family_datatype(), OpFamilyMember::ref_is_family, OpFamilyMember::ref_is_hard, OpFamilyMember::refobjid, and OpFamilyMember::righttype.

Referenced by bthandler().

◆ btbeginscan()

IndexScanDesc btbeginscan ( Relation  rel,
int  nkeys,
int  norderbys 
)

Definition at line 312 of file nbtree.c.

313 {
314  IndexScanDesc scan;
315  BTScanOpaque so;
316 
317  /* no order by operators allowed */
318  Assert(norderbys == 0);
319 
320  /* get the scan */
321  scan = RelationGetIndexScan(rel, nkeys, norderbys);
322 
323  /* allocate private workspace */
324  so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
325  BTScanPosInvalidate(so->currPos);
326  BTScanPosInvalidate(so->markPos);
327  if (scan->numberOfKeys > 0)
328  so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
329  else
330  so->keyData = NULL;
331 
332  so->needPrimScan = false;
333  so->scanBehind = false;
334  so->oppositeDirCheck = false;
335  so->arrayKeys = NULL;
336  so->orderProcs = NULL;
337  so->arrayContext = NULL;
338 
339  so->killedItems = NULL; /* until needed */
340  so->numKilled = 0;
341 
342  /*
343  * We don't know yet whether the scan will be index-only, so we do not
344  * allocate the tuple workspace arrays until btrescan. However, we set up
345  * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
346  */
347  so->currTuples = so->markTuples = NULL;
348 
349  scan->xs_itupdesc = RelationGetDescr(rel);
350 
351  scan->opaque = so;
352 
353  return scan;
354 }
IndexScanDesc RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
Definition: genam.c:80
char * markTuples
Definition: nbtree.h:1057
char * currTuples
Definition: nbtree.h:1056
BTScanPosData markPos
Definition: nbtree.h:1070
struct TupleDescData * xs_itupdesc
Definition: relscan.h:166

References BTScanOpaqueData::arrayContext, BTScanOpaqueData::arrayKeys, Assert, BTScanPosInvalidate, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanOpaqueData::keyData, BTScanOpaqueData::killedItems, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanOpaqueData::needPrimScan, IndexScanDescData::numberOfKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, BTScanOpaqueData::orderProcs, palloc(), RelationGetDescr, RelationGetIndexScan(), BTScanOpaqueData::scanBehind, and IndexScanDescData::xs_itupdesc.

Referenced by bthandler().

◆ btbuild()

IndexBuildResult* btbuild ( Relation  heap,
Relation  index,
struct IndexInfo indexInfo 
)

Definition at line 295 of file nbtsort.c.

296 {
297  IndexBuildResult *result;
298  BTBuildState buildstate;
299  double reltuples;
300 
301 #ifdef BTREE_BUILD_STATS
302  if (log_btree_build_stats)
303  ResetUsage();
304 #endif /* BTREE_BUILD_STATS */
305 
306  buildstate.isunique = indexInfo->ii_Unique;
307  buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
308  buildstate.havedead = false;
309  buildstate.heap = heap;
310  buildstate.spool = NULL;
311  buildstate.spool2 = NULL;
312  buildstate.indtuples = 0;
313  buildstate.btleader = NULL;
314 
315  /*
316  * We expect to be called exactly once for any index relation. If that's
317  * not the case, big trouble's what we have.
318  */
319  if (RelationGetNumberOfBlocks(index) != 0)
320  elog(ERROR, "index \"%s\" already contains data",
321  RelationGetRelationName(index));
322 
323  reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
324 
325  /*
326  * Finish the build by (1) completing the sort of the spool file, (2)
327  * inserting the sorted tuples into btree pages and (3) building the upper
328  * levels. Finally, it may also be necessary to end use of parallelism.
329  */
330  _bt_leafbuild(buildstate.spool, buildstate.spool2);
331  _bt_spooldestroy(buildstate.spool);
332  if (buildstate.spool2)
333  _bt_spooldestroy(buildstate.spool2);
334  if (buildstate.btleader)
335  _bt_end_parallel(buildstate.btleader);
336 
337  result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
338 
339  result->heap_tuples = reltuples;
340  result->index_tuples = buildstate.indtuples;
341 
342 #ifdef BTREE_BUILD_STATS
343  if (log_btree_build_stats)
344  {
345  ShowUsage("BTREE BUILD STATS");
346  ResetUsage();
347  }
348 #endif /* BTREE_BUILD_STATS */
349 
350  return result;
351 }
static void _bt_end_parallel(BTLeader *btleader)
Definition: nbtsort.c:1610
static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
Definition: nbtsort.c:538
static double _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, IndexInfo *indexInfo)
Definition: nbtsort.c:365
static void _bt_spooldestroy(BTSpool *btspool)
Definition: nbtsort.c:517
bool isunique
Definition: nbtsort.c:206
BTSpool * spool
Definition: nbtsort.c:210
BTLeader * btleader
Definition: nbtsort.c:224
bool nulls_not_distinct
Definition: nbtsort.c:207
bool havedead
Definition: nbtsort.c:208
Relation heap
Definition: nbtsort.c:209
BTSpool * spool2
Definition: nbtsort.c:216
double indtuples
Definition: nbtsort.c:217
double heap_tuples
Definition: genam.h:32
double index_tuples
Definition: genam.h:33
bool ii_Unique
Definition: execnodes.h:199
bool ii_NullsNotDistinct
Definition: execnodes.h:200
Definition: type.h:95

References _bt_end_parallel(), _bt_leafbuild(), _bt_spooldestroy(), _bt_spools_heapscan(), BTBuildState::btleader, elog, ERROR, BTBuildState::havedead, BTBuildState::heap, IndexBuildResult::heap_tuples, IndexInfo::ii_NullsNotDistinct, IndexInfo::ii_Unique, IndexBuildResult::index_tuples, BTBuildState::indtuples, BTBuildState::isunique, log_btree_build_stats, BTBuildState::nulls_not_distinct, palloc(), RelationGetNumberOfBlocks, RelationGetRelationName, ResetUsage(), ShowUsage(), BTBuildState::spool, and BTBuildState::spool2.

Referenced by bthandler().

◆ btbuildempty()

void btbuildempty ( Relation  index)

Definition at line 159 of file nbtree.c.

160 {
161  bool allequalimage = _bt_allequalimage(index, false);
162  BulkWriteState *bulkstate;
163  BulkWriteBuffer metabuf;
164 
165  bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
166 
167  /* Construct metapage. */
168  metabuf = smgr_bulk_get_buf(bulkstate);
169  _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
170  smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
171 
172  smgr_bulk_finish(bulkstate);
173 }
void smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std)
Definition: bulk_write.c:323
BulkWriteBuffer smgr_bulk_get_buf(BulkWriteState *bulkstate)
Definition: bulk_write.c:347
void smgr_bulk_finish(BulkWriteState *bulkstate)
Definition: bulk_write.c:130
BulkWriteState * smgr_bulk_start_rel(Relation rel, ForkNumber forknum)
Definition: bulk_write.c:87
void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
Definition: nbtpage.c:67
bool _bt_allequalimage(Relation rel, bool debugmessage)
Definition: nbtutils.c:5142
@ INIT_FORKNUM
Definition: relpath.h:61

References _bt_allequalimage(), _bt_initmetapage(), BTREE_METAPAGE, INIT_FORKNUM, P_NONE, smgr_bulk_finish(), smgr_bulk_get_buf(), smgr_bulk_start_rel(), and smgr_bulk_write().

Referenced by bthandler().

◆ btbuildphasename()

char* btbuildphasename ( int64  phasenum)

Definition at line 4610 of file nbtutils.c.

4611 {
4612  switch (phasenum)
4613  {
4614  case PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE:
4615  return "initializing";
4616  case PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN:
4617  return "scanning table";
4618  case PROGRESS_BTREE_PHASE_PERFORMSORT_1:
4619  return "sorting live tuples";
4620  case PROGRESS_BTREE_PHASE_PERFORMSORT_2:
4621  return "sorting dead tuples";
4622  case PROGRESS_BTREE_PHASE_LEAF_LOAD:
4623  return "loading tuples in tree";
4624  default:
4625  return NULL;
4626  }
4627 }
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2
Definition: nbtree.h:1149
#define PROGRESS_BTREE_PHASE_LEAF_LOAD
Definition: nbtree.h:1150
#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN
Definition: nbtree.h:1147
#define PROGRESS_BTREE_PHASE_PERFORMSORT_1
Definition: nbtree.h:1148
#define PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE
Definition: progress.h:107

References PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN, PROGRESS_BTREE_PHASE_LEAF_LOAD, PROGRESS_BTREE_PHASE_PERFORMSORT_1, PROGRESS_BTREE_PHASE_PERFORMSORT_2, and PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE.

Referenced by bthandler().
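
The phase numbers decoded here are the values that the btree build code reports through the backend progress facility; sessions see the translated names in the pg_stat_progress_create_index view. Below is a minimal, hedged sketch of the reporting side, using only the generic pgstat_progress_update_param() API and the PROGRESS_* constants shown above (the corresponding calls in core live in nbtsort.c; report_tablescan_subphase() is an illustrative name, not a PostgreSQL function):

#include "postgres.h"
#include "access/nbtree.h"
#include "commands/progress.h"
#include "pgstat.h"

/*
 * Sketch only: announce that an index build has entered its heap-scan
 * subphase.  btbuildphasename() maps this constant back to the string
 * "scanning table" when a progress view is queried.
 */
static void
report_tablescan_subphase(void)
{
    pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
                                 PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
}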

◆ btbulkdelete()

IndexBulkDeleteResult* btbulkdelete ( IndexVacuumInfo info,
IndexBulkDeleteResult stats,
IndexBulkDeleteCallback  callback,
void *  callback_state 
)

Definition at line 862 of file nbtree.c.

864 {
865  Relation rel = info->index;
866  BTCycleId cycleid;
867 
868  /* allocate stats if first time through, else re-use existing struct */
869  if (stats == NULL)
870  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
871 
872  /* Establish the vacuum cycle ID to use for this scan */
873  /* The ENSURE stuff ensures we clean up shared memory on failure */
874  PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
875  {
876  cycleid = _bt_start_vacuum(rel);
877 
878  btvacuumscan(info, stats, callback, callback_state, cycleid);
879  }
880  PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
881  _bt_end_vacuum(rel);
882 
883  return stats;
884 }
#define PG_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition: ipc.h:47
#define PG_END_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition: ipc.h:52
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid)
Definition: nbtree.c:980
void _bt_end_vacuum_callback(int code, Datum arg)
Definition: nbtutils.c:4514
BTCycleId _bt_start_vacuum(Relation rel)
Definition: nbtutils.c:4429
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:322
Relation index
Definition: genam.h:46

References _bt_end_vacuum(), _bt_end_vacuum_callback(), _bt_start_vacuum(), btvacuumscan(), callback(), IndexVacuumInfo::index, palloc0(), PG_END_ENSURE_ERROR_CLEANUP, PG_ENSURE_ERROR_CLEANUP, and PointerGetDatum().

Referenced by bthandler().
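
The PG_ENSURE_ERROR_CLEANUP / PG_END_ENSURE_ERROR_CLEANUP pair above guarantees that _bt_end_vacuum() runs even if btvacuumscan() throws an error partway through. A minimal sketch of that general pattern follows; my_cleanup() and guarded_work() are hypothetical names used only for illustration, and the callback signature (int code, Datum arg) is what the macros expect:

#include "postgres.h"
#include "storage/ipc.h"
#include "utils/rel.h"

/* Hypothetical cleanup callback, in the style of _bt_end_vacuum_callback() */
static void
my_cleanup(int code, Datum arg)
{
    Relation    rel = (Relation) DatumGetPointer(arg);

    /* release whatever shared state was established for rel */
}

static void
guarded_work(Relation rel)
{
    /* arrange for my_cleanup() to run if the block below errors out */
    PG_ENSURE_ERROR_CLEANUP(my_cleanup, PointerGetDatum(rel));
    {
        /* ... work that may elog(ERROR) ... */
    }
    PG_END_ENSURE_ERROR_CLEANUP(my_cleanup, PointerGetDatum(rel));

    /* on the normal path, clean up explicitly, as btbulkdelete does */
    my_cleanup(0, PointerGetDatum(rel));
}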

◆ btcanreturn()

bool btcanreturn ( Relation  index,
int  attno 
)

Definition at line 1498 of file nbtree.c.

1499 {
1500  return true;
1501 }

Referenced by bthandler().

◆ btendscan()

void btendscan ( IndexScanDesc  scan)

Definition at line 417 of file nbtree.c.

418 {
419  BTScanOpaque so = (BTScanOpaque) scan->opaque;
420 
421  /* we aren't holding any read locks, but gotta drop the pins */
422  if (BTScanPosIsValid(so->currPos))
423  {
424  /* Before leaving current page, deal with any killed items */
425  if (so->numKilled > 0)
426  _bt_killitems(scan);
427  BTScanPosUnpinIfPinned(so->currPos);
428  }
429 
430  so->markItemIndex = -1;
431  BTScanPosUnpinIfPinned(so->markPos);
432 
433  /* No need to invalidate positions, the RAM is about to be freed. */
434 
435  /* Release storage */
436  if (so->keyData != NULL)
437  pfree(so->keyData);
438  /* so->arrayKeys and so->orderProcs are in arrayContext */
439  if (so->arrayContext != NULL)
440  MemoryContextDelete(so->arrayContext);
441  if (so->killedItems != NULL)
442  pfree(so->killedItems);
443  if (so->currTuples != NULL)
444  pfree(so->currTuples);
445  /* so->markTuples should not be pfree'd, see btrescan */
446  pfree(so);
447 }
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:454
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:1004
void _bt_killitems(IndexScanDesc scan)
Definition: nbtutils.c:4178

References _bt_killitems(), BTScanOpaqueData::arrayContext, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, if(), BTScanOpaqueData::keyData, BTScanOpaqueData::killedItems, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, MemoryContextDelete(), BTScanOpaqueData::numKilled, IndexScanDescData::opaque, and pfree().

Referenced by bthandler().

◆ btestimateparallelscan()

Size btestimateparallelscan ( int  nkeys,
int  norderbys 
)

Definition at line 537 of file nbtree.c.

538 {
539  /* Pessimistically assume all input scankeys will be output with arrays */
540  return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys;
541 }

References BTParallelScanDescData::btps_arrElems.

Referenced by bthandler().

◆ btgetbitmap()

int64 btgetbitmap ( IndexScanDesc  scan,
TIDBitmap tbm 
)

Definition at line 266 of file nbtree.c.

267 {
268  BTScanOpaque so = (BTScanOpaque) scan->opaque;
269  int64 ntids = 0;
270  ItemPointer heapTid;
271 
272  /* Each loop iteration performs another primitive index scan */
273  do
274  {
275  /* Fetch the first page & tuple */
276  if (_bt_first(scan, ForwardScanDirection))
277  {
278  /* Save tuple ID, and continue scanning */
279  heapTid = &scan->xs_heaptid;
280  tbm_add_tuples(tbm, heapTid, 1, false);
281  ntids++;
282 
283  for (;;)
284  {
285  /*
286  * Advance to next tuple within page. This is the same as the
287  * easy case in _bt_next().
288  */
289  if (++so->currPos.itemIndex > so->currPos.lastItem)
290  {
291  /* let _bt_next do the heavy lifting */
292  if (!_bt_next(scan, ForwardScanDirection))
293  break;
294  }
295 
296  /* Save tuple ID, and continue scanning */
297  heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
298  tbm_add_tuples(tbm, heapTid, 1, false);
299  ntids++;
300  }
301  }
302  /* Now see if we need another primitive index scan */
303  } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));
304 
305  return ntids;
306 }
bool _bt_first(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:882
bool _bt_next(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1461
bool _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir)
Definition: nbtutils.c:1682
@ ForwardScanDirection
Definition: sdir.h:28
ItemPointerData xs_heaptid
Definition: relscan.h:170
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, bool recheck)
Definition: tidbitmap.c:377

References _bt_first(), _bt_next(), BTScanOpaqueData::currPos, ForwardScanDirection, BTScanPosItem::heapTid, if(), BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, IndexScanDescData::opaque, tbm_add_tuples(), and IndexScanDescData::xs_heaptid.

Referenced by bthandler().

◆ btgettreeheight()

int btgettreeheight ( Relation  rel)

Definition at line 1507 of file nbtree.c.

1508 {
1509  return _bt_getrootheight(rel);
1510 }
int _bt_getrootheight(Relation rel)
Definition: nbtpage.c:675

References _bt_getrootheight().

Referenced by bthandler().

◆ btgettuple()

bool btgettuple ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 206 of file nbtree.c.

207 {
208  BTScanOpaque so = (BTScanOpaque) scan->opaque;
209  bool res;
210 
211  /* btree indexes are never lossy */
212  scan->xs_recheck = false;
213 
214  /* Each loop iteration performs another primitive index scan */
215  do
216  {
217  /*
218  * If we've already initialized this scan, we can just advance it in
219  * the appropriate direction. If we haven't done so yet, we call
220  * _bt_first() to get the first item in the scan.
221  */
222  if (!BTScanPosIsValid(so->currPos))
223  res = _bt_first(scan, dir);
224  else
225  {
226  /*
227  * Check to see if we should kill the previously-fetched tuple.
228  */
229  if (scan->kill_prior_tuple)
230  {
231  /*
232  * Yes, remember it for later. (We'll deal with all such
233  * tuples at once right before leaving the index page.) The
234  * test for numKilled overrun is not just paranoia: if the
235  * caller reverses direction in the indexscan then the same
236  * item might get entered multiple times. It's not worth
237  * trying to optimize that, so we don't detect it, but instead
238  * just forget any excess entries.
239  */
240  if (so->killedItems == NULL)
241  so->killedItems = (int *)
242  palloc(MaxTIDsPerBTreePage * sizeof(int));
243  if (so->numKilled < MaxTIDsPerBTreePage)
244  so->killedItems[so->numKilled++] = so->currPos.itemIndex;
245  }
246 
247  /*
248  * Now continue the scan.
249  */
250  res = _bt_next(scan, dir);
251  }
252 
253  /* If we have a tuple, return it ... */
254  if (res)
255  break;
256  /* ... otherwise see if we need another primitive index scan */
257  } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
258 
259  return res;
260 }
bool kill_prior_tuple
Definition: relscan.h:151

References _bt_first(), _bt_next(), BTScanPosIsValid, BTScanOpaqueData::currPos, if(), BTScanPosData::itemIndex, IndexScanDescData::kill_prior_tuple, BTScanOpaqueData::killedItems, MaxTIDsPerBTreePage, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, palloc(), res, and IndexScanDescData::xs_recheck.

Referenced by bthandler().
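
btgettuple() is never called directly; it is installed as the amgettuple callback by bthandler() and reached through the generic index-scan layer. A hedged sketch of that calling pattern is shown below, assuming the long-standing five-argument index_beginscan() signature and an int4 equality qual on the first index column (scan_for_42() is an illustrative name only):

#include "postgres.h"
#include "access/genam.h"
#include "access/sdir.h"
#include "access/skey.h"
#include "access/stratnum.h"
#include "utils/fmgroids.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"

/*
 * Sketch only: scan indexRel for entries whose first key column = 42.
 * For a btree index, each index_getnext_tid() call lands in btgettuple().
 */
static void
scan_for_42(Relation heapRel, Relation indexRel)
{
    ScanKeyData skey;
    IndexScanDesc scan;
    ItemPointer tid;

    ScanKeyInit(&skey,
                1,                      /* first index column */
                BTEqualStrategyNumber,  /* "=" strategy */
                F_INT4EQ,               /* int4 equality proc */
                Int32GetDatum(42));

    scan = index_beginscan(heapRel, indexRel, GetActiveSnapshot(), 1, 0);
    index_rescan(scan, &skey, 1, NULL, 0);

    while ((tid = index_getnext_tid(scan, ForwardScanDirection)) != NULL)
    {
        /* ... fetch the heap tuple for tid, or use an index-only path ... */
    }

    index_endscan(scan);
}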

◆ btinitparallelscan()

void btinitparallelscan ( void *  target)

◆ btinsert()

bool btinsert ( Relation  rel,
Datum values,
bool isnull,
ItemPointer  ht_ctid,
Relation  heapRel,
IndexUniqueCheck  checkUnique,
bool  indexUnchanged,
struct IndexInfo indexInfo 
)

Definition at line 182 of file nbtree.c.

187 {
188  bool result;
189  IndexTuple itup;
190 
191  /* generate an index tuple */
192  itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
193  itup->t_tid = *ht_ctid;
194 
195  result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
196 
197  pfree(itup);
198 
199  return result;
200 }
static Datum values[MAXATTR]
Definition: bootstrap.c:151
IndexTuple index_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
Definition: indextuple.c:44
bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
Definition: nbtinsert.c:102

References _bt_doinsert(), index_form_tuple(), pfree(), RelationGetDescr, IndexTupleData::t_tid, and values.

Referenced by bthandler().

◆ btmarkpos()

void btmarkpos ( IndexScanDesc  scan)

Definition at line 453 of file nbtree.c.

454 {
455  BTScanOpaque so = (BTScanOpaque) scan->opaque;
456 
457  /* There may be an old mark with a pin (but no lock). */
458  BTScanPosUnpinIfPinned(so->markPos);
459 
460  /*
461  * Just record the current itemIndex. If we later step to next page
462  * before releasing the marked position, _bt_steppage makes a full copy of
463  * the currPos struct in markPos. If (as often happens) the mark is moved
464  * before we leave the page, we don't have to do that work.
465  */
466  if (BTScanPosIsValid(so->currPos))
467  so->markItemIndex = so->currPos.itemIndex;
468  else
469  {
470  BTScanPosInvalidate(so->markPos);
471  so->markItemIndex = -1;
472  }
473 }

References BTScanPosInvalidate, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanOpaqueData::currPos, BTScanPosData::itemIndex, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, and IndexScanDescData::opaque.

Referenced by bthandler().

◆ btoptions()

bytea* btoptions ( Datum  reloptions,
bool  validate 
)

Definition at line 4564 of file nbtutils.c.

4565 {
4566  static const relopt_parse_elt tab[] = {
4567  {"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)},
4568  {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
4569  offsetof(BTOptions, vacuum_cleanup_index_scale_factor)},
4570  {"deduplicate_items", RELOPT_TYPE_BOOL,
4571  offsetof(BTOptions, deduplicate_items)}
4572  };
4573 
4574  return (bytea *) build_reloptions(reloptions, validate,
4575  RELOPT_KIND_BTREE,
4576  sizeof(BTOptions),
4577  tab, lengthof(tab));
4578 }
#define lengthof(array)
Definition: c.h:793
void * build_reloptions(Datum reloptions, bool validate, relopt_kind kind, Size relopt_struct_size, const relopt_parse_elt *relopt_elems, int num_relopt_elems)
Definition: reloptions.c:1908
@ RELOPT_KIND_BTREE
Definition: reloptions.h:44
@ RELOPT_TYPE_INT
Definition: reloptions.h:32
@ RELOPT_TYPE_BOOL
Definition: reloptions.h:31
@ RELOPT_TYPE_REAL
Definition: reloptions.h:33
Definition: c.h:692

References build_reloptions(), fillfactor, lengthof, RELOPT_KIND_BTREE, RELOPT_TYPE_BOOL, RELOPT_TYPE_INT, and RELOPT_TYPE_REAL.

Referenced by bthandler().
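
btoptions() is an instance of the standard reloptions pattern: a table of relopt_parse_elt entries maps option names onto offsets in a parsed-options struct, and build_reloptions() returns a filled-in, palloc'd copy of that struct. A hedged sketch of the same pattern for a made-up struct follows; DemoOptions, demo_factor and demo_options() are illustrative names only, and a real access method would normally register its own relopt_kind and option definitions (typically via add_reloption_kind()) rather than reuse RELOPT_KIND_BTREE:

#include "postgres.h"
#include "access/reloptions.h"

/* Hypothetical parsed-options struct; the varlena header must come first */
typedef struct DemoOptions
{
    int32       vl_len_;        /* varlena header (do not touch directly) */
    int         fillfactor;     /* integer option */
    double      demo_factor;    /* hypothetical real-valued option */
} DemoOptions;

static bytea *
demo_options(Datum reloptions, bool validate)
{
    static const relopt_parse_elt tab[] = {
        {"fillfactor", RELOPT_TYPE_INT, offsetof(DemoOptions, fillfactor)},
        {"demo_factor", RELOPT_TYPE_REAL, offsetof(DemoOptions, demo_factor)},
    };

    return (bytea *) build_reloptions(reloptions, validate,
                                      RELOPT_KIND_BTREE,    /* illustration only */
                                      sizeof(DemoOptions),
                                      tab, lengthof(tab));
}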

◆ BTPageGetDeleteXid()

static FullTransactionId BTPageGetDeleteXid ( Page  page)
inlinestatic

Definition at line 260 of file nbtree.h.

261 {
262  BTPageOpaque opaque;
263  BTDeletedPageData *contents;
264 
265  /* We only expect to be called with a deleted page */
266  Assert(!PageIsNew(page));
267  opaque = BTPageGetOpaque(page);
268  Assert(P_ISDELETED(opaque));
269 
270  /* pg_upgrade'd deleted page -- must be safe to recycle now */
271  if (!P_HAS_FULLXID(opaque))
272  return FirstNormalFullTransactionId;
273 
274  /* Get safexid from deleted page */
275  contents = ((BTDeletedPageData *) PageGetContents(page));
276  return contents->safexid;
277 }
static char * PageGetContents(Page page)
Definition: bufpage.h:257
#define P_HAS_FULLXID(opaque)
Definition: nbtree.h:228
FullTransactionId safexid
Definition: nbtree.h:235
#define FirstNormalFullTransactionId
Definition: transam.h:57

References Assert, BTPageGetOpaque, FirstNormalFullTransactionId, P_HAS_FULLXID, P_ISDELETED, PageGetContents(), PageIsNew(), and BTDeletedPageData::safexid.

Referenced by _bt_allocbuf(), BTPageIsRecyclable(), and GetBTPageStatistics().

◆ BTPageIsRecyclable()

static bool BTPageIsRecyclable ( Page  page,
Relation  heaprel 
)
inlinestatic

Definition at line 291 of file nbtree.h.

292 {
293  BTPageOpaque opaque;
294 
295  Assert(!PageIsNew(page));
296  Assert(heaprel != NULL);
297 
298  /* Recycling okay iff page is deleted and safexid is old enough */
299  opaque = BTPageGetOpaque(page);
300  if (P_ISDELETED(opaque))
301  {
302  FullTransactionId safexid = BTPageGetDeleteXid(page);
303 
304  /*
305  * The page was deleted, but when? If it was just deleted, a scan
306  * might have seen the downlink to it, and will read the page later.
307  * As long as that can happen, we must keep the deleted page around as
308  * a tombstone.
309  *
310  * For that check if the deletion XID could still be visible to
311  * anyone. If not, then no scan that's still in progress could have
312  * seen its downlink, and we can recycle it.
313  */
314  return GlobalVisCheckRemovableFullXid(heaprel, safexid);
315  }
316 
317  return false;
318 }

References Assert, BTPageGetDeleteXid(), BTPageGetOpaque, GlobalVisCheckRemovableFullXid(), P_ISDELETED, and PageIsNew().

Referenced by _bt_allocbuf(), and btvacuumpage().

◆ BTPageSetDeleted()

static void BTPageSetDeleted ( Page  page,
FullTransactionId  safexid 
)
inlinestatic

Definition at line 239 of file nbtree.h.

240 {
241  BTPageOpaque opaque;
242  PageHeader header;
243  BTDeletedPageData *contents;
244 
245  opaque = BTPageGetOpaque(page);
246  header = ((PageHeader) page);
247 
248  opaque->btpo_flags &= ~BTP_HALF_DEAD;
249  opaque->btpo_flags |= BTP_DELETED | BTP_HAS_FULLXID;
250  header->pd_lower = MAXALIGN(SizeOfPageHeaderData) +
251  sizeof(BTDeletedPageData);
252  header->pd_upper = header->pd_special;
253 
254  /* Set safexid in deleted page */
255  contents = ((BTDeletedPageData *) PageGetContents(page));
256  contents->safexid = safexid;
257 }
#define BTP_HAS_FULLXID
Definition: nbtree.h:84
#define BTP_HALF_DEAD
Definition: nbtree.h:80
struct BTDeletedPageData BTDeletedPageData
#define BTP_DELETED
Definition: nbtree.h:78
LocationIndex pd_special
Definition: bufpage.h:167
LocationIndex pd_upper
Definition: bufpage.h:166
LocationIndex pd_lower
Definition: bufpage.h:165

References BTP_DELETED, BTP_HALF_DEAD, BTP_HAS_FULLXID, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, MAXALIGN, PageGetContents(), PageHeaderData::pd_lower, PageHeaderData::pd_special, PageHeaderData::pd_upper, BTDeletedPageData::safexid, and SizeOfPageHeaderData.

Referenced by _bt_unlink_halfdead_page(), and btree_xlog_unlink_page().
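
Taken together, BTPageSetDeleted(), BTPageGetDeleteXid() and BTPageIsRecyclable() implement the tombstone protocol for deleted btree pages: the deleting backend stamps the page with a "safe" full transaction ID, and the page may only go back to the free space map once that XID can no longer be visible to any in-flight scan. A hedged sketch of the two halves of that handshake is shown below; in reality they run at different times (page deletion versus a later VACUUM or _bt_allocbuf() reuse check), and delete_then_maybe_recycle() is an illustrative name only:

#include "postgres.h"
#include "access/nbtree.h"
#include "access/transam.h"
#include "storage/indexfsm.h"
#include "utils/rel.h"

static void
delete_then_maybe_recycle(Relation indexrel, Relation heaprel,
                          Page page, BlockNumber blkno)
{
    /* deletion side: record the next full XID as the page's safexid */
    BTPageSetDeleted(page, ReadNextFullTransactionId());

    /* ... much later, typically in a later VACUUM of the same index ... */

    /* recycling side: safe only once no scan can still see the downlink */
    if (BTPageIsRecyclable(page, heaprel))
        RecordFreeIndexPage(indexrel, blkno);
}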

◆ btparallelrescan()

void btparallelrescan ( IndexScanDesc  scan)

Definition at line 562 of file nbtree.c.

563 {
564  BTParallelScanDesc btscan;
565  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
566 
567  Assert(parallel_scan);
568 
569  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
570  parallel_scan->ps_offset);
571 
572  /*
573  * In theory, we don't need to acquire the spinlock here, because there
574  * shouldn't be any other workers running at this point, but we do so for
575  * consistency.
576  */
577  SpinLockAcquire(&btscan->btps_mutex);
578  btscan->btps_nextScanPage = InvalidBlockNumber;
579  btscan->btps_lastCurrPage = InvalidBlockNumber;
580  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
581  SpinLockRelease(&btscan->btps_mutex);
582 }

References Assert, BTPARALLEL_NOT_INITIALIZED, BTParallelScanDescData::btps_lastCurrPage, BTParallelScanDescData::btps_mutex, BTParallelScanDescData::btps_nextScanPage, BTParallelScanDescData::btps_pageStatus, InvalidBlockNumber, OffsetToPointer, IndexScanDescData::parallel_scan, ParallelIndexScanDescData::ps_offset, SpinLockAcquire, and SpinLockRelease.

Referenced by bthandler().

◆ btproperty()

bool btproperty ( Oid  index_oid,
int  attno,
IndexAMProperty  prop,
const char *  propname,
bool res,
bool isnull 
)

Definition at line 4587 of file nbtutils.c.

4590 {
4591  switch (prop)
4592  {
4593  case AMPROP_RETURNABLE:
4594  /* answer only for columns, not AM or whole index */
4595  if (attno == 0)
4596  return false;
4597  /* otherwise, btree can always return data */
4598  *res = true;
4599  return true;
4600 
4601  default:
4602  return false; /* punt to generic code */
4603  }
4604 }
@ AMPROP_RETURNABLE
Definition: amapi.h:43

References AMPROP_RETURNABLE, and res.

Referenced by bthandler().

◆ BTreeShmemInit()

void BTreeShmemInit ( void  )

Definition at line 4536 of file nbtutils.c.

4537 {
4538  bool found;
4539 
4540  btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State",
4541  BTreeShmemSize(),
4542  &found);
4543 
4544  if (!IsUnderPostmaster)
4545  {
4546  /* Initialize shared memory area */
4547  Assert(!found);
4548 
4549  /*
4550  * It doesn't really matter what the cycle counter starts at, but
4551  * having it always start the same doesn't seem good. Seed with
4552  * low-order bits of time() instead.
4553  */
4554  btvacinfo->cycle_ctr = (BTCycleId) time(NULL);
4555 
4556  btvacinfo->num_vacuums = 0;
4557  btvacinfo->max_vacuums = MaxBackends;
4558  }
4559  else
4560  Assert(found);
4561 }
bool IsUnderPostmaster
Definition: globals.c:119
int MaxBackends
Definition: globals.c:145
Size BTreeShmemSize(void)
Definition: nbtutils.c:4523
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:382

References Assert, BTreeShmemSize(), btvacinfo, BTVacInfo::cycle_ctr, IsUnderPostmaster, BTVacInfo::max_vacuums, MaxBackends, BTVacInfo::num_vacuums, and ShmemInitStruct().

Referenced by CreateOrAttachShmemStructs().

◆ BTreeShmemSize()

Size BTreeShmemSize ( void  )

Definition at line 4523 of file nbtutils.c.

4524 {
4525  Size size;
4526 
4527  size = offsetof(BTVacInfo, vacuums);
4528  size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo)));
4529  return size;
4530 }
Size add_size(Size s1, Size s2)
Definition: shmem.c:488
Size mul_size(Size s1, Size s2)
Definition: shmem.c:505

References add_size(), MaxBackends, mul_size(), and size.

Referenced by BTreeShmemInit(), and CalculateShmemSize().
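
Written out without the overflow-checked helpers, the size computed above is simply the BTVacInfo header plus one array slot per possible backend (BTOneVacInfo is the element type of the flexible vacuums[] array in nbtutils.c); add_size() and mul_size() stand in for + and * so that overflow raises an error instead of silently wrapping:

    BTreeShmemSize() = offsetof(BTVacInfo, vacuums)
                     + MaxBackends * sizeof(BTOneVacInfo)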

◆ BTreeTupleGetDownLink()

static BlockNumber BTreeTupleGetDownLink ( IndexTuple  pivot)
inlinestatic

◆ BTreeTupleGetHeapTID()

static ItemPointer BTreeTupleGetHeapTID ( IndexTuple  itup)
inlinestatic

Definition at line 638 of file nbtree.h.

639 {
640  if (BTreeTupleIsPivot(itup))
641  {
642  /* Pivot tuple heap TID representation? */
643  if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
644  BT_PIVOT_HEAP_TID_ATTR) != 0)
645  return (ItemPointer) ((char *) itup + IndexTupleSize(itup) -
646  sizeof(ItemPointerData));
647 
648  /* Heap TID attribute was truncated */
649  return NULL;
650  }
651  else if (BTreeTupleIsPosting(itup))
652  return BTreeTupleGetPosting(itup);
653 
654  return &itup->t_tid;
655 }
struct ItemPointerData ItemPointerData

References BT_PIVOT_HEAP_TID_ATTR, BTreeTupleGetPosting(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), IndexTupleSize, ItemPointerGetOffsetNumberNoCheck(), and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_finish_pending(), _bt_check_natts(), _bt_check_third_page(), _bt_compare(), _bt_delitems_delete_check(), _bt_mkscankey(), _bt_swap_posting(), _bt_truncate(), bt_entry_unique_check(), bt_page_print_tuples(), bt_target_page_check(), BTreeTupleGetHeapTIDCareful(), and BTreeTupleGetPointsToTID().

◆ BTreeTupleGetMaxHeapTID()

static ItemPointer BTreeTupleGetMaxHeapTID ( IndexTuple  itup)
inlinestatic

Definition at line 664 of file nbtree.h.

665 {
666  Assert(!BTreeTupleIsPivot(itup));
667 
668  if (BTreeTupleIsPosting(itup))
669  {
670  uint16 nposting = BTreeTupleGetNPosting(itup);
671 
672  return BTreeTupleGetPostingN(itup, nposting - 1);
673  }
674 
675  return &itup->t_tid;
676 }

References Assert, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_finish_pending(), _bt_compare(), _bt_delitems_delete_check(), _bt_swap_posting(), _bt_truncate(), and bt_target_page_check().

◆ BTreeTupleGetNPosting()

◆ BTreeTupleGetPosting()

static ItemPointer BTreeTupleGetPosting ( IndexTuple  posting)
inlinestatic

◆ BTreeTupleGetPostingN()

◆ BTreeTupleGetPostingOffset()

◆ BTreeTupleGetTopParent()

static BlockNumber BTreeTupleGetTopParent ( IndexTuple  leafhikey)
inlinestatic

Definition at line 620 of file nbtree.h.

621 {
622  return ItemPointerGetBlockNumberNoCheck(&leafhikey->t_tid);
623 }

References ItemPointerGetBlockNumberNoCheck(), and IndexTupleData::t_tid.

Referenced by _bt_unlink_halfdead_page(), and bt_downlink_missing_check().

◆ BTreeTupleIsPivot()

◆ BTreeTupleIsPosting()

◆ BTreeTupleSetDownLink()

static void BTreeTupleSetDownLink ( IndexTuple  pivot,
BlockNumber  blkno 
)
inlinestatic

Definition at line 562 of file nbtree.h.

563 {
564  ItemPointerSetBlockNumber(&pivot->t_tid, blkno);
565 }
static void ItemPointerSetBlockNumber(ItemPointerData *pointer, BlockNumber blockNumber)
Definition: itemptr.h:147

References ItemPointerSetBlockNumber(), and IndexTupleData::t_tid.

Referenced by _bt_buildadd(), _bt_insert_parent(), _bt_mark_page_halfdead(), _bt_newlevel(), _bt_uppershutdown(), and btree_xlog_mark_page_halfdead().

◆ BTreeTupleSetNAtts()

static void BTreeTupleSetNAtts ( IndexTuple  itup,
uint16  nkeyatts,
bool  heaptid 
)
inlinestatic

Definition at line 595 of file nbtree.h.

596 {
597  Assert(nkeyatts <= INDEX_MAX_KEYS);
598  Assert((nkeyatts & BT_STATUS_OFFSET_MASK) == 0);
599  Assert(!heaptid || nkeyatts > 0);
600  Assert(!BTreeTupleIsPivot(itup) || nkeyatts == 0);
601 
602  itup->t_info |= INDEX_ALT_TID_MASK;
603 
604  if (heaptid)
605  nkeyatts |= BT_PIVOT_HEAP_TID_ATTR;
606 
607  /* BT_IS_POSTING bit is deliberately unset here */
608  ItemPointerSetOffsetNumber(&itup->t_tid, nkeyatts);
609  Assert(BTreeTupleIsPivot(itup));
610 }
#define BT_STATUS_OFFSET_MASK
Definition: nbtree.h:463

References Assert, BT_PIVOT_HEAP_TID_ATTR, BT_STATUS_OFFSET_MASK, BTreeTupleIsPivot(), INDEX_ALT_TID_MASK, INDEX_MAX_KEYS, ItemPointerSetOffsetNumber(), IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_buildadd(), _bt_newlevel(), _bt_pgaddtup(), _bt_sortaddtup(), _bt_truncate(), and BTreeTupleSetTopParent().
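
The status bits set here are packed into the offset-number field of the pivot tuple's t_tid. A hedged sketch of how a reader would unpack them follows; unpack_pivot_status() is an illustrative name, and the real accessors are helpers such as BTreeTupleGetNAtts() and BTreeTupleGetHeapTID():

#include "postgres.h"
#include "access/itup.h"
#include "access/nbtree.h"
#include "storage/itemptr.h"

/*
 * Sketch only: recover what BTreeTupleSetNAtts() packed into a pivot
 * tuple's t_tid offset-number field.
 */
static void
unpack_pivot_status(IndexTuple itup)
{
    uint16      raw = ItemPointerGetOffsetNumberNoCheck(&itup->t_tid);
    uint16      nkeyatts = raw & BT_OFFSET_MASK;
    bool        has_heap_tid = (raw & BT_PIVOT_HEAP_TID_ATTR) != 0;

    /* nkeyatts: number of untruncated key attributes in this pivot */
    /* has_heap_tid: a heap TID was retained at the end of the tuple */
    (void) nkeyatts;
    (void) has_heap_tid;
}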

◆ BTreeTupleSetPosting()

static void BTreeTupleSetPosting ( IndexTuple  itup,
uint16  nhtids,
int  postingoffset 
)
inlinestatic

Definition at line 504 of file nbtree.h.

505 {
506  Assert(nhtids > 1);
507  Assert((nhtids & BT_STATUS_OFFSET_MASK) == 0);
508  Assert((size_t) postingoffset == MAXALIGN(postingoffset));
509  Assert(postingoffset < INDEX_SIZE_MASK);
510  Assert(!BTreeTupleIsPivot(itup));
511 
512  itup->t_info |= INDEX_ALT_TID_MASK;
513  ItemPointerSetOffsetNumber(&itup->t_tid, (nhtids | BT_IS_POSTING));
514  ItemPointerSetBlockNumber(&itup->t_tid, postingoffset);
515 }

References Assert, BT_IS_POSTING, BT_STATUS_OFFSET_MASK, BTreeTupleIsPivot(), INDEX_ALT_TID_MASK, INDEX_SIZE_MASK, ItemPointerSetBlockNumber(), ItemPointerSetOffsetNumber(), MAXALIGN, IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_form_posting(), and _bt_update_posting().

◆ BTreeTupleSetTopParent()

static void BTreeTupleSetTopParent ( IndexTuple  leafhikey,
BlockNumber  blkno 
)
inlinestatic

◆ btrescan()

void btrescan ( IndexScanDesc  scan,
ScanKey  scankey,
int  nscankeys,
ScanKey  orderbys,
int  norderbys 
)

Definition at line 360 of file nbtree.c.

362 {
363  BTScanOpaque so = (BTScanOpaque) scan->opaque;
364 
365  /* we aren't holding any read locks, but gotta drop the pins */
366  if (BTScanPosIsValid(so->currPos))
367  {
368  /* Before leaving current page, deal with any killed items */
369  if (so->numKilled > 0)
370  _bt_killitems(scan);
371  BTScanPosUnpinIfPinned(so->currPos);
372  BTScanPosInvalidate(so->currPos);
373  }
374 
375  so->markItemIndex = -1;
376  so->needPrimScan = false;
377  so->scanBehind = false;
378  so->oppositeDirCheck = false;
379  BTScanPosUnpinIfPinned(so->markPos);
380  BTScanPosInvalidate(so->markPos);
381 
382  /*
383  * Allocate tuple workspace arrays, if needed for an index-only scan and
384  * not already done in a previous rescan call. To save on palloc
385  * overhead, both workspaces are allocated as one palloc block; only this
386  * function and btendscan know that.
387  *
388  * NOTE: this data structure also makes it safe to return data from a
389  * "name" column, even though btree name_ops uses an underlying storage
390  * datatype of cstring. The risk there is that "name" is supposed to be
391  * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
392  * However, since we only return data out of tuples sitting in the
393  * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
394  * data out of the markTuples array --- running off the end of memory for
395  * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
396  * adding special-case treatment for name_ops elsewhere.
397  */
398  if (scan->xs_want_itup && so->currTuples == NULL)
399  {
400  so->currTuples = (char *) palloc(BLCKSZ * 2);
401  so->markTuples = so->currTuples + BLCKSZ;
402  }
403 
404  /*
405  * Reset the scan keys
406  */
407  if (scankey && scan->numberOfKeys > 0)
408  memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
409  so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
410  so->numArrayKeys = 0; /* ditto */
411 }

References _bt_killitems(), BTScanPosInvalidate, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, if(), IndexScanDescData::keyData, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numberOfKeys, IndexScanDescData::numberOfKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, palloc(), BTScanOpaqueData::scanBehind, and IndexScanDescData::xs_want_itup.

Referenced by bthandler().

◆ btrestrpos()

void btrestrpos ( IndexScanDesc  scan)

Definition at line 479 of file nbtree.c.

480 {
481  BTScanOpaque so = (BTScanOpaque) scan->opaque;
482 
483  if (so->markItemIndex >= 0)
484  {
485  /*
486  * The scan has never moved to a new page since the last mark. Just
487  * restore the itemIndex.
488  *
489  * NB: In this case we can't count on anything in so->markPos to be
490  * accurate.
491  */
492  so->currPos.itemIndex = so->markItemIndex;
493  }
494  else
495  {
496  /*
497  * The scan moved to a new page after last mark or restore, and we are
498  * now restoring to the marked page. We aren't holding any read
499  * locks, but if we're still holding the pin for the current position,
500  * we must drop it.
501  */
502  if (BTScanPosIsValid(so->currPos))
503  {
504  /* Before leaving current page, deal with any killed items */
505  if (so->numKilled > 0)
506  _bt_killitems(scan);
507  BTScanPosUnpinIfPinned(so->currPos);
508  }
509 
510  if (BTScanPosIsValid(so->markPos))
511  {
512  /* bump pin on mark buffer for assignment to current buffer */
513  if (BTScanPosIsPinned(so->markPos))
514  IncrBufferRefCount(so->markPos.buf);
515  memcpy(&so->currPos, &so->markPos,
516  offsetof(BTScanPosData, items[1]) +
517  so->markPos.lastItem * sizeof(BTScanPosItem));
518  if (so->currTuples)
519  memcpy(so->currTuples, so->markTuples,
520  so->markPos.nextTupleOffset);
521  /* Reset the scan's array keys (see _bt_steppage for why) */
522  if (so->numArrayKeys)
523  {
524  _bt_start_array_keys(scan, so->currPos.dir);
525  so->needPrimScan = false;
526  }
527  }
528  else
529  BTScanPosInvalidate(so->currPos);
530  }
531 }
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:4956
int nextTupleOffset
Definition: nbtree.h:968

References _bt_killitems(), _bt_start_array_keys(), BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanPosData::buf, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::dir, if(), IncrBufferRefCount(), BTScanPosData::itemIndex, items, BTScanPosData::lastItem, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanOpaqueData::needPrimScan, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numKilled, and IndexScanDescData::opaque.

Referenced by bthandler().

◆ btvacuumcleanup()

IndexBulkDeleteResult* btvacuumcleanup ( IndexVacuumInfo info,
IndexBulkDeleteResult stats 
)

Definition at line 892 of file nbtree.c.

893 {
894  BlockNumber num_delpages;
895 
896  /* No-op in ANALYZE ONLY mode */
897  if (info->analyze_only)
898  return stats;
899 
900  /*
901  * If btbulkdelete was called, we need not do anything (we just maintain
902  * the information used within _bt_vacuum_needs_cleanup() by calling
903  * _bt_set_cleanup_info() below).
904  *
905  * If btbulkdelete was _not_ called, then we have a choice to make: we
906  * must decide whether or not a btvacuumscan() call is needed now (i.e.
907  * whether the ongoing VACUUM operation can entirely avoid a physical scan
908  * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
909  * now.
910  */
911  if (stats == NULL)
912  {
913  /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
914  if (!_bt_vacuum_needs_cleanup(info->index))
915  return NULL;
916 
917  /*
918  * Since we aren't going to actually delete any leaf items, there's no
919  * need to go through all the vacuum-cycle-ID pushups here.
920  *
921  * Posting list tuples are a source of inaccuracy for cleanup-only
922  * scans. btvacuumscan() will assume that the number of index tuples
923  * from each page can be used as num_index_tuples, even though
924  * num_index_tuples is supposed to represent the number of TIDs in the
925  * index. This naive approach can underestimate the number of tuples
926  * in the index significantly.
927  *
928  * We handle the problem by making num_index_tuples an estimate in
929  * cleanup-only case.
930  */
931  stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
932  btvacuumscan(info, stats, NULL, NULL, 0);
933  stats->estimated_count = true;
934  }
935 
936  /*
937  * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
938  *
939  * num_delpages is the number of deleted pages now in the index that were
940  * not safe to place in the FSM to be recycled just yet. num_delpages is
941  * greater than 0 only when _bt_pagedel() actually deleted pages during
942  * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
943  * have failed to place any newly deleted pages in the FSM just moments
944  * ago. (Actually, there are edge cases where recycling of the current
945  * VACUUM's newly deleted pages does not even become safe by the time the
946  * next VACUUM comes around. See nbtree/README.)
947  */
948  Assert(stats->pages_deleted >= stats->pages_free);
949  num_delpages = stats->pages_deleted - stats->pages_free;
950  _bt_set_cleanup_info(info->index, num_delpages);
951 
952  /*
953  * It's quite possible for us to be fooled by concurrent page splits into
954  * double-counting some index tuples, so disbelieve any total that exceeds
955  * the underlying heap's count ... if we know that accurately. Otherwise
956  * this might just make matters worse.
957  */
958  if (!info->estimated_count)
959  {
960  if (stats->num_index_tuples > info->num_heap_tuples)
961  stats->num_index_tuples = info->num_heap_tuples;
962  }
963 
964  return stats;
965 }
void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
Definition: nbtpage.c:232
bool _bt_vacuum_needs_cleanup(Relation rel)
Definition: nbtpage.c:179
bool estimated_count
Definition: genam.h:78
BlockNumber pages_deleted
Definition: genam.h:82
double num_index_tuples
Definition: genam.h:79
double num_heap_tuples
Definition: genam.h:52
bool analyze_only
Definition: genam.h:48
bool estimated_count
Definition: genam.h:50

References _bt_set_cleanup_info(), _bt_vacuum_needs_cleanup(), IndexVacuumInfo::analyze_only, Assert, btvacuumscan(), IndexVacuumInfo::estimated_count, IndexBulkDeleteResult::estimated_count, IndexVacuumInfo::index, IndexVacuumInfo::num_heap_tuples, IndexBulkDeleteResult::num_index_tuples, IndexBulkDeleteResult::pages_deleted, IndexBulkDeleteResult::pages_free, and palloc0().

Referenced by bthandler().

◆ btvalidate()

bool btvalidate ( Oid  opclassoid)

Definition at line 41 of file nbtvalidate.c.

42 {
43  bool result = true;
44  HeapTuple classtup;
45  Form_pg_opclass classform;
46  Oid opfamilyoid;
47  Oid opcintype;
48  char *opclassname;
49  HeapTuple familytup;
50  Form_pg_opfamily familyform;
51  char *opfamilyname;
52  CatCList *proclist,
53  *oprlist;
54  List *grouplist;
55  OpFamilyOpFuncGroup *opclassgroup;
56  List *familytypes;
57  int usefulgroups;
58  int i;
59  ListCell *lc;
60 
61  /* Fetch opclass information */
62  classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid));
63  if (!HeapTupleIsValid(classtup))
64  elog(ERROR, "cache lookup failed for operator class %u", opclassoid);
65  classform = (Form_pg_opclass) GETSTRUCT(classtup);
66 
67  opfamilyoid = classform->opcfamily;
68  opcintype = classform->opcintype;
69  opclassname = NameStr(classform->opcname);
70 
71  /* Fetch opfamily information */
72  familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid));
73  if (!HeapTupleIsValid(familytup))
74  elog(ERROR, "cache lookup failed for operator family %u", opfamilyoid);
75  familyform = (Form_pg_opfamily) GETSTRUCT(familytup);
76 
77  opfamilyname = NameStr(familyform->opfname);
78 
79  /* Fetch all operators and support functions of the opfamily */
80  oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid));
81  proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid));
82 
83  /* Check individual support functions */
84  for (i = 0; i < proclist->n_members; i++)
85  {
86  HeapTuple proctup = &proclist->members[i]->tuple;
87  Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup);
88  bool ok;
89 
90  /* Check procedure numbers and function signatures */
91  switch (procform->amprocnum)
92  {
93  case BTORDER_PROC:
94  ok = check_amproc_signature(procform->amproc, INT4OID, true,
95  2, 2, procform->amproclefttype,
96  procform->amprocrighttype);
97  break;
98  case BTSORTSUPPORT_PROC:
99  ok = check_amproc_signature(procform->amproc, VOIDOID, true,
100  1, 1, INTERNALOID);
101  break;
102  case BTINRANGE_PROC:
103  ok = check_amproc_signature(procform->amproc, BOOLOID, true,
104  5, 5,
105  procform->amproclefttype,
106  procform->amproclefttype,
107  procform->amprocrighttype,
108  BOOLOID, BOOLOID);
109  break;
110  case BTEQUALIMAGE_PROC:
111  ok = check_amproc_signature(procform->amproc, BOOLOID, true,
112  1, 1, OIDOID);
113  break;
114  case BTOPTIONS_PROC:
115  ok = check_amoptsproc_signature(procform->amproc);
116  break;
117  default:
118  ereport(INFO,
119  (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
120  errmsg("operator family \"%s\" of access method %s contains function %s with invalid support number %d",
121  opfamilyname, "btree",
122  format_procedure(procform->amproc),
123  procform->amprocnum)));
124  result = false;
125  continue; /* don't want additional message */
126  }
127 
128  if (!ok)
129  {
130  ereport(INFO,
131  (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
132  errmsg("operator family \"%s\" of access method %s contains function %s with wrong signature for support number %d",
133  opfamilyname, "btree",
134  format_procedure(procform->amproc),
135  procform->amprocnum)));
136  result = false;
137  }
138  }
139 
140  /* Check individual operators */
141  for (i = 0; i < oprlist->n_members; i++)
142  {
143  HeapTuple oprtup = &oprlist->members[i]->tuple;
144  Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup);
145 
146  /* Check that only allowed strategy numbers exist */
147  if (oprform->amopstrategy < 1 ||
148  oprform->amopstrategy > BTMaxStrategyNumber)
149  {
150  ereport(INFO,
151  (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
152  errmsg("operator family \"%s\" of access method %s contains operator %s with invalid strategy number %d",
153  opfamilyname, "btree",
154  format_operator(oprform->amopopr),
155  oprform->amopstrategy)));
156  result = false;
157  }
158 
159  /* btree doesn't support ORDER BY operators */
160  if (oprform->amoppurpose != AMOP_SEARCH ||
161  OidIsValid(oprform->amopsortfamily))
162  {
163  ereport(INFO,
164  (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
165  errmsg("operator family \"%s\" of access method %s contains invalid ORDER BY specification for operator %s",
166  opfamilyname, "btree",
167  format_operator(oprform->amopopr))));
168  result = false;
169  }
170 
171  /* Check operator signature --- same for all btree strategies */
172  if (!check_amop_signature(oprform->amopopr, BOOLOID,
173  oprform->amoplefttype,
174  oprform->amoprighttype))
175  {
176  ereport(INFO,
177  (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
178  errmsg("operator family \"%s\" of access method %s contains operator %s with wrong signature",
179  opfamilyname, "btree",
180  format_operator(oprform->amopopr))));
181  result = false;
182  }
183  }
184 
185  /* Now check for inconsistent groups of operators/functions */
186  grouplist = identify_opfamily_groups(oprlist, proclist);
187  usefulgroups = 0;
188  opclassgroup = NULL;
189  familytypes = NIL;
190  foreach(lc, grouplist)
191  {
192  OpFamilyOpFuncGroup *thisgroup = (OpFamilyOpFuncGroup *) lfirst(lc);
193 
194  /*
195  * It is possible for an in_range support function to have a RHS type
196  * that is otherwise irrelevant to the opfamily --- for instance, SQL
197  * requires the datetime_ops opclass to have range support with an
198  * interval offset. So, if this group appears to contain only an
199  * in_range function, ignore it: it doesn't represent a pair of
200  * supported types.
201  */
202  if (thisgroup->operatorset == 0 &&
203  thisgroup->functionset == (1 << BTINRANGE_PROC))
204  continue;
205 
206  /* Else count it as a relevant group */
207  usefulgroups++;
208 
209  /* Remember the group exactly matching the test opclass */
210  if (thisgroup->lefttype == opcintype &&
211  thisgroup->righttype == opcintype)
212  opclassgroup = thisgroup;
213 
214  /*
215  * Identify all distinct data types handled in this opfamily. This
216  * implementation is O(N^2), but there aren't likely to be enough
217  * types in the family for it to matter.
218  */
219  familytypes = list_append_unique_oid(familytypes, thisgroup->lefttype);
220  familytypes = list_append_unique_oid(familytypes, thisgroup->righttype);
221 
222  /*
223  * Complain if there seems to be an incomplete set of either operators
224  * or support functions for this datatype pair. The sortsupport,
225  * in_range, and equalimage functions are considered optional.
226  */
227  if (thisgroup->operatorset !=
228  ((1 << BTLessStrategyNumber) |
229  (1 << BTLessEqualStrategyNumber) |
230  (1 << BTEqualStrategyNumber) |
231  (1 << BTGreaterEqualStrategyNumber) |
232  (1 << BTGreaterStrategyNumber)))
233  {
234  ereport(INFO,
235  (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
236  errmsg("operator family \"%s\" of access method %s is missing operator(s) for types %s and %s",
237  opfamilyname, "btree",
238  format_type_be(thisgroup->lefttype),
239  format_type_be(thisgroup->righttype))));
240  result = false;
241  }
242  if ((thisgroup->functionset & (1 << BTORDER_PROC)) == 0)
243  {
244  ereport(INFO,
245  (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
246  errmsg("operator family \"%s\" of access method %s is missing support function for types %s and %s",
247  opfamilyname, "btree",
248  format_type_be(thisgroup->lefttype),
249  format_type_be(thisgroup->righttype))));
250  result = false;
251  }
252  }
253 
254  /* Check that the originally-named opclass is supported */
255  /* (if group is there, we already checked it adequately above) */
256  if (!opclassgroup)
257  {
258  ereport(INFO,
259  (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
260  errmsg("operator class \"%s\" of access method %s is missing operator(s)",
261  opclassname, "btree")));
262  result = false;
263  }
264 
265  /*
266  * Complain if the opfamily doesn't have entries for all possible
267  * combinations of its supported datatypes. While missing cross-type
268  * operators are not fatal, they do limit the planner's ability to derive
269  * additional qual clauses from equivalence classes, so it seems
270  * reasonable to insist that all built-in btree opfamilies be complete.
271  */
272  if (usefulgroups != (list_length(familytypes) * list_length(familytypes)))
273  {
274  ereport(INFO,
275  (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
276  errmsg("operator family \"%s\" of access method %s is missing cross-type operator(s)",
277  opfamilyname, "btree")));
278  result = false;
279  }
280 
281  ReleaseCatCacheList(proclist);
282  ReleaseCatCacheList(oprlist);
283  ReleaseSysCache(familytup);
284  ReleaseSysCache(classtup);
285 
286  return result;
287 }
bool check_amproc_signature(Oid funcid, Oid restype, bool exact, int minargs, int maxargs,...)
Definition: amvalidate.c:152
bool check_amop_signature(Oid opno, Oid restype, Oid lefttype, Oid righttype)
Definition: amvalidate.c:206
List * identify_opfamily_groups(CatCList *oprlist, CatCList *proclist)
Definition: amvalidate.c:43
bool check_amoptsproc_signature(Oid funcid)
Definition: amvalidate.c:192
#define NameStr(name)
Definition: c.h:751
void ReleaseCatCacheList(CatCList *list)
Definition: catcache.c:1985
#define INFO
Definition: elog.h:34
char * format_type_be(Oid type_oid)
Definition: format_type.c:343
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
#define GETSTRUCT(TUP)
Definition: htup_details.h:653
List * list_append_unique_oid(List *list, Oid datum)
Definition: list.c:1380
#define BTSORTSUPPORT_PROC
Definition: nbtree.h:708
#define BTINRANGE_PROC
Definition: nbtree.h:709
#define BTOPTIONS_PROC
Definition: nbtree.h:711
FormData_pg_amop * Form_pg_amop
Definition: pg_amop.h:88
FormData_pg_amproc * Form_pg_amproc
Definition: pg_amproc.h:68
static int list_length(const List *l)
Definition: pg_list.h:152
#define NIL
Definition: pg_list.h:68
FormData_pg_opclass * Form_pg_opclass
Definition: pg_opclass.h:83
FormData_pg_opfamily * Form_pg_opfamily
Definition: pg_opfamily.h:51
char * format_operator(Oid operator_oid)
Definition: regproc.c:793
char * format_procedure(Oid procedure_oid)
Definition: regproc.c:299
Definition: pg_list.h:54
CatCTup * members[FLEXIBLE_ARRAY_MEMBER]
Definition: catcache.h:180
int n_members
Definition: catcache.h:178
HeapTupleData tuple
Definition: catcache.h:123
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:269
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:221
#define SearchSysCacheList1(cacheId, key1)
Definition: syscache.h:127

References BTEQUALIMAGE_PROC, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTINRANGE_PROC, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTMaxStrategyNumber, BTOPTIONS_PROC, BTORDER_PROC, BTSORTSUPPORT_PROC, check_amop_signature(), check_amoptsproc_signature(), check_amproc_signature(), elog, ereport, errcode(), errmsg(), ERROR, format_operator(), format_procedure(), format_type_be(), OpFamilyOpFuncGroup::functionset, GETSTRUCT, HeapTupleIsValid, i, identify_opfamily_groups(), INFO, OpFamilyOpFuncGroup::lefttype, lfirst, list_append_unique_oid(), list_length(), catclist::members, catclist::n_members, NameStr, NIL, ObjectIdGetDatum(), OidIsValid, OpFamilyOpFuncGroup::operatorset, ReleaseCatCacheList(), ReleaseSysCache(), OpFamilyOpFuncGroup::righttype, SearchSysCache1(), SearchSysCacheList1, and catctup::tuple.

Referenced by bthandler().

◆ StaticAssertDecl()

StaticAssertDecl ( BT_OFFSET_MASK >=  INDEX_MAX_KEYS,
"BT_OFFSET_MASK can't fit INDEX_MAX_KEYS  
)