PostgreSQL Source Code git master
nbtree.h File Reference
#include "access/amapi.h"
#include "access/itup.h"
#include "access/sdir.h"
#include "catalog/pg_am_d.h"
#include "catalog/pg_class.h"
#include "catalog/pg_index.h"
#include "lib/stringinfo.h"
#include "storage/bufmgr.h"
#include "storage/dsm.h"
#include "storage/shm_toc.h"
#include "utils/skipsupport.h"
Include dependency graph for nbtree.h:
This graph shows which files directly or indirectly include this file:


Data Structures

struct  BTPageOpaqueData
 
struct  BTMetaPageData
 
struct  BTDeletedPageData
 
struct  BTPendingFSM
 
struct  BTVacState
 
struct  BTStackData
 
struct  BTScanInsertData
 
struct  BTInsertStateData
 
struct  BTDedupInterval
 
struct  BTDedupStateData
 
struct  BTVacuumPostingData
 
struct  BTScanPosItem
 
struct  BTScanPosData
 
struct  BTArrayKeyInfo
 
struct  BTScanOpaqueData
 
struct  BTOptions
 

Macros

#define BTPageGetOpaque(page)   ((BTPageOpaque) PageGetSpecialPointer(page))
 
#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */
 
#define BTP_ROOT   (1 << 1) /* root page (has no parent) */
 
#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */
 
#define BTP_META   (1 << 3) /* meta-page */
 
#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */
 
#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */
 
#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples (deprecated) */
 
#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */
 
#define BTP_HAS_FULLXID   (1 << 8) /* contains BTDeletedPageData */
 
#define MAX_BT_CYCLE_ID   0xFF7F
 
#define BTPageGetMeta(p)    ((BTMetaPageData *) PageGetContents(p))
 
#define BTREE_METAPAGE   0 /* first page is meta */
 
#define BTREE_MAGIC   0x053162 /* magic number in metapage */
 
#define BTREE_VERSION   4 /* current version number */
 
#define BTREE_MIN_VERSION   2 /* minimum supported version */
 
#define BTREE_NOVAC_VERSION   3 /* version with all meta fields set */
 
#define BTMaxItemSize
 
#define BTMaxItemSizeNoHeapTid
 
#define MaxTIDsPerBTreePage
 
#define BTREE_MIN_FILLFACTOR   10
 
#define BTREE_DEFAULT_FILLFACTOR   90
 
#define BTREE_NONLEAF_FILLFACTOR   70
 
#define BTREE_SINGLEVAL_FILLFACTOR   96
 
#define P_NONE   0
 
#define P_LEFTMOST(opaque)   ((opaque)->btpo_prev == P_NONE)
 
#define P_RIGHTMOST(opaque)   ((opaque)->btpo_next == P_NONE)
 
#define P_ISLEAF(opaque)   (((opaque)->btpo_flags & BTP_LEAF) != 0)
 
#define P_ISROOT(opaque)   (((opaque)->btpo_flags & BTP_ROOT) != 0)
 
#define P_ISDELETED(opaque)   (((opaque)->btpo_flags & BTP_DELETED) != 0)
 
#define P_ISMETA(opaque)   (((opaque)->btpo_flags & BTP_META) != 0)
 
#define P_ISHALFDEAD(opaque)   (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)
 
#define P_IGNORE(opaque)   (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
 
#define P_HAS_GARBAGE(opaque)   (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
 
#define P_INCOMPLETE_SPLIT(opaque)   (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
 
#define P_HAS_FULLXID(opaque)   (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
 
#define P_HIKEY   ((OffsetNumber) 1)
 
#define P_FIRSTKEY   ((OffsetNumber) 2)
 
#define P_FIRSTDATAKEY(opaque)   (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
 
#define INDEX_ALT_TID_MASK   INDEX_AM_RESERVED_BIT
 
#define BT_OFFSET_MASK   0x0FFF
 
#define BT_STATUS_OFFSET_MASK   0xF000
 
#define BT_PIVOT_HEAP_TID_ATTR   0x1000
 
#define BT_IS_POSTING   0x2000
 
#define BTreeTupleGetNAtts(itup, rel)
 
#define BTCommuteStrategyNumber(strat)   (BTMaxStrategyNumber + 1 - (strat))
 
#define BTORDER_PROC   1
 
#define BTSORTSUPPORT_PROC   2
 
#define BTINRANGE_PROC   3
 
#define BTEQUALIMAGE_PROC   4
 
#define BTOPTIONS_PROC   5
 
#define BTSKIPSUPPORT_PROC   6
 
#define BTNProcs   6
 
#define BT_READ   BUFFER_LOCK_SHARE
 
#define BT_WRITE   BUFFER_LOCK_EXCLUSIVE
 
#define BTScanPosIsPinned(scanpos)
 
#define BTScanPosUnpin(scanpos)
 
#define BTScanPosUnpinIfPinned(scanpos)
 
#define BTScanPosIsValid(scanpos)
 
#define BTScanPosInvalidate(scanpos)
 
#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */
 
#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */
 
#define SK_BT_SKIP   0x00040000 /* skip array on column without input = */
 
#define SK_BT_MINVAL   0x00080000 /* invalid sk_argument, use low_compare */
 
#define SK_BT_MAXVAL   0x00100000 /* invalid sk_argument, use high_compare */
 
#define SK_BT_NEXT   0x00200000 /* positions the scan > sk_argument */
 
#define SK_BT_PRIOR   0x00400000 /* positions the scan < sk_argument */
 
#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */
 
#define SK_BT_DESC   (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
 
#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
 
#define BTGetFillFactor(relation)
 
#define BTGetTargetPageFreeSpace(relation)    (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)
 
#define BTGetDeduplicateItems(relation)
 
#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4
 
#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5
 

Typedefs

typedef uint16 BTCycleId
 
typedef struct BTPageOpaqueData BTPageOpaqueData
 
typedef BTPageOpaqueData * BTPageOpaque
 
typedef struct BTMetaPageData BTMetaPageData
 
typedef struct BTDeletedPageData BTDeletedPageData
 
typedef struct BTPendingFSM BTPendingFSM
 
typedef struct BTVacState BTVacState
 
typedef struct BTStackData BTStackData
 
typedef BTStackData * BTStack
 
typedef struct BTScanInsertData BTScanInsertData
 
typedef BTScanInsertData * BTScanInsert
 
typedef struct BTInsertStateData BTInsertStateData
 
typedef BTInsertStateData * BTInsertState
 
typedef struct BTDedupInterval BTDedupInterval
 
typedef struct BTDedupStateData BTDedupStateData
 
typedef BTDedupStateData * BTDedupState
 
typedef struct BTVacuumPostingData BTVacuumPostingData
 
typedef BTVacuumPostingData * BTVacuumPosting
 
typedef struct BTScanPosItem BTScanPosItem
 
typedef struct BTScanPosData BTScanPosData
 
typedef BTScanPosData * BTScanPos
 
typedef struct BTArrayKeyInfo BTArrayKeyInfo
 
typedef struct BTScanOpaqueData BTScanOpaqueData
 
typedef BTScanOpaqueData * BTScanOpaque
 
typedef struct BTOptions BTOptions
 

Functions

static void BTPageSetDeleted (Page page, FullTransactionId safexid)
 
static FullTransactionId BTPageGetDeleteXid (Page page)
 
static bool BTPageIsRecyclable (Page page, Relation heaprel)
 
 StaticAssertDecl (BT_OFFSET_MASK >= INDEX_MAX_KEYS, "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS")
 
static bool BTreeTupleIsPivot (IndexTuple itup)
 
static bool BTreeTupleIsPosting (IndexTuple itup)
 
static void BTreeTupleSetPosting (IndexTuple itup, uint16 nhtids, int postingoffset)
 
static uint16 BTreeTupleGetNPosting (IndexTuple posting)
 
static uint32 BTreeTupleGetPostingOffset (IndexTuple posting)
 
static ItemPointer BTreeTupleGetPosting (IndexTuple posting)
 
static ItemPointer BTreeTupleGetPostingN (IndexTuple posting, int n)
 
static BlockNumber BTreeTupleGetDownLink (IndexTuple pivot)
 
static void BTreeTupleSetDownLink (IndexTuple pivot, BlockNumber blkno)
 
static void BTreeTupleSetNAtts (IndexTuple itup, uint16 nkeyatts, bool heaptid)
 
static BlockNumber BTreeTupleGetTopParent (IndexTuple leafhikey)
 
static void BTreeTupleSetTopParent (IndexTuple leafhikey, BlockNumber blkno)
 
static ItemPointer BTreeTupleGetHeapTID (IndexTuple itup)
 
static ItemPointer BTreeTupleGetMaxHeapTID (IndexTuple itup)
 
void btbuildempty (Relation index)
 
bool btinsert (Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo)
 
IndexScanDesc btbeginscan (Relation rel, int nkeys, int norderbys)
 
Size btestimateparallelscan (Relation rel, int nkeys, int norderbys)
 
void btinitparallelscan (void *target)
 
bool btgettuple (IndexScanDesc scan, ScanDirection dir)
 
int64 btgetbitmap (IndexScanDesc scan, TIDBitmap *tbm)
 
void btrescan (IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
 
void btparallelrescan (IndexScanDesc scan)
 
void btendscan (IndexScanDesc scan)
 
void btmarkpos (IndexScanDesc scan)
 
void btrestrpos (IndexScanDesc scan)
 
IndexBulkDeleteResult * btbulkdelete (IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
 
IndexBulkDeleteResult * btvacuumcleanup (IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
bool btcanreturn (Relation index, int attno)
 
int btgettreeheight (Relation rel)
 
CompareType bttranslatestrategy (StrategyNumber strategy, Oid opfamily)
 
StrategyNumber bttranslatecmptype (CompareType cmptype, Oid opfamily)
 
bool _bt_parallel_seize (IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
 
void _bt_parallel_release (IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
 
void _bt_parallel_done (IndexScanDesc scan)
 
void _bt_parallel_primscan_schedule (IndexScanDesc scan, BlockNumber curr_page)
 
void _bt_dedup_pass (Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, bool bottomupdedup)
 
bool _bt_bottomupdel_pass (Relation rel, Buffer buf, Relation heapRel, Size newitemsz)
 
void _bt_dedup_start_pending (BTDedupState state, IndexTuple base, OffsetNumber baseoff)
 
bool _bt_dedup_save_htid (BTDedupState state, IndexTuple itup)
 
Size _bt_dedup_finish_pending (Page newpage, BTDedupState state)
 
IndexTuple _bt_form_posting (IndexTuple base, const ItemPointerData *htids, int nhtids)
 
void _bt_update_posting (BTVacuumPosting vacposting)
 
IndexTuple _bt_swap_posting (IndexTuple newitem, IndexTuple oposting, int postingoff)
 
bool _bt_doinsert (Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
 
void _bt_finish_split (Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
 
Buffer _bt_getstackbuf (Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
 
OffsetNumber _bt_findsplitloc (Relation rel, Page origpage, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
 
void _bt_initmetapage (Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
 
bool _bt_vacuum_needs_cleanup (Relation rel)
 
void _bt_set_cleanup_info (Relation rel, BlockNumber num_delpages)
 
void _bt_upgrademetapage (Page page)
 
Buffer _bt_getroot (Relation rel, Relation heaprel, int access)
 
Buffer _bt_gettrueroot (Relation rel)
 
int _bt_getrootheight (Relation rel)
 
void _bt_metaversion (Relation rel, bool *heapkeyspace, bool *allequalimage)
 
void _bt_checkpage (Relation rel, Buffer buf)
 
Buffer _bt_getbuf (Relation rel, BlockNumber blkno, int access)
 
Buffer _bt_allocbuf (Relation rel, Relation heaprel)
 
Buffer _bt_relandgetbuf (Relation rel, Buffer obuf, BlockNumber blkno, int access)
 
void _bt_relbuf (Relation rel, Buffer buf)
 
void _bt_lockbuf (Relation rel, Buffer buf, int access)
 
void _bt_unlockbuf (Relation rel, Buffer buf)
 
bool _bt_conditionallockbuf (Relation rel, Buffer buf)
 
void _bt_upgradelockbufcleanup (Relation rel, Buffer buf)
 
void _bt_pageinit (Page page, Size size)
 
void _bt_delitems_vacuum (Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
 
void _bt_delitems_delete_check (Relation rel, Buffer buf, Relation heapRel, struct TM_IndexDeleteOp *delstate)
 
void _bt_pagedel (Relation rel, Buffer leafbuf, BTVacState *vstate)
 
void _bt_pendingfsm_init (Relation rel, BTVacState *vstate, bool cleanuponly)
 
void _bt_pendingfsm_finalize (Relation rel, BTVacState *vstate)
 
void _bt_preprocess_keys (IndexScanDesc scan)
 
bool _bt_readpage (IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool firstpage)
 
void _bt_start_array_keys (IndexScanDesc scan, ScanDirection dir)
 
int _bt_binsrch_array_skey (FmgrInfo *orderproc, bool cur_elem_trig, ScanDirection dir, Datum tupdatum, bool tupnull, BTArrayKeyInfo *array, ScanKey cur, int32 *set_elem_result)
 
BTStack _bt_search (Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
 
OffsetNumber _bt_binsrch_insert (Relation rel, BTInsertState insertstate)
 
int32 _bt_compare (Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost)
 
BTScanInsert _bt_mkscankey (Relation rel, IndexTuple itup)
 
void _bt_freestack (BTStack stack)
 
void _bt_killitems (IndexScanDesc scan)
 
BTCycleId _bt_vacuum_cycleid (Relation rel)
 
BTCycleId _bt_start_vacuum (Relation rel)
 
void _bt_end_vacuum (Relation rel)
 
void _bt_end_vacuum_callback (int code, Datum arg)
 
Size BTreeShmemSize (void)
 
void BTreeShmemInit (void)
 
bytea * btoptions (Datum reloptions, bool validate)
 
bool btproperty (Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull)
 
char * btbuildphasename (int64 phasenum)
 
IndexTuple _bt_truncate (Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
 
int _bt_keep_natts_fast (Relation rel, IndexTuple lastleft, IndexTuple firstright)
 
bool _bt_check_natts (Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
 
void _bt_check_third_page (Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
 
bool _bt_allequalimage (Relation rel, bool debugmessage)
 
bool btvalidate (Oid opclassoid)
 
void btadjustmembers (Oid opfamilyoid, Oid opclassoid, List *operators, List *functions)
 
IndexBuildResult * btbuild (Relation heap, Relation index, struct IndexInfo *indexInfo)
 
void _bt_parallel_build_main (dsm_segment *seg, shm_toc *toc)
 

Macro Definition Documentation

◆ BT_IS_POSTING

#define BT_IS_POSTING   0x2000

Definition at line 467 of file nbtree.h.

◆ BT_OFFSET_MASK

#define BT_OFFSET_MASK   0x0FFF

Definition at line 463 of file nbtree.h.

◆ BT_PIVOT_HEAP_TID_ATTR

#define BT_PIVOT_HEAP_TID_ATTR   0x1000

Definition at line 466 of file nbtree.h.

◆ BT_READ

#define BT_READ   BUFFER_LOCK_SHARE

Definition at line 730 of file nbtree.h.

◆ BT_STATUS_OFFSET_MASK

#define BT_STATUS_OFFSET_MASK   0xF000

Definition at line 464 of file nbtree.h.

◆ BT_WRITE

#define BT_WRITE   BUFFER_LOCK_EXCLUSIVE

Definition at line 731 of file nbtree.h.
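
A minimal usage sketch (not part of nbtree.h; the helper name and variables are illustrative): BT_READ and BT_WRITE are the lock modes accepted by the nbtree buffer routines such as _bt_getbuf() and _bt_lockbuf().

static void
example_read_btree_block(Relation rel, BlockNumber blkno)
{
    Buffer      buf;
    Page        page;

    /* pin the block and take BUFFER_LOCK_SHARE on it */
    buf = _bt_getbuf(rel, blkno, BT_READ);
    page = BufferGetPage(buf);

    /* ... examine the page while the shared lock is held ... */
    (void) page;

    /* release the lock and the pin together */
    _bt_relbuf(rel, buf);
}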

◆ BTCommuteStrategyNumber

#define BTCommuteStrategyNumber (   strat)    (BTMaxStrategyNumber + 1 - (strat))

Definition at line 686 of file nbtree.h.
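
For example, since BTMaxStrategyNumber is 5: BTCommuteStrategyNumber(BTLessStrategyNumber) = 5 + 1 - 1 = 5 = BTGreaterStrategyNumber, BTCommuteStrategyNumber(BTLessEqualStrategyNumber) = 5 + 1 - 2 = 4 = BTGreaterEqualStrategyNumber, and BTEqualStrategyNumber (3) commutes to itself.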

◆ BTEQUALIMAGE_PROC

#define BTEQUALIMAGE_PROC   4

Definition at line 720 of file nbtree.h.

◆ BTGetDeduplicateItems

#define BTGetDeduplicateItems (   relation)
Value:
(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
relation->rd_rel->relam == BTREE_AM_OID), \
((relation)->rd_options ? \
((BTOptions *) (relation)->rd_options)->deduplicate_items : true))
#define AssertMacro(condition)
Definition: c.h:872

Definition at line 1135 of file nbtree.h.

◆ BTGetFillFactor

#define BTGetFillFactor (   relation)
Value:
(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
relation->rd_rel->relam == BTREE_AM_OID), \
(relation)->rd_options ? \
((BTOptions *) (relation)->rd_options)->fillfactor : \
BTREE_DEFAULT_FILLFACTOR)

Definition at line 1127 of file nbtree.h.

◆ BTGetTargetPageFreeSpace

#define BTGetTargetPageFreeSpace (   relation)     (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)

Definition at line 1133 of file nbtree.h.
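
Worked example, assuming the standard 8192-byte BLCKSZ: with the default fillfactor of 90, BTGetTargetPageFreeSpace(relation) evaluates to 8192 * (100 - 90) / 100 = 819 bytes of free space to be left behind on each leaf page.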

◆ BTINRANGE_PROC

#define BTINRANGE_PROC   3

Definition at line 719 of file nbtree.h.

◆ BTMaxItemSize

#define BTMaxItemSize
Value:
(MAXALIGN_DOWN((BLCKSZ - \
MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3) - \
MAXALIGN(sizeof(ItemPointerData)))
#define SizeOfPageHeaderData
Definition: bufpage.h:216
#define MAXALIGN_DOWN(LEN)
Definition: c.h:836
#define MAXALIGN(LEN)
Definition: c.h:824

Definition at line 165 of file nbtree.h.
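
Assuming the standard 8192-byte BLCKSZ, BTMaxItemSize evaluates to 2704 bytes, and BTMaxItemSizeNoHeapTid (below) to 2712 bytes; these are the limits quoted by the "index row size ... exceeds btree version ... maximum ..." error raised from _bt_check_third_page().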

◆ BTMaxItemSizeNoHeapTid

#define BTMaxItemSizeNoHeapTid
Value:
MAXALIGN_DOWN((BLCKSZ - \
MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3)

Definition at line 170 of file nbtree.h.

◆ BTNProcs

#define BTNProcs   6

Definition at line 723 of file nbtree.h.

◆ BTOPTIONS_PROC

#define BTOPTIONS_PROC   5

Definition at line 721 of file nbtree.h.

◆ BTORDER_PROC

#define BTORDER_PROC   1

Definition at line 717 of file nbtree.h.

◆ BTP_DELETED

#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */

Definition at line 79 of file nbtree.h.

◆ BTP_HALF_DEAD

#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */

Definition at line 81 of file nbtree.h.

◆ BTP_HAS_FULLXID

#define BTP_HAS_FULLXID   (1 << 8) /* contains BTDeletedPageData */

Definition at line 85 of file nbtree.h.

◆ BTP_HAS_GARBAGE

#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples (deprecated) */

Definition at line 83 of file nbtree.h.

◆ BTP_INCOMPLETE_SPLIT

#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */

Definition at line 84 of file nbtree.h.

◆ BTP_LEAF

#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */

Definition at line 77 of file nbtree.h.

◆ BTP_META

#define BTP_META   (1 << 3) /* meta-page */

Definition at line 80 of file nbtree.h.

◆ BTP_ROOT

#define BTP_ROOT   (1 << 1) /* root page (has no parent) */

Definition at line 78 of file nbtree.h.

◆ BTP_SPLIT_END

#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */

Definition at line 82 of file nbtree.h.

◆ BTPageGetMeta

#define BTPageGetMeta (   p)     ((BTMetaPageData *) PageGetContents(p))

Definition at line 122 of file nbtree.h.

◆ BTPageGetOpaque

#define BTPageGetOpaque (   page)    ((BTPageOpaque) PageGetSpecialPointer(page))

Definition at line 74 of file nbtree.h.
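
A hypothetical sketch (the helper name is made up) of how BTPageGetOpaque is typically combined with the P_* flag macros once a btree page has been read into a buffer:

static bool
example_page_is_live_leaf(Page page)
{
    BTPageOpaque opaque = BTPageGetOpaque(page);

    /* skip pages that are deleted, half-dead, or mid-split */
    if (P_IGNORE(opaque) || P_INCOMPLETE_SPLIT(opaque))
        return false;

    return P_ISLEAF(opaque);
}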

◆ BTREE_DEFAULT_FILLFACTOR

#define BTREE_DEFAULT_FILLFACTOR   90

Definition at line 201 of file nbtree.h.

◆ BTREE_MAGIC

#define BTREE_MAGIC   0x053162 /* magic number in metapage */

Definition at line 150 of file nbtree.h.

◆ BTREE_METAPAGE

#define BTREE_METAPAGE   0 /* first page is meta */

Definition at line 149 of file nbtree.h.

◆ BTREE_MIN_FILLFACTOR

#define BTREE_MIN_FILLFACTOR   10

Definition at line 200 of file nbtree.h.

◆ BTREE_MIN_VERSION

#define BTREE_MIN_VERSION   2 /* minimum supported version */

Definition at line 152 of file nbtree.h.

◆ BTREE_NONLEAF_FILLFACTOR

#define BTREE_NONLEAF_FILLFACTOR   70

Definition at line 202 of file nbtree.h.

◆ BTREE_NOVAC_VERSION

#define BTREE_NOVAC_VERSION   3 /* version with all meta fields set */

Definition at line 153 of file nbtree.h.

◆ BTREE_SINGLEVAL_FILLFACTOR

#define BTREE_SINGLEVAL_FILLFACTOR   96

Definition at line 203 of file nbtree.h.

◆ BTREE_VERSION

#define BTREE_VERSION   4 /* current version number */

Definition at line 151 of file nbtree.h.

◆ BTreeTupleGetNAtts

#define BTreeTupleGetNAtts (   itup,
  rel 
)
Value:
( \
(BTreeTupleIsPivot(itup)) ? \
( \
ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_OFFSET_MASK \
) \
: \
IndexRelationGetNumberOfAttributes(rel) \
)
static OffsetNumber ItemPointerGetOffsetNumberNoCheck(const ItemPointerData *pointer)
Definition: itemptr.h:114
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:481
#define BT_OFFSET_MASK
Definition: nbtree.h:463
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:527

Definition at line 578 of file nbtree.h.

◆ BTScanPosInvalidate

#define BTScanPosInvalidate (   scanpos)
Value:
do { \
(scanpos).buf = InvalidBuffer; \
(scanpos).currPage = InvalidBlockNumber; \
} while (0)
#define InvalidBlockNumber
Definition: block.h:33
#define InvalidBuffer
Definition: buf.h:25

Definition at line 1027 of file nbtree.h.

◆ BTScanPosIsPinned

#define BTScanPosIsPinned (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BufferIsValid((scanpos).buf) \
)
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:387

Definition at line 1004 of file nbtree.h.

◆ BTScanPosIsValid

#define BTScanPosIsValid (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BlockNumberIsValid((scanpos).currPage) \
)

Definition at line 1021 of file nbtree.h.

◆ BTScanPosUnpin

#define BTScanPosUnpin (   scanpos)
Value:
do { \
ReleaseBuffer((scanpos).buf); \
(scanpos).buf = InvalidBuffer; \
} while (0)

Definition at line 1010 of file nbtree.h.

◆ BTScanPosUnpinIfPinned

#define BTScanPosUnpinIfPinned (   scanpos)
Value:
do { \
if (BTScanPosIsPinned(scanpos)) \
BTScanPosUnpin(scanpos); \
} while (0)
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:1004

Definition at line 1015 of file nbtree.h.
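
A hedged sketch of the cleanup pattern these BTScanPos* macros support, loosely modeled on what btendscan() and btrescan() do (the helper name is illustrative):

static void
example_release_scan_position(IndexScanDesc scan)
{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* drop the leaf page pin if one is still held, then forget the position */
    BTScanPosUnpinIfPinned(so->currPos);
    BTScanPosInvalidate(so->currPos);
}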

◆ BTSKIPSUPPORT_PROC

#define BTSKIPSUPPORT_PROC   6

Definition at line 722 of file nbtree.h.

◆ BTSORTSUPPORT_PROC

#define BTSORTSUPPORT_PROC   2

Definition at line 718 of file nbtree.h.

◆ INDEX_ALT_TID_MASK

#define INDEX_ALT_TID_MASK   INDEX_AM_RESERVED_BIT

Definition at line 460 of file nbtree.h.

◆ MAX_BT_CYCLE_ID

#define MAX_BT_CYCLE_ID   0xFF7F

Definition at line 94 of file nbtree.h.

◆ MaxTIDsPerBTreePage

#define MaxTIDsPerBTreePage
Value:
(int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \
sizeof(ItemPointerData))

Definition at line 186 of file nbtree.h.
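
Assuming the standard 8192-byte BLCKSZ (24-byte page header, 16-byte BTPageOpaqueData special space), this evaluates to (8192 - 24 - 16) / 6 = 1358 item pointers, the largest number of heap TIDs a single leaf page can hold; it is used, for example, to size the items array in BTScanPosData.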

◆ P_FIRSTDATAKEY

#define P_FIRSTDATAKEY (   opaque)    (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)

Definition at line 370 of file nbtree.h.

◆ P_FIRSTKEY

#define P_FIRSTKEY   ((OffsetNumber) 2)

Definition at line 369 of file nbtree.h.

◆ P_HAS_FULLXID

#define P_HAS_FULLXID (   opaque)    (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)

Definition at line 229 of file nbtree.h.

◆ P_HAS_GARBAGE

#define P_HAS_GARBAGE (   opaque)    (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)

Definition at line 227 of file nbtree.h.

◆ P_HIKEY

#define P_HIKEY   ((OffsetNumber) 1)

Definition at line 368 of file nbtree.h.

◆ P_IGNORE

#define P_IGNORE (   opaque)    (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)

Definition at line 226 of file nbtree.h.

◆ P_INCOMPLETE_SPLIT

#define P_INCOMPLETE_SPLIT (   opaque)    (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)

Definition at line 228 of file nbtree.h.

◆ P_ISDELETED

#define P_ISDELETED (   opaque)    (((opaque)->btpo_flags & BTP_DELETED) != 0)

Definition at line 223 of file nbtree.h.

◆ P_ISHALFDEAD

#define P_ISHALFDEAD (   opaque)    (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)

Definition at line 225 of file nbtree.h.

◆ P_ISLEAF

#define P_ISLEAF (   opaque)    (((opaque)->btpo_flags & BTP_LEAF) != 0)

Definition at line 221 of file nbtree.h.

◆ P_ISMETA

#define P_ISMETA (   opaque)    (((opaque)->btpo_flags & BTP_META) != 0)

Definition at line 224 of file nbtree.h.

◆ P_ISROOT

#define P_ISROOT (   opaque)    (((opaque)->btpo_flags & BTP_ROOT) != 0)

Definition at line 222 of file nbtree.h.

◆ P_LEFTMOST

#define P_LEFTMOST (   opaque)    ((opaque)->btpo_prev == P_NONE)

Definition at line 219 of file nbtree.h.

◆ P_NONE

#define P_NONE   0

Definition at line 213 of file nbtree.h.

◆ P_RIGHTMOST

#define P_RIGHTMOST (   opaque)    ((opaque)->btpo_next == P_NONE)

Definition at line 220 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN

#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2

Definition at line 1146 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_LEAF_LOAD

#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5

Definition at line 1149 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_1

#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3

Definition at line 1147 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_2

#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4

Definition at line 1148 of file nbtree.h.

◆ SK_BT_DESC

#define SK_BT_DESC   (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)

Definition at line 1116 of file nbtree.h.

◆ SK_BT_INDOPTION_SHIFT

#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */

Definition at line 1115 of file nbtree.h.

◆ SK_BT_MAXVAL

#define SK_BT_MAXVAL   0x00100000 /* invalid sk_argument, use high_compare */

Definition at line 1110 of file nbtree.h.

◆ SK_BT_MINVAL

#define SK_BT_MINVAL   0x00080000 /* invalid sk_argument, use low_compare */

Definition at line 1109 of file nbtree.h.

◆ SK_BT_NEXT

#define SK_BT_NEXT   0x00200000 /* positions the scan > sk_argument */

Definition at line 1111 of file nbtree.h.

◆ SK_BT_NULLS_FIRST

#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)

Definition at line 1117 of file nbtree.h.

◆ SK_BT_PRIOR

#define SK_BT_PRIOR   0x00400000 /* positions the scan < sk_argument */

Definition at line 1112 of file nbtree.h.

◆ SK_BT_REQBKWD

#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */

Definition at line 1105 of file nbtree.h.

◆ SK_BT_REQFWD

#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */

Definition at line 1104 of file nbtree.h.

◆ SK_BT_SKIP

#define SK_BT_SKIP   0x00040000 /* skip array on column without input = */

Definition at line 1106 of file nbtree.h.

Typedef Documentation

◆ BTArrayKeyInfo

◆ BTCycleId

typedef uint16 BTCycleId

Definition at line 30 of file nbtree.h.

◆ BTDedupInterval

◆ BTDedupState

Definition at line 904 of file nbtree.h.

◆ BTDedupStateData

◆ BTDeletedPageData

◆ BTInsertState

Definition at line 846 of file nbtree.h.

◆ BTInsertStateData

◆ BTMetaPageData

◆ BTOptions

typedef struct BTOptions BTOptions

◆ BTPageOpaque

Definition at line 72 of file nbtree.h.

◆ BTPageOpaqueData

◆ BTPendingFSM

typedef struct BTPendingFSM BTPendingFSM

◆ BTScanInsert

Definition at line 807 of file nbtree.h.

◆ BTScanInsertData

◆ BTScanOpaque

Definition at line 1097 of file nbtree.h.

◆ BTScanOpaqueData

◆ BTScanPos

Definition at line 1002 of file nbtree.h.

◆ BTScanPosData

typedef struct BTScanPosData BTScanPosData

◆ BTScanPosItem

typedef struct BTScanPosItem BTScanPosItem

◆ BTStack

typedef BTStackData* BTStack

Definition at line 750 of file nbtree.h.

◆ BTStackData

typedef struct BTStackData BTStackData

◆ BTVacState

typedef struct BTVacState BTVacState

◆ BTVacuumPosting

Definition at line 925 of file nbtree.h.

◆ BTVacuumPostingData

Function Documentation

◆ _bt_allequalimage()

bool _bt_allequalimage ( Relation  rel,
bool  debugmessage 
)

Definition at line 1181 of file nbtutils.c.

1182{
1183 bool allequalimage = true;
1184
1185 /* INCLUDE indexes can never support deduplication */
1186 if (IndexRelationGetNumberOfAttributes(rel) !=
1187 IndexRelationGetNumberOfKeyAttributes(rel))
1188 return false;
1189
1190 for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++)
1191 {
1192 Oid opfamily = rel->rd_opfamily[i];
1193 Oid opcintype = rel->rd_opcintype[i];
1194 Oid collation = rel->rd_indcollation[i];
1195 Oid equalimageproc;
1196
1197 equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype,
1198 BTEQUALIMAGE_PROC);
1199
1200 /*
1201 * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to
1202 * be unsafe. Otherwise, actually call proc and see what it says.
1203 */
1204 if (!OidIsValid(equalimageproc) ||
1205 !DatumGetBool(OidFunctionCall1Coll(equalimageproc, collation,
1206 ObjectIdGetDatum(opcintype))))
1207 {
1208 allequalimage = false;
1209 break;
1210 }
1211 }
1212
1213 if (debugmessage)
1214 {
1215 if (allequalimage)
1216 elog(DEBUG1, "index \"%s\" can safely use deduplication",
1217 RelationGetRelationName(rel));
1218 else
1219 elog(DEBUG1, "index \"%s\" cannot use deduplication",
1220 RelationGetRelationName(rel));
1221 }
1222
1223 return allequalimage;
1224}
#define OidIsValid(objectId)
Definition: c.h:788
#define DEBUG1
Definition: elog.h:30
#define elog(elevel,...)
Definition: elog.h:226
Datum OidFunctionCall1Coll(Oid functionId, Oid collation, Datum arg1)
Definition: fmgr.c:1412
int i
Definition: isn.c:77
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:887
#define BTEQUALIMAGE_PROC
Definition: nbtree.h:720
static bool DatumGetBool(Datum X)
Definition: postgres.h:100
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:262
unsigned int Oid
Definition: postgres_ext.h:32
#define RelationGetRelationName(relation)
Definition: rel.h:549
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:534
Oid * rd_opcintype
Definition: rel.h:208
Oid * rd_opfamily
Definition: rel.h:207
Oid * rd_indcollation
Definition: rel.h:217

References BTEQUALIMAGE_PROC, DatumGetBool(), DEBUG1, elog, get_opfamily_proc(), i, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, ObjectIdGetDatum(), OidFunctionCall1Coll(), OidIsValid, RelationData::rd_indcollation, RelationData::rd_opcintype, RelationData::rd_opfamily, and RelationGetRelationName.

Referenced by _bt_leafbuild(), bt_index_check_callback(), and btbuildempty().

◆ _bt_allocbuf()

Buffer _bt_allocbuf ( Relation  rel,
Relation  heaprel 
)

Definition at line 870 of file nbtpage.c.

871{
872 Buffer buf;
873 BlockNumber blkno;
874 Page page;
875
876 Assert(heaprel != NULL);
877
878 /*
879 * First see if the FSM knows of any free pages.
880 *
881 * We can't trust the FSM's report unreservedly; we have to check that the
882 * page is still free. (For example, an already-free page could have been
883 * re-used between the time the last VACUUM scanned it and the time the
884 * VACUUM made its FSM updates.)
885 *
886 * In fact, it's worse than that: we can't even assume that it's safe to
887 * take a lock on the reported page. If somebody else has a lock on it,
888 * or even worse our own caller does, we could deadlock. (The own-caller
889 * scenario is actually not improbable. Consider an index on a serial or
890 * timestamp column. Nearly all splits will be at the rightmost page, so
891 * it's entirely likely that _bt_split will call us while holding a lock
892 * on the page most recently acquired from FSM. A VACUUM running
893 * concurrently with the previous split could well have placed that page
894 * back in FSM.)
895 *
896 * To get around that, we ask for only a conditional lock on the reported
897 * page. If we fail, then someone else is using the page, and we may
898 * reasonably assume it's not free. (If we happen to be wrong, the worst
899 * consequence is the page will be lost to use till the next VACUUM, which
900 * is no big problem.)
901 */
902 for (;;)
903 {
904 blkno = GetFreeIndexPage(rel);
905 if (blkno == InvalidBlockNumber)
906 break;
907 buf = ReadBuffer(rel, blkno);
908 if (_bt_conditionallockbuf(rel, buf))
909 {
910 page = BufferGetPage(buf);
911
912 /*
913 * It's possible to find an all-zeroes page in an index. For
914 * example, a backend might successfully extend the relation one
915 * page and then crash before it is able to make a WAL entry for
916 * adding the page. If we find a zeroed page then reclaim it
917 * immediately.
918 */
919 if (PageIsNew(page))
920 {
921 /* Okay to use page. Initialize and return it. */
922 _bt_pageinit(page, BufferGetPageSize(buf));
923 return buf;
924 }
925
926 if (BTPageIsRecyclable(page, heaprel))
927 {
928 /*
929 * If we are generating WAL for Hot Standby then create a WAL
930 * record that will allow us to conflict with queries running
931 * on standby, in case they have snapshots older than safexid
932 * value
933 */
934 if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
935 {
936 xl_btree_reuse_page xlrec_reuse;
937
938 /*
939 * Note that we don't register the buffer with the record,
940 * because this operation doesn't modify the page (that
941 * already happened, back when VACUUM deleted the page).
942 * This record only exists to provide a conflict point for
943 * Hot Standby. See record REDO routine comments.
944 */
945 xlrec_reuse.locator = rel->rd_locator;
946 xlrec_reuse.block = blkno;
947 xlrec_reuse.snapshotConflictHorizon = BTPageGetDeleteXid(page);
948 xlrec_reuse.isCatalogRel =
949 RelationIsAccessibleInLogicalDecoding(heaprel);
950
951 XLogBeginInsert();
952 XLogRegisterData(&xlrec_reuse, SizeOfBtreeReusePage);
953
954 XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
955 }
956
957 /* Okay to use page. Re-initialize and return it. */
958 _bt_pageinit(page, BufferGetPageSize(buf));
959 return buf;
960 }
961 elog(DEBUG2, "FSM returned nonrecyclable page");
962 _bt_relbuf(rel, buf);
963 }
964 else
965 {
966 elog(DEBUG2, "FSM returned nonlockable page");
967 /* couldn't get lock, so just drop pin */
969 }
970 }
971
972 /*
973 * Extend the relation by one page. Need to use RBM_ZERO_AND_LOCK or we
974 * risk a race condition against btvacuumscan --- see comments therein.
975 * This forces us to repeat the valgrind request that _bt_lockbuf()
976 * otherwise would make, as we can't use _bt_lockbuf() without introducing
977 * a race.
978 */
979 buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);
980 if (!RelationUsesLocalBuffers(rel))
981 VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
982
983 /* Initialize the new page before returning it */
984 page = BufferGetPage(buf);
985 Assert(PageIsNew(page));
986 _bt_pageinit(page, BufferGetPageSize(buf));
987
988 return buf;
989}
uint32 BlockNumber
Definition: block.h:31
int Buffer
Definition: buf.h:23
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition: bufmgr.c:845
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5366
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:745
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:436
static Size BufferGetPageSize(Buffer buffer)
Definition: bufmgr.h:425
@ EB_LOCK_FIRST
Definition: bufmgr.h:87
#define BMR_REL(p_rel)
Definition: bufmgr.h:114
static bool PageIsNew(const PageData *page)
Definition: bufpage.h:233
PageData * Page
Definition: bufpage.h:81
#define DEBUG2
Definition: elog.h:29
Assert(PointerIsAligned(start, uint64))
BlockNumber GetFreeIndexPage(Relation rel)
Definition: indexfsm.c:38
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1024
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:1130
bool _bt_conditionallockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1094
static FullTransactionId BTPageGetDeleteXid(Page page)
Definition: nbtree.h:261
static bool BTPageIsRecyclable(Page page, Relation heaprel)
Definition: nbtree.h:292
#define XLOG_BTREE_REUSE_PAGE
Definition: nbtxlog.h:40
#define SizeOfBtreeReusePage
Definition: nbtxlog.h:192
#define RelationIsAccessibleInLogicalDecoding(relation)
Definition: rel.h:694
#define RelationNeedsWAL(relation)
Definition: rel.h:638
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:647
@ MAIN_FORKNUM
Definition: relpath.h:58
RelFileLocator rd_locator
Definition: rel.h:57
FullTransactionId snapshotConflictHorizon
Definition: nbtxlog.h:187
RelFileLocator locator
Definition: nbtxlog.h:185
BlockNumber block
Definition: nbtxlog.h:186
#define XLogStandbyInfoActive()
Definition: xlog.h:123
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:478
void XLogRegisterData(const void *data, uint32 len)
Definition: xloginsert.c:368
void XLogBeginInsert(void)
Definition: xloginsert.c:152

References _bt_conditionallockbuf(), _bt_pageinit(), _bt_relbuf(), Assert(), xl_btree_reuse_page::block, BMR_REL, BTPageGetDeleteXid(), BTPageIsRecyclable(), buf, BufferGetPage(), BufferGetPageSize(), DEBUG2, EB_LOCK_FIRST, elog, ExtendBufferedRel(), GetFreeIndexPage(), InvalidBlockNumber, xl_btree_reuse_page::isCatalogRel, xl_btree_reuse_page::locator, MAIN_FORKNUM, PageIsNew(), RelationData::rd_locator, ReadBuffer(), RelationIsAccessibleInLogicalDecoding, RelationNeedsWAL, RelationUsesLocalBuffers, ReleaseBuffer(), SizeOfBtreeReusePage, xl_btree_reuse_page::snapshotConflictHorizon, VALGRIND_MAKE_MEM_DEFINED, XLOG_BTREE_REUSE_PAGE, XLogBeginInsert(), XLogInsert(), XLogRegisterData(), and XLogStandbyInfoActive.

Referenced by _bt_getroot(), _bt_newlevel(), and _bt_split().

◆ _bt_binsrch_array_skey()

int _bt_binsrch_array_skey ( FmgrInfo *  orderproc,
bool  cur_elem_trig,
ScanDirection  dir,
Datum  tupdatum,
bool  tupnull,
BTArrayKeyInfo *  array,
ScanKey  cur,
int32 *  set_elem_result 
)

Definition at line 3415 of file nbtreadpage.c.

3420{
3421 int low_elem = 0,
3422 mid_elem = -1,
3423 high_elem = array->num_elems - 1,
3424 result = 0;
3425 Datum arrdatum;
3426
3427 Assert(cur->sk_flags & SK_SEARCHARRAY);
3428 Assert(!(cur->sk_flags & SK_BT_SKIP));
3429 Assert(!(cur->sk_flags & SK_ISNULL)); /* SAOP arrays never have NULLs */
3430 Assert(cur->sk_strategy == BTEqualStrategyNumber);
3431
3432 if (cur_elem_trig)
3433 {
3434 Assert(!ScanDirectionIsNoMovement(dir));
3435 Assert(cur->sk_flags & SK_BT_REQFWD);
3436
3437 /*
3438 * When the scan key that triggered array advancement is a required
3439 * array scan key, it is now certain that the current array element
3440 * (plus all prior elements relative to the current scan direction)
3441 * cannot possibly be at or ahead of the corresponding tuple value.
3442 * (_bt_checkkeys must have called _bt_tuple_before_array_skeys, which
3443 * makes sure this is true as a condition of advancing the arrays.)
3444 *
3445 * This makes it safe to exclude array elements up to and including
3446 * the former-current array element from our search.
3447 *
3448 * Separately, when array advancement was triggered by a required scan
3449 * key, the array element immediately after the former-current element
3450 * is often either an exact tupdatum match, or a "close by" near-match
3451 * (a near-match tupdatum is one whose key space falls _between_ the
3452 * former-current and new-current array elements). We'll detect both
3453 * cases via an optimistic comparison of the new search lower bound
3454 * (or new search upper bound in the case of backwards scans).
3455 */
3456 if (ScanDirectionIsForward(dir))
3457 {
3458 low_elem = array->cur_elem + 1; /* old cur_elem exhausted */
3459
3460 /* Compare prospective new cur_elem (also the new lower bound) */
3461 if (high_elem >= low_elem)
3462 {
3463 arrdatum = array->elem_values[low_elem];
3464 result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
3465 arrdatum, cur);
3466
3467 if (result <= 0)
3468 {
3469 /* Optimistic comparison optimization worked out */
3470 *set_elem_result = result;
3471 return low_elem;
3472 }
3473 mid_elem = low_elem;
3474 low_elem++; /* this cur_elem exhausted, too */
3475 }
3476
3477 if (high_elem < low_elem)
3478 {
3479 /* Caller needs to perform "beyond end" array advancement */
3480 *set_elem_result = 1;
3481 return high_elem;
3482 }
3483 }
3484 else
3485 {
3486 high_elem = array->cur_elem - 1; /* old cur_elem exhausted */
3487
3488 /* Compare prospective new cur_elem (also the new upper bound) */
3489 if (high_elem >= low_elem)
3490 {
3491 arrdatum = array->elem_values[high_elem];
3492 result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
3493 arrdatum, cur);
3494
3495 if (result >= 0)
3496 {
3497 /* Optimistic comparison optimization worked out */
3498 *set_elem_result = result;
3499 return high_elem;
3500 }
3501 mid_elem = high_elem;
3502 high_elem--; /* this cur_elem exhausted, too */
3503 }
3504
3505 if (high_elem < low_elem)
3506 {
3507 /* Caller needs to perform "beyond end" array advancement */
3508 *set_elem_result = -1;
3509 return low_elem;
3510 }
3511 }
3512 }
3513
3514 while (high_elem > low_elem)
3515 {
3516 mid_elem = low_elem + ((high_elem - low_elem) / 2);
3517 arrdatum = array->elem_values[mid_elem];
3518
3519 result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
3520 arrdatum, cur);
3521
3522 if (result == 0)
3523 {
3524 /*
3525 * It's safe to quit as soon as we see an equal array element.
3526 * This often saves an extra comparison or two...
3527 */
3528 low_elem = mid_elem;
3529 break;
3530 }
3531
3532 if (result > 0)
3533 low_elem = mid_elem + 1;
3534 else
3535 high_elem = mid_elem;
3536 }
3537
3538 /*
3539 * ...but our caller also cares about how its searched-for tuple datum
3540 * compares to the low_elem datum. Must always set *set_elem_result with
3541 * the result of that comparison specifically.
3542 */
3543 if (low_elem != mid_elem)
3544 result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
3545 array->elem_values[low_elem], cur);
3546
3547 *set_elem_result = result;
3548
3549 return low_elem;
3550}
static int32 _bt_compare_array_skey(FmgrInfo *orderproc, Datum tupdatum, bool tupnull, Datum arrdatum, ScanKey cur)
Definition: nbtreadpage.c:3344
#define SK_BT_SKIP
Definition: nbtree.h:1106
#define SK_BT_REQFWD
Definition: nbtree.h:1104
uint64_t Datum
Definition: postgres.h:70
#define ScanDirectionIsForward(direction)
Definition: sdir.h:64
#define ScanDirectionIsNoMovement(direction)
Definition: sdir.h:57
#define SK_SEARCHARRAY
Definition: skey.h:120
#define SK_ISNULL
Definition: skey.h:115
#define BTEqualStrategyNumber
Definition: stratnum.h:31
Datum * elem_values
Definition: nbtree.h:1041

References _bt_compare_array_skey(), Assert(), BTEqualStrategyNumber, cur, BTArrayKeyInfo::cur_elem, BTArrayKeyInfo::elem_values, BTArrayKeyInfo::num_elems, ScanDirectionIsForward, ScanDirectionIsNoMovement, SK_BT_REQFWD, SK_BT_SKIP, SK_ISNULL, and SK_SEARCHARRAY.

Referenced by _bt_advance_array_keys(), _bt_saoparray_shrink(), and _bt_set_startikey().

◆ _bt_binsrch_insert()

OffsetNumber _bt_binsrch_insert ( Relation  rel,
BTInsertState  insertstate 
)

Definition at line 469 of file nbtsearch.c.

470{
471 BTScanInsert key = insertstate->itup_key;
472 Page page;
473 BTPageOpaque opaque;
474 OffsetNumber low,
475 high,
476 stricthigh;
477 int32 result,
478 cmpval;
479
480 page = BufferGetPage(insertstate->buf);
481 opaque = BTPageGetOpaque(page);
482
483 Assert(P_ISLEAF(opaque));
484 Assert(!key->nextkey);
485 Assert(insertstate->postingoff == 0);
486
487 if (!insertstate->bounds_valid)
488 {
489 /* Start new binary search */
490 low = P_FIRSTDATAKEY(opaque);
491 high = PageGetMaxOffsetNumber(page);
492 }
493 else
494 {
495 /* Restore result of previous binary search against same page */
496 low = insertstate->low;
497 high = insertstate->stricthigh;
498 }
499
500 /* If there are no keys on the page, return the first available slot */
501 if (unlikely(high < low))
502 {
503 /* Caller can't reuse bounds */
504 insertstate->low = InvalidOffsetNumber;
505 insertstate->stricthigh = InvalidOffsetNumber;
506 insertstate->bounds_valid = false;
507 return low;
508 }
509
510 /*
511 * Binary search to find the first key on the page >= scan key. (nextkey
512 * is always false when inserting).
513 *
514 * The loop invariant is: all slots before 'low' are < scan key, all slots
515 * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
516 * maintained to save additional search effort for caller.
517 *
518 * We can fall out when high == low.
519 */
520 if (!insertstate->bounds_valid)
521 high++; /* establish the loop invariant for high */
522 stricthigh = high; /* high initially strictly higher */
523
524 cmpval = 1; /* !nextkey comparison value */
525
526 while (high > low)
527 {
528 OffsetNumber mid = low + ((high - low) / 2);
529
530 /* We have low <= mid < high, so mid points at a real slot */
531
532 result = _bt_compare(rel, key, page, mid);
533
534 if (result >= cmpval)
535 low = mid + 1;
536 else
537 {
538 high = mid;
539 if (result != 0)
540 stricthigh = high;
541 }
542
543 /*
544 * If tuple at offset located by binary search is a posting list whose
545 * TID range overlaps with caller's scantid, perform posting list
546 * binary search to set postingoff for caller. Caller must split the
547 * posting list when postingoff is set. This should happen
548 * infrequently.
549 */
550 if (unlikely(result == 0 && key->scantid != NULL))
551 {
552 /*
553 * postingoff should never be set more than once per leaf page
554 * binary search. That would mean that there are duplicate table
555 * TIDs in the index, which is never okay. Check for that here.
556 */
557 if (insertstate->postingoff != 0)
558 ereport(ERROR,
559 (errcode(ERRCODE_INDEX_CORRUPTED),
560 errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
561 ItemPointerGetBlockNumber(key->scantid),
562 ItemPointerGetOffsetNumber(key->scantid),
563 low, stricthigh,
564 BufferGetBlockNumber(insertstate->buf),
565 RelationGetRelationName(rel))));
566
567 insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
568 }
569 }
570
571 /*
572 * On a leaf page, a binary search always returns the first key >= scan
573 * key (at least in !nextkey case), which could be the last slot + 1. This
574 * is also the lower bound of cached search.
575 *
576 * stricthigh may also be the last slot + 1, which prevents caller from
577 * using bounds directly, but is still useful to us if we're called a
578 * second time with cached bounds (cached low will be < stricthigh when
579 * that happens).
580 */
581 insertstate->low = low;
582 insertstate->stricthigh = stricthigh;
583 insertstate->bounds_valid = true;
584
585 return low;
586}
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:4223
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition: bufpage.h:371
int32_t int32
Definition: c.h:548
#define unlikely(x)
Definition: c.h:418
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1170
int errcode(int sqlerrcode)
Definition: elog.c:863
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:150
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
#define P_ISLEAF(opaque)
Definition: nbtree.h:221
#define BTPageGetOpaque(page)
Definition: nbtree.h:74
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:370
static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:597
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:683
#define InvalidOffsetNumber
Definition: off.h:26
uint16 OffsetNumber
Definition: off.h:24
OffsetNumber stricthigh
Definition: nbtree.h:836
bool bounds_valid
Definition: nbtree.h:834
OffsetNumber low
Definition: nbtree.h:835
BTScanInsert itup_key
Definition: nbtree.h:824

References _bt_binsrch_posting(), _bt_compare(), Assert(), BTInsertStateData::bounds_valid, BTPageGetOpaque, BTInsertStateData::buf, BufferGetBlockNumber(), BufferGetPage(), ereport, errcode(), errmsg_internal(), ERROR, InvalidOffsetNumber, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), BTInsertStateData::itup_key, sort-test::key, BTInsertStateData::low, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber(), BTInsertStateData::postingoff, RelationGetRelationName, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_check_unique(), _bt_findinsertloc(), and bt_rootdescend().
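
To make the loop invariant described in the comments above concrete, here is a standalone, self-contained illustration (plain C, not PostgreSQL code): all slots before low compare below the key, all slots at or after high compare at or above it, so the search converges on the first slot >= key, which may be one past the last slot.

#include <stdio.h>

/* return index of the first element >= key, or nelems if none */
static int
first_geq(const int *arr, int nelems, int key)
{
    int         low = 0;
    int         high = nelems;      /* "last slot + 1" */

    while (high > low)
    {
        int         mid = low + ((high - low) / 2);

        if (arr[mid] < key)
            low = mid + 1;          /* everything before low is < key */
        else
            high = mid;             /* everything from high on is >= key */
    }
    return low;
}

int
main(void)
{
    int         vals[] = {10, 20, 20, 30, 40};

    printf("%d\n", first_geq(vals, 5, 20));     /* prints 1 */
    printf("%d\n", first_geq(vals, 5, 45));     /* prints 5 (past the end) */
    return 0;
}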

◆ _bt_bottomupdel_pass()

bool _bt_bottomupdel_pass ( Relation  rel,
Buffer  buf,
Relation  heapRel,
Size  newitemsz 
)

Definition at line 307 of file nbtdedup.c.

309{
310 OffsetNumber offnum,
311 minoff,
312 maxoff;
313 Page page = BufferGetPage(buf);
314 BTPageOpaque opaque = BTPageGetOpaque(page);
315 BTDedupState state;
316 TM_IndexDeleteOp delstate;
317 bool neverdedup;
318 int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
319
320 /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
321 newitemsz += sizeof(ItemIdData);
322
323 /* Initialize deduplication state */
324 state = palloc_object(BTDedupStateData);
325 state->deduplicate = true;
326 state->nmaxitems = 0;
327 state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
328 state->base = NULL;
329 state->baseoff = InvalidOffsetNumber;
330 state->basetupsize = 0;
331 state->htids = palloc(state->maxpostingsize);
332 state->nhtids = 0;
333 state->nitems = 0;
334 state->phystupsize = 0;
335 state->nintervals = 0;
336
337 /*
338 * Initialize tableam state that describes bottom-up index deletion
339 * operation.
340 *
341 * We'll go on to ask the tableam to search for TIDs whose index tuples we
342 * can safely delete. The tableam will search until our leaf page space
343 * target is satisfied, or until the cost of continuing with the tableam
344 * operation seems too high. It focuses its efforts on TIDs associated
345 * with duplicate index tuples that we mark "promising".
346 *
347 * This space target is a little arbitrary. The tableam must be able to
348 * keep the costs and benefits in balance. We provide the tableam with
349 * exhaustive information about what might work, without directly
350 * concerning ourselves with avoiding work during the tableam call. Our
351 * role in costing the bottom-up deletion process is strictly advisory.
352 */
353 delstate.irel = rel;
354 delstate.iblknum = BufferGetBlockNumber(buf);
355 delstate.bottomup = true;
356 delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
357 delstate.ndeltids = 0;
358 delstate.deltids = palloc_array(TM_IndexDelete, MaxTIDsPerBTreePage);
359 delstate.status = palloc_array(TM_IndexStatus, MaxTIDsPerBTreePage);
360
361 minoff = P_FIRSTDATAKEY(opaque);
362 maxoff = PageGetMaxOffsetNumber(page);
363 for (offnum = minoff;
364 offnum <= maxoff;
365 offnum = OffsetNumberNext(offnum))
366 {
367 ItemId itemid = PageGetItemId(page, offnum);
368 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
369
370 Assert(!ItemIdIsDead(itemid));
371
372 if (offnum == minoff)
373 {
374 /* itup starts first pending interval */
375 _bt_dedup_start_pending(state, itup, offnum);
376 }
377 else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
378 _bt_dedup_save_htid(state, itup))
379 {
380 /* Tuple is equal; just added its TIDs to pending interval */
381 }
382 else
383 {
384 /* Finalize interval -- move its TIDs to delete state */
385 _bt_bottomupdel_finish_pending(page, state, &delstate);
386
387 /* itup starts new pending interval */
388 _bt_dedup_start_pending(state, itup, offnum);
389 }
390 }
391 /* Finalize final interval -- move its TIDs to delete state */
392 _bt_bottomupdel_finish_pending(page, state, &delstate);
393
394 /*
395 * We don't give up now in the event of having few (or even zero)
396 * promising tuples for the tableam because it's not up to us as the index
397 * AM to manage costs (note that the tableam might have heuristics of its
398 * own that work out what to do). We should at least avoid having our
399 * caller do a useless deduplication pass after we return in the event of
400 * zero promising tuples, though.
401 */
402 neverdedup = false;
403 if (state->nintervals == 0)
404 neverdedup = true;
405
406 pfree(state->htids);
407 pfree(state);
408
409 /* Ask tableam which TIDs are deletable, then physically delete them */
410 _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
411
412 pfree(delstate.deltids);
413 pfree(delstate.status);
414
415 /* Report "success" to caller unconditionally to avoid deduplication */
416 if (neverdedup)
417 return true;
418
419 /* Don't dedup when we won't end up back here any time soon anyway */
420 return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
421}
Size PageGetExactFreeSpace(const PageData *page)
Definition: bufpage.c:957
static void * PageGetItem(const PageData *page, const ItemIdData *itemId)
Definition: bufpage.h:353
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:243
#define Max(x, y)
Definition: c.h:1010
#define palloc_object(type)
Definition: fe_memutils.h:74
#define palloc_array(type, count)
Definition: fe_memutils.h:76
struct ItemIdData ItemIdData
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
IndexTupleData * IndexTuple
Definition: itup.h:53
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc(Size size)
Definition: mcxt.c:1365
bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
Definition: nbtdedup.c:484
void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, OffsetNumber baseoff)
Definition: nbtdedup.c:433
static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state, TM_IndexDeleteOp *delstate)
Definition: nbtdedup.c:646
void _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
Definition: nbtpage.c:1512
#define MaxTIDsPerBTreePage
Definition: nbtree.h:186
int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
Definition: nbtutils.c:917
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
TM_IndexStatus * status
Definition: tableam.h:254
int bottomupfreespace
Definition: tableam.h:249
Relation irel
Definition: tableam.h:246
TM_IndexDelete * deltids
Definition: tableam.h:253
BlockNumber iblknum
Definition: tableam.h:247

References _bt_bottomupdel_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_delitems_delete_check(), _bt_keep_natts_fast(), Assert(), TM_IndexDeleteOp::bottomup, TM_IndexDeleteOp::bottomupfreespace, BTPageGetOpaque, buf, BufferGetBlockNumber(), BufferGetPage(), TM_IndexDeleteOp::deltids, TM_IndexDeleteOp::iblknum, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, TM_IndexDeleteOp::irel, ItemIdIsDead, Max, MaxTIDsPerBTreePage, TM_IndexDeleteOp::ndeltids, OffsetNumberNext, P_FIRSTDATAKEY, PageGetExactFreeSpace(), PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), palloc(), palloc_array, palloc_object, pfree(), and TM_IndexDeleteOp::status.

Referenced by _bt_delete_or_dedup_one_page().

◆ _bt_check_natts()

bool _bt_check_natts ( Relation  rel,
bool  heapkeyspace,
Page  page,
OffsetNumber  offnum 
)

Definition at line 964 of file nbtutils.c.

965{
966 int16 natts = IndexRelationGetNumberOfAttributes(rel);
967 int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
968 BTPageOpaque opaque = BTPageGetOpaque(page);
969 IndexTuple itup;
970 int tupnatts;
971
972 /*
973 * We cannot reliably test a deleted or half-dead page, since they have
974 * dummy high keys
975 */
976 if (P_IGNORE(opaque))
977 return true;
978
979 Assert(offnum >= FirstOffsetNumber &&
980 offnum <= PageGetMaxOffsetNumber(page));
981
982 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
983 tupnatts = BTreeTupleGetNAtts(itup, rel);
984
985 /* !heapkeyspace indexes do not support deduplication */
986 if (!heapkeyspace && BTreeTupleIsPosting(itup))
987 return false;
988
989 /* Posting list tuples should never have "pivot heap TID" bit set */
990 if (BTreeTupleIsPosting(itup) &&
991 (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
992 BT_PIVOT_HEAP_TID_ATTR) != 0)
993 return false;
994
995 /* INCLUDE indexes do not support deduplication */
996 if (natts != nkeyatts && BTreeTupleIsPosting(itup))
997 return false;
998
999 if (P_ISLEAF(opaque))
1000 {
1001 if (offnum >= P_FIRSTDATAKEY(opaque))
1002 {
1003 /*
1004 * Non-pivot tuple should never be explicitly marked as a pivot
1005 * tuple
1006 */
1007 if (BTreeTupleIsPivot(itup))
1008 return false;
1009
1010 /*
1011 * Leaf tuples that are not the page high key (non-pivot tuples)
1012 * should never be truncated. (Note that tupnatts must have been
1013 * inferred, even with a posting list tuple, because only pivot
1014 * tuples store tupnatts directly.)
1015 */
1016 return tupnatts == natts;
1017 }
1018 else
1019 {
1020 /*
1021 * Rightmost page doesn't contain a page high key, so tuple was
1022 * checked above as ordinary leaf tuple
1023 */
1024 Assert(!P_RIGHTMOST(opaque));
1025
1026 /*
1027 * !heapkeyspace high key tuple contains only key attributes. Note
1028 * that tupnatts will only have been explicitly represented in
1029 * !heapkeyspace indexes that happen to have non-key attributes.
1030 */
1031 if (!heapkeyspace)
1032 return tupnatts == nkeyatts;
1033
1034 /* Use generic heapkeyspace pivot tuple handling */
1035 }
1036 }
1037 else /* !P_ISLEAF(opaque) */
1038 {
1039 if (offnum == P_FIRSTDATAKEY(opaque))
1040 {
1041 /*
1042 * The first tuple on any internal page (possibly the first after
1043 * its high key) is its negative infinity tuple. Negative
1044 * infinity tuples are always truncated to zero attributes. They
1045 * are a particular kind of pivot tuple.
1046 */
1047 if (heapkeyspace)
1048 return tupnatts == 0;
1049
1050 /*
1051 * The number of attributes won't be explicitly represented if the
1052 * negative infinity tuple was generated during a page split that
1053 * occurred with a version of Postgres before v11. There must be
1054 * a problem when there is an explicit representation that is
1055 * non-zero, or when there is no explicit representation and the
1056 * tuple is evidently not a pre-pg_upgrade tuple.
1057 *
1058 * Prior to v11, downlinks always had P_HIKEY as their offset.
1059 * Accept that as an alternative indication of a valid
1060 * !heapkeyspace negative infinity tuple.
1061 */
1062 return tupnatts == 0 ||
1063 ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY;
1064 }
1065 else
1066 {
1067 /*
1068 * !heapkeyspace downlink tuple with separator key contains only
1069 * key attributes. Note that tupnatts will only have been
1070 * explicitly represented in !heapkeyspace indexes that happen to
1071 * have non-key attributes.
1072 */
1073 if (!heapkeyspace)
1074 return tupnatts == nkeyatts;
1075
1076 /* Use generic heapkeyspace pivot tuple handling */
1077 }
1078 }
1079
1080 /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
1081 Assert(heapkeyspace);
1082
1083 /*
1084 * Explicit representation of the number of attributes is mandatory with
1085 * heapkeyspace index pivot tuples, regardless of whether or not there are
1086 * non-key attributes.
1087 */
1088 if (!BTreeTupleIsPivot(itup))
1089 return false;
1090
1091 /* Pivot tuple should not use posting list representation (redundant) */
1092 if (BTreeTupleIsPosting(itup))
1093 return false;
1094
1095 /*
1096 * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
1097 * when any other key attribute is truncated
1098 */
1099 if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
1100 return false;
1101
1102 /*
1103 * Pivot tuple must have at least one untruncated key attribute (minus
1104 * infinity pivot tuples are the only exception). Pivot tuples can never
1105 * represent that there is a value present for a key attribute that
1106 * exceeds pg_index.indnkeyatts for the index.
1107 */
1108 return tupnatts > 0 && tupnatts <= nkeyatts;
1109}
int16_t int16
Definition: c.h:547
#define BT_PIVOT_HEAP_TID_ATTR
Definition: nbtree.h:466
#define P_HIKEY
Definition: nbtree.h:368
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:220
#define P_IGNORE(opaque)
Definition: nbtree.h:226
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:493
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:639
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:578
#define FirstOffsetNumber
Definition: off.h:27
ItemPointerData t_tid
Definition: itup.h:37

References Assert(), BT_PIVOT_HEAP_TID_ATTR, BTPageGetOpaque, BTreeTupleGetHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPivot(), BTreeTupleIsPosting(), FirstOffsetNumber, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, ItemPointerGetOffsetNumber(), ItemPointerGetOffsetNumberNoCheck(), P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_ISLEAF, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and IndexTupleData::t_tid.

Referenced by _bt_compare(), and bt_target_page_check().
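
Since _bt_check_natts() is a pure predicate, a caller normally drives it over every interesting offset on a page. Below is a minimal, hypothetical sketch of such a per-page pass (the helper name check_page_natts and its arguments are illustrative, not part of the nbtree API); it mirrors the way amcheck's bt_target_page_check() consumes the function:

static void
check_page_natts(Relation rel, bool heapkeyspace, Buffer buf)
{
    Page         page = BufferGetPage(buf);
    BTPageOpaque opaque = BTPageGetOpaque(page);
    OffsetNumber offnum,
                 maxoff = PageGetMaxOffsetNumber(page);

    /* Skip deleted/half-dead pages, as real callers do */
    if (P_IGNORE(opaque))
        return;

    /* Check every data item (the high key, if any, could be checked too) */
    for (offnum = P_FIRSTDATAKEY(opaque);
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        if (!_bt_check_natts(rel, heapkeyspace, page, offnum))
            elog(ERROR, "wrong number of attributes at offset %u", offnum);
    }
}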

◆ _bt_check_third_page()

void _bt_check_third_page ( Relation  rel,
Relation  heap,
bool  needheaptidspace,
Page  page,
IndexTuple  newtup 
)

Definition at line 1124 of file nbtutils.c.

1126{
1127 Size itemsz;
1128 BTPageOpaque opaque;
1129
1130 itemsz = MAXALIGN(IndexTupleSize(newtup));
1131
1132 /* Double check item size against limit */
1133 if (itemsz <= BTMaxItemSize)
1134 return;
1135
1136 /*
1137 * Tuple is probably too large to fit on page, but it's possible that the
1138 * index uses version 2 or version 3, or that page is an internal page, in
1139 * which case a slightly higher limit applies.
1140 */
1141 if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid)
1142 return;
1143
1144 /*
1145 * Internal page insertions cannot fail here, because that would mean that
1146 * an earlier leaf level insertion that should have failed didn't
1147 */
1148 opaque = BTPageGetOpaque(page);
1149 if (!P_ISLEAF(opaque))
1150 elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
1151 itemsz, RelationGetRelationName(rel));
1152
1153 ereport(ERROR,
1154 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1155 errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
1156 itemsz,
1157 needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
1158 needheaptidspace ? BTMaxItemSize : BTMaxItemSizeNoHeapTid,
1159 RelationGetRelationName(rel)),
1160 errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
1161 ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)),
1162 ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)),
1163 RelationGetRelationName(heap)),
1164 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
1165 "Consider a function index of an MD5 hash of the value, "
1166 "or use full text indexing."),
1167 errtableconstraint(heap, RelationGetRelationName(rel))));
1168}
size_t Size
Definition: c.h:624
int errdetail(const char *fmt,...)
Definition: elog.c:1216
int errhint(const char *fmt,...)
Definition: elog.c:1330
int errmsg(const char *fmt,...)
Definition: elog.c:1080
static Size IndexTupleSize(const IndexTupleData *itup)
Definition: itup.h:71
#define BTREE_VERSION
Definition: nbtree.h:151
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:153
#define BTMaxItemSizeNoHeapTid
Definition: nbtree.h:170
#define BTMaxItemSize
Definition: nbtree.h:165
int errtableconstraint(Relation rel, const char *conname)
Definition: relcache.c:6103

References BTMaxItemSize, BTMaxItemSizeNoHeapTid, BTPageGetOpaque, BTREE_NOVAC_VERSION, BTREE_VERSION, BTreeTupleGetHeapTID(), elog, ereport, errcode(), errdetail(), errhint(), errmsg(), ERROR, errtableconstraint(), IndexTupleSize(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), MAXALIGN, P_ISLEAF, and RelationGetRelationName.

Referenced by _bt_buildadd(), and _bt_findinsertloc().
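
Usage note: callers only reach _bt_check_third_page() once a new tuple has already failed the ordinary size check, so the function either accepts the tuple under the relaxed pre-version-4 limit or raises the error shown above. A hedged sketch of that guard, in the spirit of _bt_findinsertloc() (variable names are illustrative):

Size    itemsz = MAXALIGN(IndexTupleSize(newtup));

/* Oversized for a version 4 leaf page?  Let the checker accept or ereport. */
if (itemsz > BTMaxItemSize)
    _bt_check_third_page(rel, heapRel, needheaptidspace, page, newtup);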

◆ _bt_checkpage()

void _bt_checkpage ( Relation  rel,
Buffer  buf 
)

Definition at line 798 of file nbtpage.c.

799{
800 Page page = BufferGetPage(buf);
801
802 /*
803 * ReadBuffer verifies that every newly-read page passes
804 * PageHeaderIsValid, which means it either contains a reasonably sane
805 * page header or is all-zero. We have to defend against the all-zero
806 * case, however.
807 */
808 if (PageIsNew(page))
809 ereport(ERROR,
810 (errcode(ERRCODE_INDEX_CORRUPTED),
811 errmsg("index \"%s\" contains unexpected zero page at block %u",
812 RelationGetRelationName(rel),
813 BufferGetBlockNumber(buf)),
814 errhint("Please REINDEX it.")));
815
816 /*
817 * Additionally check that the special area looks sane.
818 */
819 if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
820 ereport(ERROR,
821 (errcode(ERRCODE_INDEX_CORRUPTED),
822 errmsg("index \"%s\" contains corrupted page at block %u",
823 RelationGetRelationName(rel),
824 BufferGetBlockNumber(buf)),
825 errhint("Please REINDEX it.")));
826}
static uint16 PageGetSpecialSize(const PageData *page)
Definition: bufpage.h:316

References buf, BufferGetBlockNumber(), BufferGetPage(), ereport, errcode(), errhint(), errmsg(), ERROR, MAXALIGN, PageGetSpecialSize(), PageIsNew(), and RelationGetRelationName.

Referenced by _bt_getbuf(), _bt_relandgetbuf(), _bt_search_insert(), bt_recheck_sibling_links(), btvacuumpage(), and palloc_btree_page().
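
A minimal sketch of the read-and-verify pattern used by callers such as _bt_getbuf() when fetching an existing block (locking and error handling are simplified; blkno is assumed to be a valid existing block):

Buffer  buf = ReadBuffer(rel, blkno);

_bt_lockbuf(rel, buf, BT_READ);     /* share-lock the buffer */
_bt_checkpage(rel, buf);            /* reject all-zero or corrupted pages */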

◆ _bt_compare()

int32 _bt_compare ( Relation  rel,
BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)

Definition at line 683 of file nbtsearch.c.

687{
688 TupleDesc itupdesc = RelationGetDescr(rel);
689 BTPageOpaque opaque = BTPageGetOpaque(page);
690 IndexTuple itup;
691 ItemPointer heapTid;
692 ScanKey scankey;
693 int ncmpkey;
694 int ntupatts;
695 int32 result;
696
697 Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
698 Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
699 Assert(key->heapkeyspace || key->scantid == NULL);
700
701 /*
702 * Force result ">" if target item is first data item on an internal page
703 * --- see NOTE above.
704 */
705 if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
706 return 1;
707
708 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
709 ntupatts = BTreeTupleGetNAtts(itup, rel);
710
711 /*
712 * The scan key is set up with the attribute number associated with each
713 * term in the key. It is important that, if the index is multi-key, the
714 * scan contain the first k key attributes, and that they be in order. If
715 * you think about how multi-key ordering works, you'll understand why
716 * this is.
717 *
718 * We don't test for violation of this condition here, however. The
719 * initial setup for the index scan had better have gotten it right (see
720 * _bt_first).
721 */
722
723 ncmpkey = Min(ntupatts, key->keysz);
724 Assert(key->heapkeyspace || ncmpkey == key->keysz);
725 Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
726 scankey = key->scankeys;
727 for (int i = 1; i <= ncmpkey; i++)
728 {
729 Datum datum;
730 bool isNull;
731
732 datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
733
734 if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
735 {
736 if (isNull)
737 result = 0; /* NULL "=" NULL */
738 else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
739 result = -1; /* NULL "<" NOT_NULL */
740 else
741 result = 1; /* NULL ">" NOT_NULL */
742 }
743 else if (isNull) /* key is NOT_NULL and item is NULL */
744 {
745 if (scankey->sk_flags & SK_BT_NULLS_FIRST)
746 result = 1; /* NOT_NULL ">" NULL */
747 else
748 result = -1; /* NOT_NULL "<" NULL */
749 }
750 else
751 {
752 /*
753 * The sk_func needs to be passed the index value as left arg and
754 * the sk_argument as right arg (they might be of different
755 * types). Since it is convenient for callers to think of
756 * _bt_compare as comparing the scankey to the index item, we have
757 * to flip the sign of the comparison result. (Unless it's a DESC
758 * column, in which case we *don't* flip the sign.)
759 */
760 result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
761 scankey->sk_collation,
762 datum,
763 scankey->sk_argument));
764
765 if (!(scankey->sk_flags & SK_BT_DESC))
766 INVERT_COMPARE_RESULT(result);
767 }
768
769 /* if the keys are unequal, return the difference */
770 if (result != 0)
771 return result;
772
773 scankey++;
774 }
775
776 /*
777 * All non-truncated attributes (other than heap TID) were found to be
778 * equal. Treat truncated attributes as minus infinity when scankey has a
779 * key attribute value that would otherwise be compared directly.
780 *
781 * Note: it doesn't matter if ntupatts includes non-key attributes;
782 * scankey won't, so explicitly excluding non-key attributes isn't
783 * necessary.
784 */
785 if (key->keysz > ntupatts)
786 return 1;
787
788 /*
789 * Use the heap TID attribute and scantid to try to break the tie. The
790 * rules are the same as any other key attribute -- only the
791 * representation differs.
792 */
793 heapTid = BTreeTupleGetHeapTID(itup);
794 if (key->scantid == NULL)
795 {
796 /*
797 * Forward scans have a scankey that is considered greater than a
798 * truncated pivot tuple if and when the scankey has equal values for
799 * attributes up to and including the least significant untruncated
800 * attribute in tuple. Even attributes that were omitted from the
801 * scan key are considered greater than -inf truncated attributes.
802 * (See _bt_binsrch for an explanation of our backward scan behavior.)
803 *
804 * For example, if an index has the minimum two attributes (single
805 * user key attribute, plus heap TID attribute), and a page's high key
806 * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
807 * will not descend to the page to the left. The search will descend
808 * right instead. The truncated attribute in pivot tuple means that
809 * all non-pivot tuples on the page to the left are strictly < 'foo',
810 * so it isn't necessary to descend left. In other words, search
811 * doesn't have to descend left because it isn't interested in a match
812 * that has a heap TID value of -inf.
813 *
814 * Note: the heap TID part of the test ensures that scankey is being
815 * compared to a pivot tuple with one or more truncated -inf key
816 * attributes. The heap TID attribute is the last key attribute in
817 * every index, of course, but other than that it isn't special.
818 */
819 if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
820 key->heapkeyspace)
821 return 1;
822
823 /* All provided scankey arguments found to be equal */
824 return 0;
825 }
826
827 /*
828 * Treat truncated heap TID as minus infinity, since scankey has a key
829 * attribute value (scantid) that would otherwise be compared directly
830 */
831 Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
832 if (heapTid == NULL)
833 return 1;
834
835 /*
836 * Scankey must be treated as equal to a posting list tuple if its scantid
837 * value falls within the range of the posting list. In all other cases
838 * there can only be a single heap TID value, which is compared directly
839 * with scantid.
840 */
841 Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
842 result = ItemPointerCompare(key->scantid, heapTid);
843 if (result <= 0 || !BTreeTupleIsPosting(itup))
844 return result;
845 else
846 {
847 result = ItemPointerCompare(key->scantid,
848 BTreeTupleGetMaxHeapTID(itup));
849 if (result > 0)
850 return 1;
851 }
852
853 return 0;
854}
#define Min(x, y)
Definition: c.h:1016
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1118
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1150
int32 ItemPointerCompare(const ItemPointerData *arg1, const ItemPointerData *arg2)
Definition: itemptr.c:51
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: itup.h:131
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:1117
#define SK_BT_DESC
Definition: nbtree.h:1116
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:665
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:964
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:212
#define RelationGetDescr(relation)
Definition: rel.h:541
int sk_flags
Definition: skey.h:66
Datum sk_argument
Definition: skey.h:72
FmgrInfo sk_func
Definition: skey.h:71
Oid sk_collation
Definition: skey.h:70
AttrNumber sk_attno
Definition: skey.h:67

References _bt_check_natts(), Assert(), BTPageGetOpaque, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPosting(), DatumGetInt32(), FunctionCall2Coll(), i, index_getattr(), IndexRelationGetNumberOfKeyAttributes, INVERT_COMPARE_RESULT, ItemPointerCompare(), sort-test::key, Min, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem(), PageGetItemId(), RelationGetDescr, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_binsrch_insert(), _bt_check_unique(), _bt_findinsertloc(), _bt_moveright(), _bt_search_insert(), bt_rootdescend(), bt_target_page_check(), invariant_g_offset(), invariant_l_nontarget_offset(), invariant_l_offset(), and invariant_leq_offset().
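
A short sketch of how a search-loop caller interprets the three-way result (the surrounding binary search is omitted):

int32   result = _bt_compare(rel, key, page, offnum);

if (result > 0)
{
    /* insertion scankey sorts after the tuple at offnum: look right */
}
else if (result == 0)
{
    /* equal, up to truncated attributes and any omitted scantid */
}
else
{
    /* insertion scankey sorts before the tuple at offnum: look left */
}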

◆ _bt_conditionallockbuf()

bool _bt_conditionallockbuf ( Relation  rel,
Buffer  buf 
)

Definition at line 1094 of file nbtpage.c.

1095{
1096 /* ConditionalLockBuffer() asserts that pin is held by this backend */
1097 if (!ConditionalLockBuffer(buf))
1098 return false;
1099
1100 if (!RelationUsesLocalBuffers(rel))
1101 VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1102
1103 return true;
1104}
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:5630

References buf, BufferGetPage(), ConditionalLockBuffer(), RelationUsesLocalBuffers, and VALGRIND_MAKE_MEM_DEFINED.

Referenced by _bt_allocbuf(), and _bt_search_insert().
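
A sketch of the opportunistic usage in _bt_search_insert()'s rightmost-leaf fastpath: the exclusive lock is taken only if it is immediately available, otherwise the cached block is abandoned in favor of a normal root descent.

if (_bt_conditionallockbuf(rel, insertstate->buf))
{
    /* got the lock without waiting -- still must re-validate the page */
}
else
{
    /* lock not free; forget the fastpath and search from the root */
}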

◆ _bt_dedup_finish_pending()

Size _bt_dedup_finish_pending ( Page  newpage,
BTDedupState  state 
)

Definition at line 555 of file nbtdedup.c.

556{
557 OffsetNumber tupoff;
558 Size tuplesz;
559 Size spacesaving;
560
561 Assert(state->nitems > 0);
562 Assert(state->nitems <= state->nhtids);
563 Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
564
565 tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
566 if (state->nitems == 1)
567 {
568 /* Use original, unchanged base tuple */
569 tuplesz = IndexTupleSize(state->base);
570 Assert(tuplesz == MAXALIGN(IndexTupleSize(state->base)));
571 Assert(tuplesz <= BTMaxItemSize);
572 if (PageAddItem(newpage, state->base, tuplesz, tupoff, false, false) == InvalidOffsetNumber)
573 elog(ERROR, "deduplication failed to add tuple to page");
574
575 spacesaving = 0;
576 }
577 else
578 {
579 IndexTuple final;
580
581 /* Form a tuple with a posting list */
582 final = _bt_form_posting(state->base, state->htids, state->nhtids);
583 tuplesz = IndexTupleSize(final);
584 Assert(tuplesz <= state->maxpostingsize);
585
586 /* Save final number of items for posting list */
587 state->intervals[state->nintervals].nitems = state->nitems;
588
589 Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
590 Assert(tuplesz <= BTMaxItemSize);
591 if (PageAddItem(newpage, final, tuplesz, tupoff, false, false) == InvalidOffsetNumber)
592 elog(ERROR, "deduplication failed to add tuple to page");
593
594 pfree(final);
595 spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
596 /* Increment nintervals, since we wrote a new posting list tuple */
597 state->nintervals++;
598 Assert(spacesaving > 0 && spacesaving < BLCKSZ);
599 }
600
601 /* Reset state for next pending posting list */
602 state->nhtids = 0;
603 state->nitems = 0;
604 state->phystupsize = 0;
605
606 return spacesaving;
607}
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:471
IndexTuple _bt_form_posting(IndexTuple base, const ItemPointerData *htids, int nhtids)
Definition: nbtdedup.c:862

References _bt_form_posting(), Assert(), BTMaxItemSize, elog, ERROR, IndexTupleSize(), InvalidOffsetNumber, MAXALIGN, OffsetNumberNext, PageAddItem, PageGetMaxOffsetNumber(), and pfree().

Referenced by _bt_dedup_pass(), and btree_xlog_dedup().

◆ _bt_dedup_pass()

void _bt_dedup_pass ( Relation  rel,
Buffer  buf,
IndexTuple  newitem,
Size  newitemsz,
bool  bottomupdedup 
)

Definition at line 59 of file nbtdedup.c.

61{
62 OffsetNumber offnum,
63 minoff,
64 maxoff;
65 Page page = BufferGetPage(buf);
66 BTPageOpaque opaque = BTPageGetOpaque(page);
67 Page newpage;
68 BTDedupState state;
69 Size pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
70 bool singlevalstrat = false;
71 int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
72
73 /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
74 newitemsz += sizeof(ItemIdData);
75
76 /*
77 * Initialize deduplication state.
78 *
79 * It would be possible for maxpostingsize (limit on posting list tuple
80 * size) to be set to one third of the page. However, it seems like a
81 * good idea to limit the size of posting lists to one sixth of a page.
82 * That ought to leave us with a good split point when pages full of
83 * duplicates can be split several times.
84 */
85 state = palloc_object(BTDedupStateData);
86 state->deduplicate = true;
87 state->nmaxitems = 0;
88 state->maxpostingsize = Min(BTMaxItemSize / 2, INDEX_SIZE_MASK);
89 /* Metadata about base tuple of current pending posting list */
90 state->base = NULL;
91 state->baseoff = InvalidOffsetNumber;
92 state->basetupsize = 0;
93 /* Metadata about current pending posting list TIDs */
94 state->htids = palloc(state->maxpostingsize);
95 state->nhtids = 0;
96 state->nitems = 0;
97 /* Size of all physical tuples to be replaced by pending posting list */
98 state->phystupsize = 0;
99 /* nintervals should be initialized to zero */
100 state->nintervals = 0;
101
102 minoff = P_FIRSTDATAKEY(opaque);
103 maxoff = PageGetMaxOffsetNumber(page);
104
105 /*
106 * Consider applying "single value" strategy, though only if the page
107 * seems likely to be split in the near future
108 */
109 if (!bottomupdedup)
110 singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
111
112 /*
113 * Deduplicate items from page, and write them to newpage.
114 *
115 * Copy the original page's LSN into newpage copy. This will become the
116 * updated version of the page. We need this because XLogInsert will
117 * examine the LSN and possibly dump it in a page image.
118 */
119 newpage = PageGetTempPageCopySpecial(page);
120 PageSetLSN(newpage, PageGetLSN(page));
121
122 /* Copy high key, if any */
123 if (!P_RIGHTMOST(opaque))
124 {
125 ItemId hitemid = PageGetItemId(page, P_HIKEY);
126 Size hitemsz = ItemIdGetLength(hitemid);
127 IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid);
128
129 if (PageAddItem(newpage, hitem, hitemsz, P_HIKEY, false, false) == InvalidOffsetNumber)
130 elog(ERROR, "deduplication failed to add highkey");
131 }
132
133 for (offnum = minoff;
134 offnum <= maxoff;
135 offnum = OffsetNumberNext(offnum))
136 {
137 ItemId itemid = PageGetItemId(page, offnum);
138 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
139
140 Assert(!ItemIdIsDead(itemid));
141
142 if (offnum == minoff)
143 {
144 /*
145 * No previous/base tuple for the data item -- use the data item
146 * as base tuple of pending posting list
147 */
148 _bt_dedup_start_pending(state, itup, offnum);
149 }
150 else if (state->deduplicate &&
151 _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
152 _bt_dedup_save_htid(state, itup))
153 {
154 /*
155 * Tuple is equal to base tuple of pending posting list. Heap
156 * TID(s) for itup have been saved in state.
157 */
158 }
159 else
160 {
161 /*
162 * Tuple is not equal to pending posting list tuple, or
163 * _bt_dedup_save_htid() opted to not merge current item into
164 * pending posting list for some other reason (e.g., adding more
165 * TIDs would have caused posting list to exceed current
166 * maxpostingsize).
167 *
168 * If state contains pending posting list with more than one item,
169 * form new posting tuple and add it to our temp page (newpage).
170 * Else add pending interval's base tuple to the temp page as-is.
171 */
172 pagesaving += _bt_dedup_finish_pending(newpage, state);
173
174 if (singlevalstrat)
175 {
176 /*
177 * Single value strategy's extra steps.
178 *
179 * Lower maxpostingsize for sixth and final large posting list
180 * tuple at the point where 5 maxpostingsize-capped tuples
181 * have either been formed or observed.
182 *
183 * When a sixth maxpostingsize-capped item is formed/observed,
184 * stop merging together tuples altogether. The few tuples
185 * that remain at the end of the page won't be merged together
186 * at all (at least not until after a future page split takes
187 * place, when this page's newly allocated right sibling page
188 * gets its first deduplication pass).
189 */
190 if (state->nmaxitems == 5)
191 _bt_singleval_fillfactor(page, state, newitemsz);
192 else if (state->nmaxitems == 6)
193 {
194 state->deduplicate = false;
195 singlevalstrat = false; /* won't be back here */
196 }
197 }
198
199 /* itup starts new pending posting list */
200 _bt_dedup_start_pending(state, itup, offnum);
201 }
202 }
203
204 /* Handle the last item */
205 pagesaving += _bt_dedup_finish_pending(newpage, state);
206
207 /*
208 * If no items suitable for deduplication were found, newpage must be
209 * exactly the same as the original page, so just return from function.
210 *
211 * We could determine whether or not to proceed on the basis the space
212 * savings being sufficient to avoid an immediate page split instead. We
213 * don't do that because there is some small value in nbtsplitloc.c always
214 * operating against a page that is fully deduplicated (apart from
215 * newitem). Besides, most of the cost has already been paid.
216 */
217 if (state->nintervals == 0)
218 {
219 /* cannot leak memory here */
220 pfree(newpage);
221 pfree(state->htids);
222 pfree(state);
223 return;
224 }
225
226 /*
227 * By here, it's clear that deduplication will definitely go ahead.
228 *
229 * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace
230 * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
231 * But keep things tidy.
232 */
233 if (P_HAS_GARBAGE(opaque))
234 {
235 BTPageOpaque nopaque = BTPageGetOpaque(newpage);
236
237 nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
238 }
239
240 START_CRIT_SECTION();
241
242 PageRestoreTempPage(newpage, page);
243 MarkBufferDirty(buf);
244
245 /* XLOG stuff */
246 if (RelationNeedsWAL(rel))
247 {
248 XLogRecPtr recptr;
249 xl_btree_dedup xlrec_dedup;
250
251 xlrec_dedup.nintervals = state->nintervals;
252
253 XLogBeginInsert();
254 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
255 XLogRegisterData(&xlrec_dedup, SizeOfBtreeDedup);
256
257 /*
258 * The intervals array is not in the buffer, but pretend that it is.
259 * When XLogInsert stores the whole buffer, the array need not be
260 * stored too.
261 */
262 XLogRegisterBufData(0, state->intervals,
263 state->nintervals * sizeof(BTDedupInterval));
264
265 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
266
267 PageSetLSN(page, recptr);
268 }
269
270 END_CRIT_SECTION();
271
272 /* Local space accounting should agree with page accounting */
273 Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
274
275 /* cannot leak memory here */
276 pfree(state->htids);
277 pfree(state);
278}
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2943
void PageRestoreTempPage(Page tempPage, Page oldPage)
Definition: bufpage.c:423
Page PageGetTempPageCopySpecial(const PageData *page)
Definition: bufpage.c:401
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:390
static XLogRecPtr PageGetLSN(const PageData *page)
Definition: bufpage.h:385
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:229
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
#define INDEX_SIZE_MASK
Definition: itup.h:65
#define START_CRIT_SECTION()
Definition: miscadmin.h:150
#define END_CRIT_SECTION()
Definition: miscadmin.h:152
static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state, OffsetNumber minoff, IndexTuple newitem)
Definition: nbtdedup.c:780
Size _bt_dedup_finish_pending(Page newpage, BTDedupState state)
Definition: nbtdedup.c:555
static void _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
Definition: nbtdedup.c:820
#define P_HAS_GARBAGE(opaque)
Definition: nbtree.h:227
#define XLOG_BTREE_DEDUP
Definition: nbtxlog.h:33
#define SizeOfBtreeDedup
Definition: nbtxlog.h:174
uint16 btpo_flags
Definition: nbtree.h:68
uint16 nintervals
Definition: nbtxlog.h:169
uint64 XLogRecPtr
Definition: xlogdefs.h:21
void XLogRegisterBufData(uint8 block_id, const void *data, uint32 len)
Definition: xloginsert.c:409
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:245
#define REGBUF_STANDARD
Definition: xloginsert.h:35

References _bt_dedup_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_do_singleval(), _bt_keep_natts_fast(), _bt_singleval_fillfactor(), Assert(), BTMaxItemSize, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, buf, BufferGetPage(), elog, END_CRIT_SECTION, ERROR, INDEX_SIZE_MASK, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, ItemIdGetLength, ItemIdIsDead, MarkBufferDirty(), Min, xl_btree_dedup::nintervals, OffsetNumberNext, P_FIRSTDATAKEY, P_HAS_GARBAGE, P_HIKEY, P_RIGHTMOST, PageAddItem, PageGetExactFreeSpace(), PageGetItem(), PageGetItemId(), PageGetLSN(), PageGetMaxOffsetNumber(), PageGetTempPageCopySpecial(), PageRestoreTempPage(), PageSetLSN(), palloc(), palloc_object, pfree(), PG_USED_FOR_ASSERTS_ONLY, REGBUF_STANDARD, RelationNeedsWAL, SizeOfBtreeDedup, START_CRIT_SECTION, XLOG_BTREE_DEDUP, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_delete_or_dedup_one_page().
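
A sketch of the call site in _bt_delete_or_dedup_one_page(): a deduplication pass is attempted only when the index supports it and deduplication is enabled for the relation. The exact guard is simplified here; buffer, insertstate, indexUnchanged and uniquedup stand in for that caller's locals.

if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
    _bt_dedup_pass(rel, buffer, insertstate->itup, insertstate->itemsz,
                   (indexUnchanged || uniquedup));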

◆ _bt_dedup_save_htid()

bool _bt_dedup_save_htid ( BTDedupState  state,
IndexTuple  itup 
)

Definition at line 484 of file nbtdedup.c.

485{
486 int nhtids;
487 ItemPointer htids;
488 Size mergedtupsz;
489
490 Assert(!BTreeTupleIsPivot(itup));
491
492 if (!BTreeTupleIsPosting(itup))
493 {
494 nhtids = 1;
495 htids = &itup->t_tid;
496 }
497 else
498 {
499 nhtids = BTreeTupleGetNPosting(itup);
500 htids = BTreeTupleGetPosting(itup);
501 }
502
503 /*
504 * Don't append (have caller finish pending posting list as-is) if
505 * appending heap TID(s) from itup would put us over maxpostingsize limit.
506 *
507 * This calculation needs to match the code used within _bt_form_posting()
508 * for new posting list tuples.
509 */
510 mergedtupsz = MAXALIGN(state->basetupsize +
511 (state->nhtids + nhtids) * sizeof(ItemPointerData));
512
513 if (mergedtupsz > state->maxpostingsize)
514 {
515 /*
516 * Count this as an oversized item for single value strategy, though
517 * only when there are 50 TIDs in the final posting list tuple. This
518 * limit (which is fairly arbitrary) avoids confusion about how many
519 * 1/6 of a page tuples have been encountered/created by the current
520 * deduplication pass.
521 *
522 * Note: We deliberately don't consider which deduplication pass
523 * merged together tuples to create this item (could be a previous
524 * deduplication pass, or current pass). See _bt_do_singleval()
525 * comments.
526 */
527 if (state->nhtids > 50)
528 state->nmaxitems++;
529
530 return false;
531 }
532
533 /*
534 * Save heap TIDs to pending posting list tuple -- itup can be merged into
535 * pending posting list
536 */
537 state->nitems++;
538 memcpy(state->htids + state->nhtids, htids,
539 sizeof(ItemPointerData) * nhtids);
540 state->nhtids += nhtids;
541 state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
542
543 return true;
544}
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:519
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition: nbtree.h:538

References Assert(), BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), IndexTupleSize(), MAXALIGN, and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_load(), and btree_xlog_dedup().

◆ _bt_dedup_start_pending()

void _bt_dedup_start_pending ( BTDedupState  state,
IndexTuple  base,
OffsetNumber  baseoff 
)

Definition at line 433 of file nbtdedup.c.

435{
436 Assert(state->nhtids == 0);
437 Assert(state->nitems == 0);
438 Assert(!BTreeTupleIsPivot(base));
439
440 /*
441 * Copy heap TID(s) from new base tuple for new candidate posting list
442 * into working state's array
443 */
444 if (!BTreeTupleIsPosting(base))
445 {
446 memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
447 state->nhtids = 1;
448 state->basetupsize = IndexTupleSize(base);
449 }
450 else
451 {
452 int nposting;
453
454 nposting = BTreeTupleGetNPosting(base);
455 memcpy(state->htids, BTreeTupleGetPosting(base),
456 sizeof(ItemPointerData) * nposting);
457 state->nhtids = nposting;
458 /* basetupsize should not include existing posting list */
459 state->basetupsize = BTreeTupleGetPostingOffset(base);
460 }
461
462 /*
463 * Save new base tuple itself -- it'll be needed if we actually create a
464 * new posting list from new pending posting list.
465 *
466 * Must maintain physical size of all existing tuples (including line
467 * pointer overhead) so that we can calculate space savings on page.
468 */
469 state->nitems = 1;
470 state->base = base;
471 state->baseoff = baseoff;
472 state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
473 /* Also save baseoff in pending state for interval */
474 state->intervals[state->nintervals].baseoff = state->baseoff;
475}
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:530

References Assert(), BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), IndexTupleSize(), MAXALIGN, and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_load(), and btree_xlog_dedup().
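
Taken together with _bt_dedup_save_htid() and _bt_dedup_finish_pending(), this function forms a small state machine. A condensed sketch of the driving loop follows (WAL logging, single-value strategy and cleanup omitted; firstitup, firstoff, secondoff and spacesaving are illustrative names -- see _bt_dedup_pass() above for the full version):

/* Seed the pending posting list with the first data item on the page */
_bt_dedup_start_pending(state, firstitup, firstoff);

for (offnum = secondoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
{
    IndexTuple  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

    /* Try to fold itup's TID(s) into the pending posting list ... */
    if (!(state->deduplicate &&
          _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
          _bt_dedup_save_htid(state, itup)))
    {
        /* ... otherwise flush the pending list and start a new one */
        spacesaving += _bt_dedup_finish_pending(newpage, state);
        _bt_dedup_start_pending(state, itup, offnum);
    }
}
spacesaving += _bt_dedup_finish_pending(newpage, state);   /* last interval */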

◆ _bt_delitems_delete_check()

void _bt_delitems_delete_check ( Relation  rel,
Buffer  buf,
Relation  heapRel,
struct TM_IndexDeleteOp delstate 
)

Definition at line 1512 of file nbtpage.c.

1514{
1515 Page page = BufferGetPage(buf);
1516 TransactionId snapshotConflictHorizon;
1517 bool isCatalogRel;
1518 OffsetNumber postingidxoffnum = InvalidOffsetNumber;
1519 int ndeletable = 0,
1520 nupdatable = 0;
1521 OffsetNumber deletable[MaxIndexTuplesPerPage];
1522 BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1523
1524 /* Use tableam interface to determine which tuples to delete first */
1525 snapshotConflictHorizon = table_index_delete_tuples(heapRel, delstate);
1526 isCatalogRel = RelationIsAccessibleInLogicalDecoding(heapRel);
1527
1528 /* Should not WAL-log snapshotConflictHorizon unless it's required */
1529 if (!XLogStandbyInfoActive())
1530 snapshotConflictHorizon = InvalidTransactionId;
1531
1532 /*
1533 * Construct a leaf-page-wise description of what _bt_delitems_delete()
1534 * needs to do to physically delete index tuples from the page.
1535 *
1536 * Must sort deltids array to restore leaf-page-wise order (original order
1537 * before call to tableam). This is the order that the loop expects.
1538 *
1539 * Note that deltids array might be a lot smaller now. It might even have
1540 * no entries at all (with bottom-up deletion caller), in which case there
1541 * is nothing left to do.
1542 */
1543 qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
1544 _bt_delitems_cmp);
1545 if (delstate->ndeltids == 0)
1546 {
1547 Assert(delstate->bottomup);
1548 return;
1549 }
1550
1551 /* We definitely have to delete at least one index tuple (or one TID) */
1552 for (int i = 0; i < delstate->ndeltids; i++)
1553 {
1554 TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
1555 OffsetNumber idxoffnum = dstatus->idxoffnum;
1556 ItemId itemid = PageGetItemId(page, idxoffnum);
1557 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
1558 int nestedi,
1559 nitem;
1560 BTVacuumPosting vacposting;
1561
1562 Assert(OffsetNumberIsValid(idxoffnum));
1563
1564 if (idxoffnum == postingidxoffnum)
1565 {
1566 /*
1567 * This deltid entry is a TID from a posting list tuple that has
1568 * already been completely processed
1569 */
1570 Assert(BTreeTupleIsPosting(itup));
1571 Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
1572 &delstate->deltids[i].tid) < 0);
1573 Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),
1574 &delstate->deltids[i].tid) >= 0);
1575 continue;
1576 }
1577
1578 if (!BTreeTupleIsPosting(itup))
1579 {
1580 /* Plain non-pivot tuple */
1581 Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
1582 if (dstatus->knowndeletable)
1583 deletable[ndeletable++] = idxoffnum;
1584 continue;
1585 }
1586
1587 /*
1588 * itup is a posting list tuple whose lowest deltids entry (which may
1589 * or may not be for the first TID from itup) is considered here now.
1590 * We should process all of the deltids entries for the posting list
1591 * together now, though (not just the lowest). Remember to skip over
1592 * later itup-related entries during later iterations of outermost
1593 * loop.
1594 */
1595 postingidxoffnum = idxoffnum; /* Remember work in outermost loop */
1596 nestedi = i; /* Initialize for first itup deltids entry */
1597 vacposting = NULL; /* Describes final action for itup */
1598 nitem = BTreeTupleGetNPosting(itup);
1599 for (int p = 0; p < nitem; p++)
1600 {
1601 ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
1602 int ptidcmp = -1;
1603
1604 /*
1605 * This nested loop reuses work across ptid TIDs taken from itup.
1606 * We take advantage of the fact that both itup's TIDs and deltids
1607 * entries (within a single itup/posting list grouping) must both
1608 * be in ascending TID order.
1609 */
1610 for (; nestedi < delstate->ndeltids; nestedi++)
1611 {
1612 TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
1613 TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
1614
1615 /* Stop once we get past all itup related deltids entries */
1616 Assert(tdstatus->idxoffnum >= idxoffnum);
1617 if (tdstatus->idxoffnum != idxoffnum)
1618 break;
1619
1620 /* Skip past non-deletable itup related entries up front */
1621 if (!tdstatus->knowndeletable)
1622 continue;
1623
1624 /* Entry is first partial ptid match (or an exact match)? */
1625 ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
1626 if (ptidcmp >= 0)
1627 {
1628 /* Greater than or equal (partial or exact) match... */
1629 break;
1630 }
1631 }
1632
1633 /* ...exact ptid match to a deletable deltids entry? */
1634 if (ptidcmp != 0)
1635 continue;
1636
1637 /* Exact match for deletable deltids entry -- ptid gets deleted */
1638 if (vacposting == NULL)
1639 {
1640 vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1641 nitem * sizeof(uint16));
1642 vacposting->itup = itup;
1643 vacposting->updatedoffset = idxoffnum;
1644 vacposting->ndeletedtids = 0;
1645 }
1646 vacposting->deletetids[vacposting->ndeletedtids++] = p;
1647 }
1648
1649 /* Final decision on itup, a posting list tuple */
1650
1651 if (vacposting == NULL)
1652 {
1653 /* No TIDs to delete from itup -- do nothing */
1654 }
1655 else if (vacposting->ndeletedtids == nitem)
1656 {
1657 /* Straight delete of itup (to delete all TIDs) */
1658 deletable[ndeletable++] = idxoffnum;
1659 /* Turns out we won't need granular information */
1660 pfree(vacposting);
1661 }
1662 else
1663 {
1664 /* Delete some (but not all) TIDs from itup */
1665 Assert(vacposting->ndeletedtids > 0 &&
1666 vacposting->ndeletedtids < nitem);
1667 updatable[nupdatable++] = vacposting;
1668 }
1669 }
1670
1671 /* Physically delete tuples (or TIDs) using deletable (or updatable) */
1672 _bt_delitems_delete(rel, buf, snapshotConflictHorizon, isCatalogRel,
1673 deletable, ndeletable, updatable, nupdatable);
1674
1675 /* be tidy */
1676 for (int i = 0; i < nupdatable; i++)
1677 pfree(updatable[i]);
1678}
uint16_t uint16
Definition: c.h:551
uint32 TransactionId
Definition: c.h:671
bool ItemPointerEquals(const ItemPointerData *pointer1, const ItemPointerData *pointer2)
Definition: itemptr.c:35
#define MaxIndexTuplesPerPage
Definition: itup.h:181
static void _bt_delitems_delete(Relation rel, Buffer buf, TransactionId snapshotConflictHorizon, bool isCatalogRel, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
Definition: nbtpage.c:1284
static int _bt_delitems_cmp(const void *a, const void *b)
Definition: nbtpage.c:1463
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:545
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
#define qsort(a, b, c, d)
Definition: port.h:499
uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.h:922
uint16 ndeletedtids
Definition: nbtree.h:921
IndexTuple itup
Definition: nbtree.h:917
OffsetNumber updatedoffset
Definition: nbtree.h:918
ItemPointerData tid
Definition: tableam.h:212
bool knowndeletable
Definition: tableam.h:219
OffsetNumber idxoffnum
Definition: tableam.h:218
static TransactionId table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
Definition: tableam.h:1331
#define InvalidTransactionId
Definition: transam.h:31

References _bt_delitems_cmp(), _bt_delitems_delete(), Assert(), TM_IndexDeleteOp::bottomup, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), buf, BufferGetPage(), BTVacuumPostingData::deletetids, TM_IndexDeleteOp::deltids, i, TM_IndexDelete::id, TM_IndexStatus::idxoffnum, InvalidOffsetNumber, InvalidTransactionId, ItemPointerCompare(), ItemPointerEquals(), BTVacuumPostingData::itup, TM_IndexStatus::knowndeletable, MaxIndexTuplesPerPage, BTVacuumPostingData::ndeletedtids, TM_IndexDeleteOp::ndeltids, OffsetNumberIsValid, PageGetItem(), PageGetItemId(), palloc(), pfree(), qsort, RelationIsAccessibleInLogicalDecoding, TM_IndexDeleteOp::status, IndexTupleData::t_tid, table_index_delete_tuples(), TM_IndexDelete::tid, BTVacuumPostingData::updatedoffset, and XLogStandbyInfoActive.

Referenced by _bt_bottomupdel_pass(), and _bt_simpledel_pass().
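
A sketch of how a simple-deletion caller might prepare the TM_IndexDeleteOp before calling in, loosely following _bt_simpledel_pass(); the field names assume the current TM_IndexDeleteOp layout in tableam.h and the arrays are sized for the worst case:

TM_IndexDeleteOp delstate;

delstate.irel = rel;
delstate.iblknum = BufferGetBlockNumber(buf);
delstate.bottomup = false;
delstate.bottomupfreespace = 0;
delstate.ndeltids = 0;
delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));

/* ... append one TM_IndexDelete/TM_IndexStatus pair per candidate TID ... */

_bt_delitems_delete_check(rel, buf, heapRel, &delstate);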

◆ _bt_delitems_vacuum()

void _bt_delitems_vacuum ( Relation  rel,
Buffer  buf,
OffsetNumber deletable,
int  ndeletable,
BTVacuumPosting updatable,
int  nupdatable 
)

Definition at line 1155 of file nbtpage.c.

1158{
1159 Page page = BufferGetPage(buf);
1160 BTPageOpaque opaque;
1161 bool needswal = RelationNeedsWAL(rel);
1162 char *updatedbuf = NULL;
1163 Size updatedbuflen = 0;
1164 OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1165
1166 /* Shouldn't be called unless there's something to do */
1167 Assert(ndeletable > 0 || nupdatable > 0);
1168
1169 /* Generate new version of posting lists without deleted TIDs */
1170 if (nupdatable > 0)
1171 updatedbuf = _bt_delitems_update(updatable, nupdatable,
1172 updatedoffsets, &updatedbuflen,
1173 needswal);
1174
1175 /* No ereport(ERROR) until changes are logged */
1176 START_CRIT_SECTION();
1177
1178 /*
1179 * Handle posting tuple updates.
1180 *
1181 * Deliberately do this before handling simple deletes. If we did it the
1182 * other way around (i.e. WAL record order -- simple deletes before
1183 * updates) then we'd have to make compensating changes to the 'updatable'
1184 * array of offset numbers.
1185 *
1186 * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
1187 * happens to already be set. It's important that we not interfere with
1188 * any future simple index tuple deletion operations.
1189 */
1190 for (int i = 0; i < nupdatable; i++)
1191 {
1192 OffsetNumber updatedoffset = updatedoffsets[i];
1193 IndexTuple itup;
1194 Size itemsz;
1195
1196 itup = updatable[i]->itup;
1197 itemsz = MAXALIGN(IndexTupleSize(itup));
1198 if (!PageIndexTupleOverwrite(page, updatedoffset, itup, itemsz))
1199 elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1200 BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1201 }
1202
1203 /* Now handle simple deletes of entire tuples */
1204 if (ndeletable > 0)
1205 PageIndexMultiDelete(page, deletable, ndeletable);
1206
1207 /*
1208 * We can clear the vacuum cycle ID since this page has certainly been
1209 * processed by the current vacuum scan.
1210 */
1211 opaque = BTPageGetOpaque(page);
1212 opaque->btpo_cycleid = 0;
1213
1214 /*
1215 * Clear the BTP_HAS_GARBAGE page flag.
1216 *
1217 * This flag indicates the presence of LP_DEAD items on the page (though
1218 * not reliably). Note that we only rely on it with pg_upgrade'd
1219 * !heapkeyspace indexes. That's why clearing it here won't usually
1220 * interfere with simple index tuple deletion.
1221 */
1222 opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1223
1224 MarkBufferDirty(buf);
1225
1226 /* XLOG stuff */
1227 if (needswal)
1228 {
1229 XLogRecPtr recptr;
1230 xl_btree_vacuum xlrec_vacuum;
1231
1232 xlrec_vacuum.ndeleted = ndeletable;
1233 xlrec_vacuum.nupdated = nupdatable;
1234
1235 XLogBeginInsert();
1236 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1237 XLogRegisterData(&xlrec_vacuum, SizeOfBtreeVacuum);
1238
1239 if (ndeletable > 0)
1240 XLogRegisterBufData(0, deletable,
1241 ndeletable * sizeof(OffsetNumber));
1242
1243 if (nupdatable > 0)
1244 {
1245 XLogRegisterBufData(0, updatedoffsets,
1246 nupdatable * sizeof(OffsetNumber));
1247 XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1248 }
1249
1250 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
1251
1252 PageSetLSN(page, recptr);
1253 }
1254
1255 END_CRIT_SECTION();
1256
1257 /* can't leak memory here */
1258 if (updatedbuf != NULL)
1259 pfree(updatedbuf);
1260 /* free tuples allocated within _bt_delitems_update() */
1261 for (int i = 0; i < nupdatable; i++)
1262 pfree(updatable[i]->itup);
1263}
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:1160
bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, const void *newtup, Size newsize)
Definition: bufpage.c:1404
#define PANIC
Definition: elog.h:42
static char * _bt_delitems_update(BTVacuumPosting *updatable, int nupdatable, OffsetNumber *updatedoffsets, Size *updatedbuflen, bool needswal)
Definition: nbtpage.c:1404
#define SizeOfBtreeVacuum
Definition: nbtxlog.h:234
#define XLOG_BTREE_VACUUM
Definition: nbtxlog.h:39
BTCycleId btpo_cycleid
Definition: nbtree.h:69
uint16 ndeleted
Definition: nbtxlog.h:222
uint16 nupdated
Definition: nbtxlog.h:223

References _bt_delitems_update(), Assert(), BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, buf, BufferGetBlockNumber(), BufferGetPage(), elog, END_CRIT_SECTION, i, IndexTupleSize(), BTVacuumPostingData::itup, MarkBufferDirty(), MAXALIGN, MaxIndexTuplesPerPage, xl_btree_vacuum::ndeleted, xl_btree_vacuum::nupdated, PageIndexMultiDelete(), PageIndexTupleOverwrite(), PageSetLSN(), PANIC, pfree(), REGBUF_STANDARD, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeVacuum, START_CRIT_SECTION, XLOG_BTREE_VACUUM, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by btvacuumpage().
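
Usage note: btvacuumpage() accumulates the two arrays while scanning the page and only makes this call when there is actually something to delete or update, along these lines:

if (ndeletable > 0 || nupdatable > 0)
    _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
                        updatable, nupdatable);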

◆ _bt_doinsert()

bool _bt_doinsert ( Relation  rel,
IndexTuple  itup,
IndexUniqueCheck  checkUnique,
bool  indexUnchanged,
Relation  heapRel 
)

Definition at line 104 of file nbtinsert.c.

107{
108 bool is_unique = false;
109 BTInsertStateData insertstate;
110 BTScanInsert itup_key;
111 BTStack stack;
112 bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
113
114 /* we need an insertion scan key to do our search, so build one */
115 itup_key = _bt_mkscankey(rel, itup);
116
117 if (checkingunique)
118 {
119 if (!itup_key->anynullkeys)
120 {
121 /* No (heapkeyspace) scantid until uniqueness established */
122 itup_key->scantid = NULL;
123 }
124 else
125 {
126 /*
127 * Scan key for new tuple contains NULL key values. Bypass
128 * checkingunique steps. They are unnecessary because core code
129 * considers NULL unequal to every value, including NULL.
130 *
131 * This optimization avoids O(N^2) behavior within the
132 * _bt_findinsertloc() heapkeyspace path when a unique index has a
133 * large number of "duplicates" with NULL key values.
134 */
135 checkingunique = false;
136 /* Tuple is unique in the sense that core code cares about */
137 Assert(checkUnique != UNIQUE_CHECK_EXISTING);
138 is_unique = true;
139 }
140 }
141
142 /*
143 * Fill in the BTInsertState working area, to track the current page and
144 * position within the page to insert on.
145 *
146 * Note that itemsz is passed down to lower level code that deals with
147 * inserting the item. It must be MAXALIGN()'d. This ensures that space
148 * accounting code consistently considers the alignment overhead that we
149 * expect PageAddItem() will add later. (Actually, index_form_tuple() is
150 * already conservative about alignment, but we don't rely on that from
151 * this distance. Besides, preserving the "true" tuple size in index
152 * tuple headers for the benefit of nbtsplitloc.c might happen someday.
153 * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
154 */
155 insertstate.itup = itup;
156 insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
157 insertstate.itup_key = itup_key;
158 insertstate.bounds_valid = false;
159 insertstate.buf = InvalidBuffer;
160 insertstate.postingoff = 0;
161
162search:
163
164 /*
165 * Find and lock the leaf page that the tuple should be added to by
166 * searching from the root page. insertstate.buf will hold a buffer that
167 * is locked in exclusive mode afterwards.
168 */
169 stack = _bt_search_insert(rel, heapRel, &insertstate);
170
171 /*
172 * checkingunique inserts are not allowed to go ahead when two tuples with
173 * equal key attribute values would be visible to new MVCC snapshots once
174 * the xact commits. Check for conflicts in the locked page/buffer (if
175 * needed) here.
176 *
177 * It might be necessary to check a page to the right in _bt_check_unique,
178 * though that should be very rare. In practice the first page the value
179 * could be on (with scantid omitted) is almost always also the only page
180 * that a matching tuple might be found on. This is due to the behavior
181 * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
182 * only be allowed to cross a page boundary when there is no candidate
183 * leaf page split point that avoids it. Also, _bt_check_unique can use
184 * the leaf page high key to determine that there will be no duplicates on
185 * the right sibling without actually visiting it (it uses the high key in
186 * cases where the new item happens to belong at the far right of the leaf
187 * page).
188 *
189 * NOTE: obviously, _bt_check_unique can only detect keys that are already
190 * in the index; so it cannot defend against concurrent insertions of the
191 * same key. We protect against that by means of holding a write lock on
192 * the first page the value could be on, with omitted/-inf value for the
193 * implicit heap TID tiebreaker attribute. Any other would-be inserter of
194 * the same key must acquire a write lock on the same page, so only one
195 * would-be inserter can be making the check at one time. Furthermore,
196 * once we are past the check we hold write locks continuously until we
197 * have performed our insertion, so no later inserter can fail to see our
198 * insertion. (This requires some care in _bt_findinsertloc.)
199 *
200 * If we must wait for another xact, we release the lock while waiting,
201 * and then must perform a new search.
202 *
203 * For a partial uniqueness check, we don't wait for the other xact. Just
204 * let the tuple in and return false for possibly non-unique, or true for
205 * definitely unique.
206 */
207 if (checkingunique)
208 {
209 TransactionId xwait;
210 uint32 speculativeToken;
211
212 xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
213 &is_unique, &speculativeToken);
214
215 if (unlikely(TransactionIdIsValid(xwait)))
216 {
217 /* Have to wait for the other guy ... */
218 _bt_relbuf(rel, insertstate.buf);
219 insertstate.buf = InvalidBuffer;
220
221 /*
222 * If it's a speculative insertion, wait for it to finish (ie. to
223 * go ahead with the insertion, or kill the tuple). Otherwise
224 * wait for the transaction to finish as usual.
225 */
226 if (speculativeToken)
227 SpeculativeInsertionWait(xwait, speculativeToken);
228 else
229 XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
230
231 /* start over... */
232 if (stack)
233 _bt_freestack(stack);
234 goto search;
235 }
236
237 /* Uniqueness is established -- restore heap tid as scantid */
238 if (itup_key->heapkeyspace)
239 itup_key->scantid = &itup->t_tid;
240 }
241
242 if (checkUnique != UNIQUE_CHECK_EXISTING)
243 {
244 OffsetNumber newitemoff;
245
246 /*
247 * The only conflict predicate locking cares about for indexes is when
248 * an index tuple insert conflicts with an existing lock. We don't
249 * know the actual page we're going to insert on for sure just yet in
250 * checkingunique and !heapkeyspace cases, but it's okay to use the
251 * first page the value could be on (with scantid omitted) instead.
252 */
253 CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
254
255 /*
256 * Do the insertion. Note that insertstate contains cached binary
257 * search bounds established within _bt_check_unique when insertion is
258 * checkingunique.
259 */
260 newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
261 indexUnchanged, stack, heapRel);
262 _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
263 stack, itup, insertstate.itemsz, newitemoff,
264 insertstate.postingoff, false);
265 }
266 else
267 {
268 /* just release the buffer */
269 _bt_relbuf(rel, insertstate.buf);
270 }
271
272 /* be tidy */
273 if (stack)
274 _bt_freestack(stack);
275 pfree(itup_key);
276
277 return is_unique;
278}
uint32_t uint32
Definition: c.h:552
@ UNIQUE_CHECK_NO
Definition: genam.h:144
@ UNIQUE_CHECK_EXISTING
Definition: genam.h:147
void SpeculativeInsertionWait(TransactionId xid, uint32 token)
Definition: lmgr.c:828
void XactLockTableWait(TransactionId xid, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper)
Definition: lmgr.c:663
@ XLTW_InsertIndex
Definition: lmgr.h:31
static BTStack _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
Definition: nbtinsert.c:319
static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, bool indexUnchanged, BTStack stack, Relation heapRel)
Definition: nbtinsert.c:817
static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, Size itemsz, OffsetNumber newitemoff, int postingoff, bool split_only_page)
Definition: nbtinsert.c:1107
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition: nbtinsert.c:410
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:151
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:59
void CheckForSerializableConflictIn(Relation relation, const ItemPointerData *tid, BlockNumber blkno)
Definition: predicate.c:4334
IndexTuple itup
Definition: nbtree.h:822
ItemPointer scantid
Definition: nbtree.h:802
bool heapkeyspace
Definition: nbtree.h:797
bool anynullkeys
Definition: nbtree.h:799
#define TransactionIdIsValid(xid)
Definition: transam.h:41

References _bt_check_unique(), _bt_findinsertloc(), _bt_freestack(), _bt_insertonpg(), _bt_mkscankey(), _bt_relbuf(), _bt_search_insert(), BTScanInsertData::anynullkeys, Assert(), BTInsertStateData::bounds_valid, BTInsertStateData::buf, BufferGetBlockNumber(), CheckForSerializableConflictIn(), BTScanInsertData::heapkeyspace, IndexTupleSize(), InvalidBuffer, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, MAXALIGN, pfree(), BTInsertStateData::postingoff, BTScanInsertData::scantid, SpeculativeInsertionWait(), IndexTupleData::t_tid, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_NO, unlikely, XactLockTableWait(), and XLTW_InsertIndex.

Referenced by btinsert().
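
A sketch of the access-method-level caller, loosely following btinsert(): build the index tuple, stamp it with the new heap TID, and let _bt_doinsert() locate the leaf page (values, isnull and ht_ctid come from the AM insert callback):

IndexTuple  itup;
bool        result;

itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
itup->t_tid = *ht_ctid;             /* point the index tuple at the heap row */

result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);

pfree(itup);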

◆ _bt_end_vacuum()

void _bt_end_vacuum ( Relation  rel)

Definition at line 526 of file nbtutils.c.

527{
528 int i;
529
530 LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
531
532 /* Find the array entry */
533 for (i = 0; i < btvacinfo->num_vacuums; i++)
534 {
535 BTOneVacInfo *vac = &btvacinfo->vacuums[i];
536
537 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
538 vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
539 {
540 /* Remove it by shifting down the last entry */
541 *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
542 btvacinfo->num_vacuums--;
543 break;
544 }
545 }
546
547 LWLockRelease(BtreeVacuumLock);
548}
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1174
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1894
@ LW_EXCLUSIVE
Definition: lwlock.h:112
static BTVacInfo * btvacinfo
Definition: nbtutils.c:422
LockRelId relid
Definition: nbtutils.c:410
int num_vacuums
Definition: nbtutils.c:417
BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtutils.c:419
LockRelId lockRelId
Definition: rel.h:46
Oid relId
Definition: rel.h:40
Oid dbId
Definition: rel.h:41
LockInfoData rd_lockInfo
Definition: rel.h:114

References btvacinfo, LockRelId::dbId, i, LockInfoData::lockRelId, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTVacInfo::num_vacuums, RelationData::rd_lockInfo, BTOneVacInfo::relid, LockRelId::relId, and BTVacInfo::vacuums.

Referenced by _bt_end_vacuum_callback(), and btbulkdelete().

◆ _bt_end_vacuum_callback()

void _bt_end_vacuum_callback ( int  code,
Datum  arg 
)

Definition at line 554 of file nbtutils.c.

555{
556 _bt_end_vacuum((Relation) DatumGetPointer(arg));
557}
void _bt_end_vacuum(Relation rel)
Definition: nbtutils.c:526
void * arg
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:322

References _bt_end_vacuum(), arg, and DatumGetPointer().

Referenced by btbulkdelete().
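
A sketch of how btbulkdelete() pairs this callback with _bt_start_vacuum()/_bt_end_vacuum(), so the shared vacuum-cycle-ID entry is removed even if the scan errors out:

cycleid = _bt_start_vacuum(rel);

PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
{
    btvacuumscan(info, stats, callback, callback_state, cycleid);
}
PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
_bt_end_vacuum(rel);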

◆ _bt_findsplitloc()

OffsetNumber _bt_findsplitloc ( Relation  rel,
Page  origpage,
OffsetNumber  newitemoff,
Size  newitemsz,
IndexTuple  newitem,
bool *  newitemonleft 
)

Definition at line 130 of file nbtsplitloc.c.

136{
137 BTPageOpaque opaque;
138 int leftspace,
139 rightspace,
140 olddataitemstotal,
141 olddataitemstoleft,
142 perfectpenalty,
143 leaffillfactor;
144 FindSplitData state;
145 FindSplitStrat strategy;
146 ItemId itemid;
147 OffsetNumber offnum,
148 maxoff,
149 firstrightoff;
150 double fillfactormult;
151 bool usemult;
152 SplitPoint leftpage,
153 rightpage;
154
155 opaque = BTPageGetOpaque(origpage);
156 maxoff = PageGetMaxOffsetNumber(origpage);
157
158 /* Total free space available on a btree page, after fixed overhead */
159 leftspace = rightspace =
160 PageGetPageSize(origpage) - SizeOfPageHeaderData -
161 MAXALIGN(sizeof(BTPageOpaqueData));
162
163 /* The right page will have the same high key as the old page */
164 if (!P_RIGHTMOST(opaque))
165 {
166 itemid = PageGetItemId(origpage, P_HIKEY);
167 rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
168 sizeof(ItemIdData));
169 }
170
171 /* Count up total space in data items before actually scanning 'em */
172 olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(origpage);
173 leaffillfactor = BTGetFillFactor(rel);
174
175 /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
176 newitemsz += sizeof(ItemIdData);
177 state.rel = rel;
178 state.origpage = origpage;
179 state.newitem = newitem;
180 state.newitemsz = newitemsz;
181 state.is_leaf = P_ISLEAF(opaque);
182 state.is_rightmost = P_RIGHTMOST(opaque);
183 state.leftspace = leftspace;
184 state.rightspace = rightspace;
185 state.olddataitemstotal = olddataitemstotal;
186 state.minfirstrightsz = SIZE_MAX;
187 state.newitemoff = newitemoff;
188
189 /* newitem cannot be a posting list item */
190 Assert(!BTreeTupleIsPosting(newitem));
191
192 /*
193 * nsplits should never exceed maxoff because there will be at most as
194 * many candidate split points as there are points _between_ tuples, once
195 * you imagine that the new item is already on the original page (the
196 * final number of splits may be slightly lower because not all points
197 * between tuples will be legal).
198 */
199 state.maxsplits = maxoff;
200 state.splits = palloc_array(SplitPoint, state.maxsplits);
201 state.nsplits = 0;
202
203 /*
204 * Scan through the data items and calculate space usage for a split at
205 * each possible position
206 */
207 olddataitemstoleft = 0;
208
209 for (offnum = P_FIRSTDATAKEY(opaque);
210 offnum <= maxoff;
211 offnum = OffsetNumberNext(offnum))
212 {
213 Size itemsz;
214
215 itemid = PageGetItemId(origpage, offnum);
216 itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
217
218 /*
219 * When item offset number is not newitemoff, neither side of the
220 * split can be newitem. Record a split after the previous data item
221 * from original page, but before the current data item from original
222 * page. (_bt_recsplitloc() will reject the split when there are no
223 * previous items, which we rely on.)
224 */
225 if (offnum < newitemoff)
226 _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
227 else if (offnum > newitemoff)
228 _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
229 else
230 {
231 /*
232 * Record a split after all "offnum < newitemoff" original page
233 * data items, but before newitem
234 */
235 _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
236
237 /*
238 * Record a split after newitem, but before data item from
239 * original page at offset newitemoff/current offset
240 */
241 _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
242 }
243
244 olddataitemstoleft += itemsz;
245 }
246
247 /*
248 * Record a split after all original page data items, but before newitem.
249 * (Though only when it's possible that newitem will end up alone on new
250 * right page.)
251 */
252 Assert(olddataitemstoleft == olddataitemstotal);
253 if (newitemoff > maxoff)
254 _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0);
255
256 /*
257 * I believe it is not possible to fail to find a feasible split, but just
258 * in case ...
259 */
260 if (state.nsplits == 0)
261 elog(ERROR, "could not find a feasible split point for index \"%s\"",
262 RelationGetRelationName(rel));
263
264 /*
265 * Start search for a split point among list of legal split points. Give
266 * primary consideration to equalizing available free space in each half
267 * of the split initially (start with default strategy), while applying
268 * rightmost and split-after-new-item optimizations where appropriate.
269 * Either of the two other fallback strategies may be required for cases
270 * with a large number of duplicates around the original/space-optimal
271 * split point.
272 *
273 * Default strategy gives some weight to suffix truncation in deciding a
274 * split point on leaf pages. It attempts to select a split point where a
275 * distinguishing attribute appears earlier in the new high key for the
276 * left side of the split, in order to maximize the number of trailing
277 * attributes that can be truncated away. Only candidate split points
278 * that imply an acceptable balance of free space on each side are
279 * considered. See _bt_defaultinterval().
280 */
281 if (!state.is_leaf)
282 {
283 /* fillfactormult only used on rightmost page */
284 usemult = state.is_rightmost;
285 fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0;
286 }
287 else if (state.is_rightmost)
288 {
289 /* Rightmost leaf page -- fillfactormult always used */
290 usemult = true;
291 fillfactormult = leaffillfactor / 100.0;
292 }
293 else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
294 {
295 /*
296 * New item inserted at rightmost point among a localized grouping on
297 * a leaf page -- apply "split after new item" optimization, either by
298 * applying leaf fillfactor multiplier, or by choosing the exact split
299 * point that leaves newitem as lastleft. (usemult is set for us.)
300 */
301 if (usemult)
302 {
303 /* fillfactormult should be set based on leaf fillfactor */
304 fillfactormult = leaffillfactor / 100.0;
305 }
306 else
307 {
308 /* find precise split point after newitemoff */
309 for (int i = 0; i < state.nsplits; i++)
310 {
311 SplitPoint *split = state.splits + i;
312
313 if (split->newitemonleft &&
314 newitemoff == split->firstrightoff)
315 {
316 pfree(state.splits);
317 *newitemonleft = true;
318 return newitemoff;
319 }
320 }
321
322 /*
323 * Cannot legally split after newitemoff; proceed with split
324 * without using fillfactor multiplier. This is defensive, and
325 * should never be needed in practice.
326 */
327 fillfactormult = 0.50;
328 }
329 }
330 else
331 {
332 /* Other leaf page. 50:50 page split. */
333 usemult = false;
334 /* fillfactormult not used, but be tidy */
335 fillfactormult = 0.50;
336 }
337
338 /*
339 * Save leftmost and rightmost splits for page before original ordinal
340 * sort order is lost by delta/fillfactormult sort
341 */
342 leftpage = state.splits[0];
343 rightpage = state.splits[state.nsplits - 1];
344
345 /* Give split points a fillfactormult-wise delta, and sort on deltas */
346 _bt_deltasortsplits(&state, fillfactormult, usemult);
347
348 /* Determine split interval for default strategy */
349 state.interval = _bt_defaultinterval(&state);
350
351 /*
352 * Determine if default strategy/split interval will produce a
353 * sufficiently distinguishing split, or if we should change strategies.
354 * Alternative strategies change the range of split points that are
355 * considered acceptable (split interval), and possibly change
356 * fillfactormult, in order to deal with pages with a large number of
357 * duplicates gracefully.
358 *
359 * Pass low and high splits for the entire page (actually, they're for an
360 * imaginary version of the page that includes newitem). These are used
361 * when the initial split interval encloses split points that are full of
362 * duplicates, and we need to consider if it's even possible to avoid
363 * appending a heap TID.
364 */
365 perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);
366
367 if (strategy == SPLIT_DEFAULT)
368 {
369 /*
370 * Default strategy worked out (always works out with internal page).
371 * Original split interval still stands.
372 */
373 }
374
375 /*
376 * Many duplicates strategy is used when a heap TID would otherwise be
377 * appended, but the page isn't completely full of logical duplicates.
378 *
379 * The split interval is widened to include all legal candidate split
380 * points. There might be as few as two distinct values in the whole-page
381 * split interval, though it's also possible that most of the values on
382 * the page are unique. The final split point will either be to the
383 * immediate left or to the immediate right of the group of duplicate
384 * tuples that enclose the first/delta-optimal split point (perfect
385 * penalty was set so that the lowest delta split point that avoids
386 * appending a heap TID will be chosen). Maximizing the number of
387 * attributes that can be truncated away is not a goal of the many
388 * duplicates strategy.
389 *
390 * Single value strategy is used when it is impossible to avoid appending
391 * a heap TID. It arranges to leave the left page very full. This
392 * maximizes space utilization in cases where tuples with the same
393 * attribute values span many pages. Newly inserted duplicates will tend
394 * to have higher heap TID values, so we'll end up splitting to the right
395 * consistently. (Single value strategy is harmless though not
396 * particularly useful with !heapkeyspace indexes.)
397 */
398 else if (strategy == SPLIT_MANY_DUPLICATES)
399 {
400 Assert(state.is_leaf);
401 /* Shouldn't try to truncate away extra user attributes */
402 Assert(perfectpenalty ==
403 IndexRelationGetNumberOfKeyAttributes(state.rel));
404 /* No need to resort splits -- no change in fillfactormult/deltas */
405 state.interval = state.nsplits;
406 }
407 else if (strategy == SPLIT_SINGLE_VALUE)
408 {
409 Assert(state.is_leaf);
410 /* Split near the end of the page */
411 usemult = true;
412 fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
413 /* Resort split points with new delta */
414 _bt_deltasortsplits(&state, fillfactormult, usemult);
415 /* Appending a heap TID is unavoidable, so interval of 1 is fine */
416 state.interval = 1;
417 }
418
419 /*
420 * Search among acceptable split points (using final split interval) for
421 * the entry that has the lowest penalty, and is therefore expected to
422 * maximize fan-out. Sets *newitemonleft for us.
423 */
424 firstrightoff = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
425 strategy);
426 pfree(state.splits);
427
428 return firstrightoff;
429}
static Size PageGetPageSize(const PageData *page)
Definition: bufpage.h:276
#define BTREE_SINGLEVAL_FILLFACTOR
Definition: nbtree.h:203
#define BTGetFillFactor(relation)
Definition: nbtree.h:1127
#define BTREE_NONLEAF_FILLFACTOR
Definition: nbtree.h:202
static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult, bool usemult)
Definition: nbtsplitloc.c:567
static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage, SplitPoint *rightpage, FindSplitStrat *strategy)
Definition: nbtsplitloc.c:935
static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, int leaffillfactor, bool *usemult)
Definition: nbtsplitloc.c:631
static void _bt_recsplitloc(FindSplitData *state, OffsetNumber firstrightoff, bool newitemonleft, int olddataitemstoleft, Size firstrightofforigpagetuplesz)
Definition: nbtsplitloc.c:450
FindSplitStrat
Definition: nbtsplitloc.c:22
@ SPLIT_DEFAULT
Definition: nbtsplitloc.c:24
@ SPLIT_MANY_DUPLICATES
Definition: nbtsplitloc.c:25
@ SPLIT_SINGLE_VALUE
Definition: nbtsplitloc.c:26
static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft, FindSplitStrat strategy)
Definition: nbtsplitloc.c:789
static int _bt_defaultinterval(FindSplitData *state)
Definition: nbtsplitloc.c:877
bool newitemonleft
Definition: nbtsplitloc.c:38
OffsetNumber firstrightoff
Definition: nbtsplitloc.c:37

References _bt_afternewitemoff(), _bt_bestsplitloc(), _bt_defaultinterval(), _bt_deltasortsplits(), _bt_recsplitloc(), _bt_strategy(), Assert(), BTGetFillFactor, BTPageGetOpaque, BTREE_NONLEAF_FILLFACTOR, BTREE_SINGLEVAL_FILLFACTOR, BTreeTupleIsPosting(), elog, ERROR, SplitPoint::firstrightoff, i, IndexRelationGetNumberOfKeyAttributes, ItemIdGetLength, MAXALIGN, SplitPoint::newitemonleft, OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_RIGHTMOST, PageGetExactFreeSpace(), PageGetItemId(), PageGetMaxOffsetNumber(), PageGetPageSize(), palloc_array, pfree(), RelationGetRelationName, SizeOfPageHeaderData, SPLIT_DEFAULT, SPLIT_MANY_DUPLICATES, and SPLIT_SINGLE_VALUE.

Referenced by _bt_split().
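
For orientation, here is a hedged sketch of the caller's side (the real call site is _bt_split(); rel, origpage, newitemoff, newitemsz and newitem stand for the caller's own variables). The return value is the offset of the first tuple that belongs on the new right page, and *newitemonleft reports which half receives the incoming tuple.

    bool         newitemonleft;
    OffsetNumber firstrightoff;

    /* choose a split point for origpage, accounting for the incoming tuple */
    firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff,
                                     newitemsz, newitem, &newitemonleft);
    /* tuples at firstrightoff and beyond are moved to the new right sibling */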

◆ _bt_finish_split()

void _bt_finish_split ( Relation  rel,
Relation  heaprel,
Buffer  lbuf,
BTStack  stack 
)

Definition at line 2256 of file nbtinsert.c.

2257{
2258 Page lpage = BufferGetPage(lbuf);
2259 BTPageOpaque lpageop = BTPageGetOpaque(lpage);
2260 Buffer rbuf;
2261 Page rpage;
2262 BTPageOpaque rpageop;
2263 bool wasroot;
2264 bool wasonly;
2265
2266 Assert(P_INCOMPLETE_SPLIT(lpageop));
2267 Assert(heaprel != NULL);
2268
2269 /* Lock right sibling, the one missing the downlink */
2270 rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2271 rpage = BufferGetPage(rbuf);
2272 rpageop = BTPageGetOpaque(rpage);
2273
2274 /* Could this be a root split? */
2275 if (!stack)
2276 {
2277 Buffer metabuf;
2278 Page metapg;
2279 BTMetaPageData *metad;
2280
2281 /* acquire lock on the metapage */
2282 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2283 metapg = BufferGetPage(metabuf);
2284 metad = BTPageGetMeta(metapg);
2285
2286 wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
2287
2288 _bt_relbuf(rel, metabuf);
2289 }
2290 else
2291 wasroot = false;
2292
2293 /* Was this the only page on the level before split? */
2294 wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2295
2296 INJECTION_POINT("nbtree-finish-incomplete-split", NULL);
2297 elog(DEBUG1, "finishing incomplete split of %u/%u",
2298 BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2299
2300 _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
2301}
#define INJECTION_POINT(name, arg)
static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf, Buffer rbuf, BTStack stack, bool isroot, bool isonly)
Definition: nbtinsert.c:2114
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:846
#define BTPageGetMeta(p)
Definition: nbtree.h:122
#define P_LEFTMOST(opaque)
Definition: nbtree.h:219
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:228
#define BTREE_METAPAGE
Definition: nbtree.h:149
#define BT_WRITE
Definition: nbtree.h:731
BlockNumber btm_root
Definition: nbtree.h:108
BlockNumber btpo_next
Definition: nbtree.h:66

References _bt_getbuf(), _bt_insert_parent(), _bt_relbuf(), Assert(), BT_WRITE, BTMetaPageData::btm_root, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTREE_METAPAGE, BufferGetBlockNumber(), BufferGetPage(), DEBUG1, elog, INJECTION_POINT, P_INCOMPLETE_SPLIT, P_LEFTMOST, and P_RIGHTMOST.

Referenced by _bt_getstackbuf(), _bt_moveright(), and _bt_stepright().
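
A minimal sketch of the usual call pattern, mirroring the loop in _bt_getstackbuf() further down: a caller that encounters a page still flagged BTP_INCOMPLETE_SPLIT finishes the split before doing anything else with it.

    if (P_INCOMPLETE_SPLIT(opaque))
    {
        /* right sibling's downlink is missing; repair it before going on */
        _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
        continue;               /* then re-read the page and retry */
    }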

◆ _bt_first()

bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 877 of file nbtsearch.c.

878{
879 Relation rel = scan->indexRelation;
880 BTScanOpaque so = (BTScanOpaque) scan->opaque;
881 BTStack stack;
882 OffsetNumber offnum;
883 BTScanInsertData inskey;
884 ScanKey startKeys[INDEX_MAX_KEYS];
885 ScanKeyData notnullkey;
886 int keysz = 0;
887 StrategyNumber strat_total = InvalidStrategy;
888 BlockNumber blkno = InvalidBlockNumber,
889 lastcurrblkno;
890
891 Assert(!BTScanPosIsValid(so->currPos));
892
893 /*
894 * Examine the scan keys and eliminate any redundant keys; also mark the
895 * keys that must be matched to continue the scan.
896 */
897 _bt_preprocess_keys(scan);
898
899 /*
900 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
901 * never be satisfied (eg, x == 1 AND x > 2).
902 */
903 if (!so->qual_ok)
904 {
905 Assert(!so->needPrimScan);
906 _bt_parallel_done(scan);
907 return false;
908 }
909
910 /*
911 * If this is a parallel scan, we must seize the scan. _bt_readfirstpage
912 * will likely release the parallel scan later on.
913 */
914 if (scan->parallel_scan != NULL &&
915 !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
916 return false;
917
918 /*
919 * Initialize the scan's arrays (if any) for the current scan direction
920 * (except when they were already set to later values as part of
921 * scheduling the primitive index scan that is now underway)
922 */
923 if (so->numArrayKeys && !so->needPrimScan)
924 _bt_start_array_keys(scan, dir);
925
926 if (blkno != InvalidBlockNumber)
927 {
928 /*
929 * We anticipated calling _bt_search, but another worker beat us to it.
930 * _bt_readnextpage releases the scan for us (not _bt_readfirstpage).
931 */
932 Assert(scan->parallel_scan != NULL);
933 Assert(!so->needPrimScan);
934 Assert(blkno != P_NONE);
935
936 if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
937 return false;
938
939 _bt_returnitem(scan, so);
940 return true;
941 }
942
943 /*
944 * Count an indexscan for stats, now that we know that we'll call
945 * _bt_search/_bt_endpoint below
946 */
947 pgstat_count_index_scan(rel);
948 if (scan->instrument)
949 scan->instrument->nsearches++;
950
951 /*----------
952 * Examine the scan keys to discover where we need to start the scan.
953 * The selected scan keys (at most one per index column) are remembered by
954 * storing their addresses into the local startKeys[] array. The final
955 * startKeys[] entry's strategy is set in strat_total. (Actually, there
956 * are a couple of cases where we force a less/more restrictive strategy.)
957 *
958 * We must use the key that was marked required (in the direction opposite
959 * our own scan's) during preprocessing. Each index attribute can only
960 * have one such required key. In general, the keys that we use to find
961 * an initial position when scanning forwards are the same keys that end
962 * the scan on the leaf level when scanning backwards (and vice-versa).
963 *
964 * When the scan keys include cross-type operators, _bt_preprocess_keys
965 * may not be able to eliminate redundant keys; in such cases it will
966 * arbitrarily pick a usable key for each attribute (and scan direction),
967 * ensuring that there is no more than one key required in each direction.
968 * We stop considering further keys once we reach the first nonrequired
969 * key (which must come after all required keys), so this can't affect us.
970 *
971 * The required keys that we use as starting boundaries have to be =, >,
972 * or >= keys for a forward scan or =, <, <= keys for a backwards scan.
973 * We can use keys for multiple attributes so long as the prior attributes
974 * had only =, >= (resp. =, <=) keys. These rules are very similar to the
975 * rules that preprocessing used to determine which keys to mark required.
976 * We cannot always use every required key as a positioning key, though.
977 * Skip arrays necessitate independently applying our own rules here.
978 * Skip arrays are always generally considered = array keys, but we'll
979 * nevertheless treat them as inequalities at certain points of the scan.
980 * When that happens, it _might_ have implications for the number of
981 * required keys that we can safely use for initial positioning purposes.
982 *
983 * For example, a forward scan with a skip array on its leading attribute
984 * (with no low_compare/high_compare) will have at least two required scan
985 * keys, but we won't use any of them as boundary keys during the scan's
986 * initial call here. Our positioning key during the first call here can
987 * be thought of as representing "> -infinity". Similarly, if such a skip
988 * array's low_compare is "a > 'foo'", then we position using "a > 'foo'"
989 * during the scan's initial call here; a lower-order key such as "b = 42"
990 * can't be used until the "a" array advances beyond MINVAL/low_compare.
991 *
992 * On the other hand, if such a skip array's low_compare was "a >= 'foo'",
993 * then we _can_ use "a >= 'foo' AND b = 42" during the initial call here.
994 * A subsequent call here might have us use "a = 'fop' AND b = 42". Note
995 * that we treat = and >= as equivalent when scanning forwards (just as we
996 * treat = and <= as equivalent when scanning backwards). We effectively
997 * do the same thing (though with a distinct "a" element/value) each time.
998 *
999 * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP
1000 * array keys whose array is "null_elem=true") imply a NOT NULL qualifier.
1001 * If the index stores nulls at the end of the index we'll be starting
1002 * from, and we have no boundary key for the column (which means the key
1003 * we deduced NOT NULL from is an inequality key that constrains the other
1004 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
1005 * use as a boundary key. If we didn't do this, we might find ourselves
1006 * traversing a lot of null entries at the start of the scan.
1007 *
1008 * In this loop, row-comparison keys are treated the same as keys on their
1009 * first (leftmost) columns. We'll add all lower-order columns of the row
1010 * comparison that were marked required during preprocessing below.
1011 *
1012 * _bt_advance_array_keys needs to know exactly how we'll reposition the
1013 * scan (should it opt to schedule another primitive index scan). It is
1014 * critical that primscans only be scheduled when they'll definitely make
1015 * some useful progress. _bt_advance_array_keys does this by calling
1016 * _bt_checkkeys routines that report whether a tuple is past the end of
1017 * matches for the scan's keys (given the scan's current array elements).
1018 * If the page's final tuple is "after the end of matches" for a scan that
1019 * uses the *opposite* scan direction, then it must follow that it's also
1020 * "before the start of matches" for the actual current scan direction.
1021 * It is therefore essential that all of our initial positioning rules are
1022 * symmetric with _bt_checkkeys's corresponding continuescan=false rule.
1023 * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
1024 * need to be kept in sync.
1025 *----------
1026 */
1027 if (so->numberOfKeys > 0)
1028 {
1029 AttrNumber curattr;
1030 ScanKey bkey;
1031 ScanKey impliesNN;
1032 ScanKey cur;
1033
1034 /*
1035 * bkey will be set to the key that preprocessing left behind as the
1036 * boundary key for this attribute, in this scan direction (if any)
1037 */
1038 cur = so->keyData;
1039 curattr = 1;
1040 bkey = NULL;
1041 /* Also remember any scankey that implies a NOT NULL constraint */
1042 impliesNN = NULL;
1043
1044 /*
1045 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
1046 * pass to handle after-last-key processing. Actual exit from the
1047 * loop is at one of the "break" statements below.
1048 */
1049 for (int i = 0;; cur++, i++)
1050 {
1051 if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1052 {
1053 /* Done looking for the curattr boundary key */
1054 Assert(bkey == NULL ||
1055 (bkey->sk_attno == curattr &&
1056 (bkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
1057 Assert(impliesNN == NULL ||
1058 (impliesNN->sk_attno == curattr &&
1059 (impliesNN->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
1060
1061 /*
1062 * If this is a scan key for a skip array whose current
1063 * element is MINVAL, choose low_compare (when scanning
1064 * backwards it'll be MAXVAL, and we'll choose high_compare).
1065 *
1066 * Note: if the array's low_compare key makes 'bkey' NULL,
1067 * then we behave as if the array's first element is -inf,
1068 * except when !array->null_elem implies a usable NOT NULL
1069 * constraint.
1070 */
1071 if (bkey != NULL &&
1072 (bkey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))
1073 {
1074 int ikey = bkey - so->keyData;
1075 ScanKey skipequalitykey = bkey;
1076 BTArrayKeyInfo *array = NULL;
1077
1078 for (int arridx = 0; arridx < so->numArrayKeys; arridx++)
1079 {
1080 array = &so->arrayKeys[arridx];
1081 if (array->scan_key == ikey)
1082 break;
1083 }
1084
1085 if (ScanDirectionIsForward(dir))
1086 {
1087 Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL));
1088 bkey = array->low_compare;
1089 }
1090 else
1091 {
1092 Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL));
1093 bkey = array->high_compare;
1094 }
1095
1096 Assert(bkey == NULL ||
1097 bkey->sk_attno == skipequalitykey->sk_attno);
1098
1099 if (!array->null_elem)
1100 impliesNN = skipequalitykey;
1101 else
1102 Assert(bkey == NULL && impliesNN == NULL);
1103 }
1104
1105 /*
1106 * If we didn't find a usable boundary key, see if we can
1107 * deduce a NOT NULL key
1108 */
1109 if (bkey == NULL && impliesNN != NULL &&
1110 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1111 ScanDirectionIsForward(dir) :
1112 ScanDirectionIsBackward(dir)))
1113 {
1114 /* Final startKeys[] entry will be deduced NOT NULL key */
1115 bkey = &notnullkey;
1118 (impliesNN->sk_flags &
1120 curattr,
1123 InvalidOid,
1124 InvalidOid,
1125 InvalidOid,
1126 (Datum) 0);
1127 }
1128
1129 /*
1130 * If preprocessing didn't leave a usable boundary key, quit;
1131 * else save the boundary key pointer in startKeys[]
1132 */
1133 if (bkey == NULL)
1134 break;
1135 startKeys[keysz++] = bkey;
1136
1137 /*
1138 * We can only consider adding more boundary keys when the one
1139 * that we just chose to add uses either the = or >= strategy
1140 * (during backwards scans we can only do so when the key that
1141 * we just added to startKeys[] uses the = or <= strategy)
1142 */
1143 strat_total = bkey->sk_strategy;
1144 if (strat_total == BTGreaterStrategyNumber ||
1145 strat_total == BTLessStrategyNumber)
1146 break;
1147
1148 /*
1149 * If the key that we just added to startKeys[] is a skip
1150 * array = key whose current element is marked NEXT or PRIOR,
1151 * make strat_total > or < (and stop adding boundary keys).
1152 * This can only happen with opclasses that lack skip support.
1153 */
1154 if (bkey->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))
1155 {
1156 Assert(bkey->sk_flags & SK_BT_SKIP);
1157 Assert(strat_total == BTEqualStrategyNumber);
1158
1159 if (ScanDirectionIsForward(dir))
1160 {
1161 Assert(!(bkey->sk_flags & SK_BT_PRIOR));
1162 strat_total = BTGreaterStrategyNumber;
1163 }
1164 else
1165 {
1166 Assert(!(bkey->sk_flags & SK_BT_NEXT));
1167 strat_total = BTLessStrategyNumber;
1168 }
1169
1170 /*
1171 * We're done. We'll never find an exact = match for a
1172 * NEXT or PRIOR sentinel sk_argument value. There's no
1173 * sense in trying to add more keys to startKeys[].
1174 */
1175 break;
1176 }
1177
1178 /*
1179 * Done if that was the last scan key output by preprocessing.
1180 * Also done if we've now examined all keys marked required.
1181 */
1182 if (i >= so->numberOfKeys ||
1183 !(cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
1184 break;
1185
1186 /*
1187 * Reset for next attr.
1188 */
1189 Assert(cur->sk_attno == curattr + 1);
1190 curattr = cur->sk_attno;
1191 bkey = NULL;
1192 impliesNN = NULL;
1193 }
1194
1195 /*
1196 * If we've located the starting boundary key for curattr, we have
1197 * no interest in curattr's other required key
1198 */
1199 if (bkey != NULL)
1200 continue;
1201
1202 /*
1203 * Is this key the starting boundary key for curattr?
1204 *
1205 * If not, does it imply a NOT NULL constraint? (Because
1206 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1207 * *any* inequality key works for that; we need not test.)
1208 */
1209 switch (cur->sk_strategy)
1210 {
1211 case BTLessStrategyNumber:
1212 case BTLessEqualStrategyNumber:
1213 if (ScanDirectionIsBackward(dir))
1214 bkey = cur;
1215 else if (impliesNN == NULL)
1216 impliesNN = cur;
1217 break;
1218 case BTEqualStrategyNumber:
1219 bkey = cur;
1220 break;
1221 case BTGreaterEqualStrategyNumber:
1222 case BTGreaterStrategyNumber:
1223 if (ScanDirectionIsForward(dir))
1224 bkey = cur;
1225 else if (impliesNN == NULL)
1226 impliesNN = cur;
1227 break;
1228 }
1229 }
1230 }
1231
1232 /*
1233 * If we found no usable boundary keys, we have to start from one end of
1234 * the tree. Walk down that edge to the first or last key, and scan from
1235 * there.
1236 *
1237 * Note: calls _bt_readfirstpage for us, which releases the parallel scan.
1238 */
1239 if (keysz == 0)
1240 return _bt_endpoint(scan, dir);
1241
1242 /*
1243 * We want to start the scan somewhere within the index. Set up an
1244 * insertion scankey we can use to search for the boundary point we
1245 * identified above. The insertion scankey is built using the keys
1246 * identified by startKeys[]. (Remaining insertion scankey fields are
1247 * initialized after initial-positioning scan keys are finalized.)
1248 */
1249 Assert(keysz <= INDEX_MAX_KEYS);
1250 for (int i = 0; i < keysz; i++)
1251 {
1252 ScanKey bkey = startKeys[i];
1253
1254 Assert(bkey->sk_attno == i + 1);
1255
1256 if (bkey->sk_flags & SK_ROW_HEADER)
1257 {
1258 /*
1259 * Row comparison header: look to the first row member instead
1260 */
1261 ScanKey subkey = (ScanKey) DatumGetPointer(bkey->sk_argument);
1262 bool loosen_strat = false,
1263 tighten_strat = false;
1264
1265 /*
1266 * Cannot be a NULL in the first row member: _bt_preprocess_keys
1267 * would've marked the qual as unsatisfiable, preventing us from
1268 * ever getting this far
1269 */
1270 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1271 Assert(subkey->sk_attno == bkey->sk_attno);
1272 Assert(!(subkey->sk_flags & SK_ISNULL));
1273
1274 /*
1275 * This is either a > or >= key (during backwards scans it is
1276 * either < or <=) that was marked required during preprocessing.
1277 * Later so->keyData[] keys can't have been marked required, so
1278 * our row compare header key must be the final startKeys[] entry.
1279 */
1281 Assert(subkey->sk_strategy == bkey->sk_strategy);
1282 Assert(subkey->sk_strategy == strat_total);
1283 Assert(i == keysz - 1);
1284
1285 /*
1286 * The member scankeys are already in insertion format (ie, they
1287 * have sk_func = 3-way-comparison function)
1288 */
1289 memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1290
1291 /*
1292 * Now look to later row compare members.
1293 *
1294 * If there's an "index attribute gap" between two row compare
1295 * members, the second member won't have been marked required, and
1296 * so can't be used as a starting boundary key here. The part of
1297 * the row comparison that we do still use has to be treated as a
1298 * ">=" or "<=" condition. For example, a qual "(a, c) > (1, 42)"
1299 * with an omitted intervening index attribute "b" will use an
1300 * insertion scan key "a >= 1". Even the first "a = 1" tuple on
1301 * the leaf level might satisfy the row compare qual.
1302 *
1303 * We're able to use a _more_ restrictive strategy when we reach a
1304 * NULL row compare member, since they're always unsatisfiable.
1305 * For example, a qual "(a, b, c) >= (1, NULL, 77)" will use an
1306 * insertion scan key "a > 1". All tuples where "a = 1" cannot
1307 * possibly satisfy the row compare qual, so this is safe.
1308 */
1309 Assert(!(subkey->sk_flags & SK_ROW_END));
1310 for (;;)
1311 {
1312 subkey++;
1313 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1314
1315 if (subkey->sk_flags & SK_ISNULL)
1316 {
1317 /*
1318 * NULL member key, can only use earlier keys.
1319 *
1320 * We deliberately avoid checking if this key is marked
1321 * required. All earlier keys are required, and this key
1322 * is unsatisfiable either way, so we can't miss anything.
1323 */
1324 tighten_strat = true;
1325 break;
1326 }
1327
1328 if (!(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
1329 {
1330 /* nonrequired member key, can only use earlier keys */
1331 loosen_strat = true;
1332 break;
1333 }
1334
1335 Assert(subkey->sk_attno == keysz + 1);
1336 Assert(subkey->sk_strategy == bkey->sk_strategy);
1337 Assert(keysz < INDEX_MAX_KEYS);
1338
1339 memcpy(inskey.scankeys + keysz, subkey, sizeof(ScanKeyData));
1340 keysz++;
1341
1342 if (subkey->sk_flags & SK_ROW_END)
1343 break;
1344 }
1345 Assert(!(loosen_strat && tighten_strat));
1346 if (loosen_strat)
1347 {
1348 /* Use less restrictive strategy (and fewer member keys) */
1349 switch (strat_total)
1350 {
1351 case BTLessStrategyNumber:
1352 strat_total = BTLessEqualStrategyNumber;
1353 break;
1354 case BTGreaterStrategyNumber:
1355 strat_total = BTGreaterEqualStrategyNumber;
1356 break;
1357 }
1358 }
1359 if (tighten_strat)
1360 {
1361 /* Use more restrictive strategy (and fewer member keys) */
1362 switch (strat_total)
1363 {
1364 case BTLessEqualStrategyNumber:
1365 strat_total = BTLessStrategyNumber;
1366 break;
1367 case BTGreaterEqualStrategyNumber:
1368 strat_total = BTGreaterStrategyNumber;
1369 break;
1370 }
1371 }
1372
1373 /* Done (row compare header key is always last startKeys[] key) */
1374 break;
1375 }
1376
1377 /*
1378 * Ordinary comparison key/search-style key.
1379 *
1380 * Transform the search-style scan key to an insertion scan key by
1381 * replacing the sk_func with the appropriate btree 3-way-comparison
1382 * function.
1383 *
1384 * If scankey operator is not a cross-type comparison, we can use the
1385 * cached comparison function; otherwise gotta look it up in the
1386 * catalogs. (That can't lead to infinite recursion, since no
1387 * indexscan initiated by syscache lookup will use cross-data-type
1388 * operators.)
1389 *
1390 * We support the convention that sk_subtype == InvalidOid means the
1391 * opclass input type; this hack simplifies life for ScanKeyInit().
1392 */
1393 if (bkey->sk_subtype == rel->rd_opcintype[i] ||
1394 bkey->sk_subtype == InvalidOid)
1395 {
1396 FmgrInfo *procinfo;
1397
1398 procinfo = index_getprocinfo(rel, bkey->sk_attno, BTORDER_PROC);
1399 ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1400 bkey->sk_flags,
1401 bkey->sk_attno,
1402 InvalidStrategy,
1403 bkey->sk_subtype,
1404 bkey->sk_collation,
1405 procinfo,
1406 bkey->sk_argument);
1407 }
1408 else
1409 {
1410 RegProcedure cmp_proc;
1411
1412 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1413 rel->rd_opcintype[i],
1414 bkey->sk_subtype, BTORDER_PROC);
1415 if (!RegProcedureIsValid(cmp_proc))
1416 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1417 BTORDER_PROC, rel->rd_opcintype[i], bkey->sk_subtype,
1418 bkey->sk_attno, RelationGetRelationName(rel));
1419 ScanKeyEntryInitialize(inskey.scankeys + i,
1420 bkey->sk_flags,
1421 bkey->sk_attno,
1422 InvalidStrategy,
1423 bkey->sk_subtype,
1424 bkey->sk_collation,
1425 cmp_proc,
1426 bkey->sk_argument);
1427 }
1428 }
1429
1430 /*----------
1431 * Examine the selected initial-positioning strategy to determine exactly
1432 * where we need to start the scan, and set flag variables to control the
1433 * initial descent by _bt_search (and our _bt_binsrch call for the leaf
1434 * page _bt_search returns).
1435 *----------
1436 */
1437 _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1438 inskey.anynullkeys = false; /* unused */
1439 inskey.scantid = NULL;
1440 inskey.keysz = keysz;
1441 switch (strat_total)
1442 {
1443 case BTLessStrategyNumber:
1444
1445 inskey.nextkey = false;
1446 inskey.backward = true;
1447 break;
1448
1449 case BTLessEqualStrategyNumber:
1450
1451 inskey.nextkey = true;
1452 inskey.backward = true;
1453 break;
1454
1455 case BTEqualStrategyNumber:
1456
1457 /*
1458 * If a backward scan was specified, need to start with last equal
1459 * item not first one.
1460 */
1461 if (ScanDirectionIsBackward(dir))
1462 {
1463 /*
1464 * This is the same as the <= strategy
1465 */
1466 inskey.nextkey = true;
1467 inskey.backward = true;
1468 }
1469 else
1470 {
1471 /*
1472 * This is the same as the >= strategy
1473 */
1474 inskey.nextkey = false;
1475 inskey.backward = false;
1476 }
1477 break;
1478
1479 case BTGreaterEqualStrategyNumber:
1480
1481 /*
1482 * Find first item >= scankey
1483 */
1484 inskey.nextkey = false;
1485 inskey.backward = false;
1486 break;
1487
1488 case BTGreaterStrategyNumber:
1489
1490 /*
1491 * Find first item > scankey
1492 */
1493 inskey.nextkey = true;
1494 inskey.backward = false;
1495 break;
1496
1497 default:
1498 /* can't get here, but keep compiler quiet */
1499 elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1500 return false;
1501 }
1502
1503 /*
1504 * Use the manufactured insertion scan key to descend the tree and
1505 * position ourselves on the target leaf page.
1506 */
1507 Assert(ScanDirectionIsBackward(dir) == inskey.backward);
1508 stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1509
1510 /* don't need to keep the stack around... */
1511 _bt_freestack(stack);
1512
1513 if (!BufferIsValid(so->currPos.buf))
1514 {
1515 Assert(!so->needPrimScan);
1516
1517 /*
1518 * We only get here if the index is completely empty. Lock relation
1519 * because nothing finer to lock exists. Without a buffer lock, it's
1520 * possible for another transaction to insert data between
1521 * _bt_search() and PredicateLockRelation(). We have to try again
1522 * after taking the relation-level predicate lock, to close a narrow
1523 * window where we wouldn't scan concurrently inserted tuples, but the
1524 * writer wouldn't see our predicate lock.
1525 */
1526 if (IsolationIsSerializable())
1527 {
1528 PredicateLockRelation(rel, scan->xs_snapshot);
1529 stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1530 _bt_freestack(stack);
1531 }
1532
1533 if (!BufferIsValid(so->currPos.buf))
1534 {
1535 _bt_parallel_done(scan);
1536 return false;
1537 }
1538 }
1539
1540 /* position to the precise item on the page */
1541 offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);
1542
1543 /*
1544 * Now load data from the first page of the scan (usually the page
1545 * currently in so->currPos.buf).
1546 *
1547 * If inskey.nextkey = false and inskey.backward = false, offnum is
1548 * positioned at the first non-pivot tuple >= inskey.scankeys.
1549 *
1550 * If inskey.nextkey = false and inskey.backward = true, offnum is
1551 * positioned at the last non-pivot tuple < inskey.scankeys.
1552 *
1553 * If inskey.nextkey = true and inskey.backward = false, offnum is
1554 * positioned at the first non-pivot tuple > inskey.scankeys.
1555 *
1556 * If inskey.nextkey = true and inskey.backward = true, offnum is
1557 * positioned at the last non-pivot tuple <= inskey.scankeys.
1558 *
1559 * It's possible that _bt_binsrch returned an offnum that is out of bounds
1560 * for the page. For example, when inskey is both < the leaf page's high
1561 * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
1562 */
1563 if (!_bt_readfirstpage(scan, offnum, dir))
1564 return false;
1565
1566 _bt_returnitem(scan, so);
1567 return true;
1568}
int16 AttrNumber
Definition: attnum.h:21
#define RegProcedureIsValid(p)
Definition: c.h:790
regproc RegProcedure
Definition: c.h:669
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:917
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition: nbtpage.c:740
void _bt_preprocess_keys(IndexScanDesc scan)
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
Definition: nbtreadpage.c:537
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
Definition: nbtree.c:885
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:1050
#define BTORDER_PROC
Definition: nbtree.h:717
#define SK_BT_PRIOR
Definition: nbtree.h:1112
#define SK_BT_NEXT
Definition: nbtree.h:1111
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:1021
#define P_NONE
Definition: nbtree.h:213
#define SK_BT_REQBKWD
Definition: nbtree.h:1105
#define SK_BT_MAXVAL
Definition: nbtree.h:1110
#define BT_READ
Definition: nbtree.h:730
#define SK_BT_MINVAL
Definition: nbtree.h:1109
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:1097
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
Definition: nbtsearch.c:1839
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:338
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2174
static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
Definition: nbtsearch.c:1746
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition: nbtsearch.c:97
static void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
Definition: nbtsearch.c:1621
#define INDEX_MAX_KEYS
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:705
#define InvalidOid
Definition: postgres_ext.h:37
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2574
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:50
#define SK_ROW_HEADER
Definition: skey.h:117
#define SK_ROW_MEMBER
Definition: skey.h:118
#define SK_SEARCHNOTNULL
Definition: skey.h:122
#define SK_ROW_END
Definition: skey.h:119
ScanKeyData * ScanKey
Definition: skey.h:75
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
#define InvalidStrategy
Definition: stratnum.h:24
#define BTLessStrategyNumber
Definition: stratnum.h:29
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
ScanKey high_compare
Definition: nbtree.h:1050
ScanKey low_compare
Definition: nbtree.h:1049
bool null_elem
Definition: nbtree.h:1047
bool needPrimScan
Definition: nbtree.h:1063
BTArrayKeyInfo * arrayKeys
Definition: nbtree.h:1066
BTScanPosData currPos
Definition: nbtree.h:1093
ScanKey keyData
Definition: nbtree.h:1058
Buffer buf
Definition: nbtree.h:964
Definition: fmgr.h:57
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:193
struct IndexScanInstrumentation * instrument
Definition: relscan.h:161
Relation indexRelation
Definition: relscan.h:139
struct SnapshotData * xs_snapshot
Definition: relscan.h:140
Oid sk_subtype
Definition: skey.h:69
StrategyNumber sk_strategy
Definition: skey.h:68
#define IsolationIsSerializable()
Definition: xact.h:53

References _bt_binsrch(), _bt_endpoint(), _bt_freestack(), _bt_metaversion(), _bt_parallel_done(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readfirstpage(), _bt_readnextpage(), _bt_returnitem(), _bt_search(), _bt_start_array_keys(), BTScanOpaqueData::arrayKeys, Assert(), BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosIsValid, BTScanPosData::buf, BufferIsValid(), cur, BTScanOpaqueData::currPos, DatumGetPointer(), elog, ERROR, get_opfamily_proc(), BTArrayKeyInfo::high_compare, i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, IndexScanDescData::instrument, InvalidBlockNumber, InvalidOid, InvalidStrategy, IsolationIsSerializable, BTScanOpaqueData::keyData, BTArrayKeyInfo::low_compare, BTScanOpaqueData::needPrimScan, IndexScanInstrumentation::nsearches, BTArrayKeyInfo::null_elem, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numberOfKeys, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, BTArrayKeyInfo::scan_key, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_MAXVAL, SK_BT_MINVAL, SK_BT_NEXT, SK_BT_NULLS_FIRST, SK_BT_PRIOR, SK_BT_REQBKWD, SK_BT_REQFWD, SK_BT_SKIP, ScanKeyData::sk_collation, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, ScanKeyData::sk_subtype, and IndexScanDescData::xs_snapshot.

Referenced by btgetbitmap(), and btgettuple().
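
As a rough illustration of how _bt_first() is driven (the real logic lives in btgettuple()/btgetbitmap(); this is a simplified sketch, not the verbatim caller): the first call in a given direction positions the scan, and later calls continue from the saved position via _bt_next().

    bool        res;

    if (!BTScanPosIsValid(so->currPos))
        res = _bt_first(scan, dir);     /* establish the starting position */
    else
        res = _bt_next(scan, dir);      /* continue from so->currPos */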

◆ _bt_form_posting()

IndexTuple _bt_form_posting ( IndexTuple  base,
const ItemPointerData *  htids,
int  nhtids 
)

Definition at line 862 of file nbtdedup.c.

863{
864 uint32 keysize,
865 newsize;
866 IndexTuple itup;
867
868 if (BTreeTupleIsPosting(base))
869 keysize = BTreeTupleGetPostingOffset(base);
870 else
871 keysize = IndexTupleSize(base);
872
873 Assert(ItemPointerIsValid(htids));
874 Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
875 Assert(keysize == MAXALIGN(keysize));
876
877 /* Determine final size of new tuple */
878 if (nhtids > 1)
879 newsize = MAXALIGN(keysize +
880 nhtids * sizeof(ItemPointerData));
881 else
882 newsize = keysize;
883
884 Assert(newsize <= INDEX_SIZE_MASK);
885 Assert(newsize == MAXALIGN(newsize));
886
887 /* Allocate memory using palloc0() (matches index_form_tuple()) */
888 itup = palloc0(newsize);
889 memcpy(itup, base, keysize);
890 itup->t_info &= ~INDEX_SIZE_MASK;
891 itup->t_info |= newsize;
892 if (nhtids > 1)
893 {
894 /* Form posting list tuple */
895 BTreeTupleSetPosting(itup, nhtids, keysize);
896 memcpy(BTreeTupleGetPosting(itup), htids,
897 sizeof(ItemPointerData) * nhtids);
898 Assert(_bt_posting_valid(itup));
899 }
900 else
901 {
902 /* Form standard non-pivot tuple */
903 itup->t_info &= ~INDEX_ALT_TID_MASK;
904 ItemPointerCopy(htids, &itup->t_tid);
905 Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
906 }
907
908 return itup;
909}
#define PG_UINT16_MAX
Definition: c.h:606
static void ItemPointerCopy(const ItemPointerData *fromPointer, ItemPointerData *toPointer)
Definition: itemptr.h:172
static bool ItemPointerIsValid(const ItemPointerData *pointer)
Definition: itemptr.h:83
void * palloc0(Size size)
Definition: mcxt.c:1395
static void BTreeTupleSetPosting(IndexTuple itup, uint16 nhtids, int postingoffset)
Definition: nbtree.h:505
unsigned short t_info
Definition: itup.h:49

References Assert(), BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTreeTupleSetPosting(), INDEX_SIZE_MASK, IndexTupleSize(), ItemPointerCopy(), ItemPointerIsValid(), MAXALIGN, palloc0(), PG_UINT16_MAX, IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_dedup_finish_pending(), _bt_sort_dedup_finish_pending(), and bt_posting_plain_tuple().
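
A small, hypothetical usage sketch (block and offset numbers are made up): build a posting list tuple representing two heap TIDs that share the key values of base. The htids array must already be sorted in ascending TID order.

    ItemPointerData htids[2];
    IndexTuple      posting;

    ItemPointerSet(&htids[0], (BlockNumber) 10, (OffsetNumber) 1);
    ItemPointerSet(&htids[1], (BlockNumber) 10, (OffsetNumber) 7);

    posting = _bt_form_posting(base, htids, 2);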

◆ _bt_freestack()

void _bt_freestack ( BTStack  stack)

Definition at line 151 of file nbtutils.c.

152{
153 BTStack ostack;
154
155 while (stack != NULL)
156 {
157 ostack = stack;
158 stack = stack->bts_parent;
159 pfree(ostack);
160 }
161}
struct BTStackData * bts_parent
Definition: nbtree.h:747

References BTStackData::bts_parent, and pfree().

Referenced by _bt_doinsert(), _bt_first(), and bt_rootdescend().
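
The typical pattern, as seen in _bt_first() above: the stack returned by _bt_search() is freed as soon as the caller no longer needs the parent-page pointers.

    stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);

    /* don't need to keep the stack around... */
    _bt_freestack(stack);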

◆ _bt_get_endpoint()

Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost 
)

Definition at line 2091 of file nbtsearch.c.

2092{
2093 Buffer buf;
2094 Page page;
2095 BTPageOpaque opaque;
2096 OffsetNumber offnum;
2097 BlockNumber blkno;
2098 IndexTuple itup;
2099
2100 /*
2101 * If we are looking for a leaf page, okay to descend from fast root;
2102 * otherwise better descend from true root. (There is no point in being
2103 * smarter about intermediate levels.)
2104 */
2105 if (level == 0)
2106 buf = _bt_getroot(rel, NULL, BT_READ);
2107 else
2108 buf = _bt_gettrueroot(rel);
2109
2110 if (!BufferIsValid(buf))
2111 return InvalidBuffer;
2112
2113 page = BufferGetPage(buf);
2114 opaque = BTPageGetOpaque(page);
2115
2116 for (;;)
2117 {
2118 /*
2119 * If we landed on a deleted page, step right to find a live page
2120 * (there must be one). Also, if we want the rightmost page, step
2121 * right if needed to get to it (this could happen if the page split
2122 * since we obtained a pointer to it).
2123 */
2124 while (P_IGNORE(opaque) ||
2125 (rightmost && !P_RIGHTMOST(opaque)))
2126 {
2127 blkno = opaque->btpo_next;
2128 if (blkno == P_NONE)
2129 elog(ERROR, "fell off the end of index \"%s\"",
2130 RelationGetRelationName(rel));
2131 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2132 page = BufferGetPage(buf);
2133 opaque = BTPageGetOpaque(page);
2134 }
2135
2136 /* Done? */
2137 if (opaque->btpo_level == level)
2138 break;
2139 if (opaque->btpo_level < level)
2140 ereport(ERROR,
2141 (errcode(ERRCODE_INDEX_CORRUPTED),
2142 errmsg_internal("btree level %u not found in index \"%s\"",
2143 level, RelationGetRelationName(rel))));
2144
2145 /* Descend to leftmost or rightmost child page */
2146 if (rightmost)
2147 offnum = PageGetMaxOffsetNumber(page);
2148 else
2149 offnum = P_FIRSTDATAKEY(opaque);
2150
2151 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2152 blkno = BTreeTupleGetDownLink(itup);
2153
2154 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2155 page = BufferGetPage(buf);
2156 opaque = BTPageGetOpaque(page);
2157 }
2158
2159 return buf;
2160}
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1004
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:581
Buffer _bt_getroot(Relation rel, Relation heaprel, int access)
Definition: nbtpage.c:345
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:557
uint32 btpo_level
Definition: nbtree.h:67

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), buf, BufferGetPage(), BufferIsValid(), elog, ereport, errcode(), errmsg_internal(), ERROR, InvalidBuffer, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and RelationGetRelationName.

Referenced by _bt_endpoint(), and _bt_insert_parent().
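
A minimal sketch of the common case (compare _bt_endpoint()): level 0 with rightmost=false lands on the leftmost leaf page, read-locked; an invalid buffer means the index is completely empty.

    Buffer      buf;

    buf = _bt_get_endpoint(rel, 0, false);  /* leftmost leaf page */
    if (!BufferIsValid(buf))
        return false;                       /* empty index */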

◆ _bt_getbuf()

Buffer _bt_getbuf ( Relation  rel,
BlockNumber  blkno,
int  access 
)

Definition at line 846 of file nbtpage.c.

847{
848 Buffer buf;
849
850 Assert(BlockNumberIsValid(blkno));
851
852 /* Read an existing block of the relation */
853 buf = ReadBuffer(rel, blkno);
854 _bt_lockbuf(rel, buf, access);
855 _bt_checkpage(rel, buf);
856
857 return buf;
858}
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:798
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1040
short access
Definition: preproc-type.c:36

References _bt_checkpage(), _bt_lockbuf(), Assert(), BlockNumberIsValid(), buf, and ReadBuffer().

Referenced by _bt_finish_split(), _bt_getroot(), _bt_getrootheight(), _bt_getstackbuf(), _bt_gettrueroot(), _bt_insertonpg(), _bt_killitems(), _bt_leftsib_splitflag(), _bt_lock_and_validate_left(), _bt_metaversion(), _bt_moveright(), _bt_newlevel(), _bt_pagedel(), _bt_readnextpage(), _bt_rightsib_halfdeadflag(), _bt_set_cleanup_info(), _bt_split(), _bt_unlink_halfdead_page(), and _bt_vacuum_needs_cleanup().
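
A hedged sketch of the standard pin/lock discipline: every _bt_getbuf() is paired with a _bt_relbuf() (or handed off with _bt_relandgetbuf()) once the caller is done with the page.

    Buffer      buf;
    Page        page;

    buf = _bt_getbuf(rel, blkno, BT_READ);  /* pin + read lock */
    page = BufferGetPage(buf);
    /* ... examine the page ... */
    _bt_relbuf(rel, buf);                   /* unlock + unpin */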

◆ _bt_getroot()

Buffer _bt_getroot ( Relation  rel,
Relation  heaprel,
int  access 
)

Definition at line 345 of file nbtpage.c.

346{
347 Buffer metabuf;
348 Buffer rootbuf;
349 Page rootpage;
350 BTPageOpaque rootopaque;
351 BlockNumber rootblkno;
352 uint32 rootlevel;
353 BTMetaPageData *metad;
354
355 Assert(access == BT_READ || heaprel != NULL);
356
357 /*
358 * Try to use previously-cached metapage data to find the root. This
359 * normally saves one buffer access per index search, which is a very
360 * helpful savings in bufmgr traffic and hence contention.
361 */
362 if (rel->rd_amcache != NULL)
363 {
364 metad = (BTMetaPageData *) rel->rd_amcache;
365 /* We shouldn't have cached it if any of these fail */
366 Assert(metad->btm_magic == BTREE_MAGIC);
367 Assert(metad->btm_version >= BTREE_MIN_VERSION);
368 Assert(metad->btm_version <= BTREE_VERSION);
369 Assert(!metad->btm_allequalimage ||
370 metad->btm_version > BTREE_NOVAC_VERSION);
371 Assert(metad->btm_root != P_NONE);
372
373 rootblkno = metad->btm_fastroot;
374 Assert(rootblkno != P_NONE);
375 rootlevel = metad->btm_fastlevel;
376
377 rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
378 rootpage = BufferGetPage(rootbuf);
379 rootopaque = BTPageGetOpaque(rootpage);
380
381 /*
382 * Since the cache might be stale, we check the page more carefully
383 * here than normal. We *must* check that it's not deleted. If it's
384 * not alone on its level, then we reject too --- this may be overly
385 * paranoid but better safe than sorry. Note we don't check P_ISROOT,
386 * because that's not set in a "fast root".
387 */
388 if (!P_IGNORE(rootopaque) &&
389 rootopaque->btpo_level == rootlevel &&
390 P_LEFTMOST(rootopaque) &&
391 P_RIGHTMOST(rootopaque))
392 {
393 /* OK, accept cached page as the root */
394 return rootbuf;
395 }
396 _bt_relbuf(rel, rootbuf);
397 /* Cache is stale, throw it away */
398 if (rel->rd_amcache)
399 pfree(rel->rd_amcache);
400 rel->rd_amcache = NULL;
401 }
402
403 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
404 metad = _bt_getmeta(rel, metabuf);
405
406 /* if no root page initialized yet, do it */
407 if (metad->btm_root == P_NONE)
408 {
409 Page metapg;
410
411 /* If access = BT_READ, caller doesn't want us to create root yet */
412 if (access == BT_READ)
413 {
414 _bt_relbuf(rel, metabuf);
415 return InvalidBuffer;
416 }
417
418 /* trade in our read lock for a write lock */
419 _bt_unlockbuf(rel, metabuf);
420 _bt_lockbuf(rel, metabuf, BT_WRITE);
421
422 /*
423 * Race condition: if someone else initialized the metadata between
424 * the time we released the read lock and acquired the write lock, we
425 * must avoid doing it again.
426 */
427 if (metad->btm_root != P_NONE)
428 {
429 /*
430 * Metadata initialized by someone else. In order to guarantee no
431 * deadlocks, we have to release the metadata page and start all
432 * over again. (Is that really true? But it's hardly worth trying
433 * to optimize this case.)
434 */
435 _bt_relbuf(rel, metabuf);
436 return _bt_getroot(rel, heaprel, access);
437 }
438
439 /*
440 * Get, initialize, write, and leave a lock of the appropriate type on
441 * the new root page. Since this is the first page in the tree, it's
442 * a leaf as well as the root.
443 */
444 rootbuf = _bt_allocbuf(rel, heaprel);
445 rootblkno = BufferGetBlockNumber(rootbuf);
446 rootpage = BufferGetPage(rootbuf);
447 rootopaque = BTPageGetOpaque(rootpage);
448 rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
449 rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
450 rootopaque->btpo_level = 0;
451 rootopaque->btpo_cycleid = 0;
452 /* Get raw page pointer for metapage */
453 metapg = BufferGetPage(metabuf);
454
455 /* NO ELOG(ERROR) till meta is updated */
456 START_CRIT_SECTION();
457
458 /* upgrade metapage if needed */
459 if (metad->btm_version < BTREE_NOVAC_VERSION)
460 _bt_upgrademetapage(metapg);
461
462 metad->btm_root = rootblkno;
463 metad->btm_level = 0;
464 metad->btm_fastroot = rootblkno;
465 metad->btm_fastlevel = 0;
466 metad->btm_last_cleanup_num_delpages = 0;
467 metad->btm_last_cleanup_num_heap_tuples = -1.0;
468
469 MarkBufferDirty(rootbuf);
470 MarkBufferDirty(metabuf);
471
472 /* XLOG stuff */
473 if (RelationNeedsWAL(rel))
474 {
475 xl_btree_newroot xlrec;
476 XLogRecPtr recptr;
477 xl_btree_metadata md;
478
479 XLogBeginInsert();
480 XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
481 XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
482
483 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
484 md.version = metad->btm_version;
485 md.root = rootblkno;
486 md.level = 0;
487 md.fastroot = rootblkno;
488 md.fastlevel = 0;
489 md.last_cleanup_num_delpages = 0;
490 md.allequalimage = metad->btm_allequalimage;
491
492 XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));
493
494 xlrec.rootblk = rootblkno;
495 xlrec.level = 0;
496
497 XLogRegisterData(&xlrec, SizeOfBtreeNewroot);
498
499 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
500
501 PageSetLSN(rootpage, recptr);
502 PageSetLSN(metapg, recptr);
503 }
504
505 END_CRIT_SECTION();
506
507 /*
508 * swap root write lock for read lock. There is no danger of anyone
509 * else accessing the new root page while it's unlocked, since no one
510 * else knows where it is yet.
511 */
512 _bt_unlockbuf(rel, rootbuf);
513 _bt_lockbuf(rel, rootbuf, BT_READ);
514
515 /* okay, metadata is correct, release lock on it without caching */
516 _bt_relbuf(rel, metabuf);
517 }
518 else
519 {
520 rootblkno = metad->btm_fastroot;
521 Assert(rootblkno != P_NONE);
522 rootlevel = metad->btm_fastlevel;
523
524 /*
525 * Cache the metapage data for next time
526 */
527 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
528 sizeof(BTMetaPageData));
529 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
530
531 /*
532 * We are done with the metapage; arrange to release it via first
533 * _bt_relandgetbuf call
534 */
535 rootbuf = metabuf;
536
537 for (;;)
538 {
539 rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
540 rootpage = BufferGetPage(rootbuf);
541 rootopaque = BTPageGetOpaque(rootpage);
542
543 if (!P_IGNORE(rootopaque))
544 break;
545
546 /* it's dead, Jim. step right one page */
547 if (P_RIGHTMOST(rootopaque))
548 elog(ERROR, "no live root page found in index \"%s\"",
549 RelationGetRelationName(rel));
550 rootblkno = rootopaque->btpo_next;
551 }
552
553 if (rootopaque->btpo_level != rootlevel)
554 elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
555 rootblkno, RelationGetRelationName(rel),
556 rootopaque->btpo_level, rootlevel);
557 }
558
559 /*
560 * By here, we have a pin and read lock on the root page, and no lock set
561 * on the metadata page. Return the root page's buffer.
562 */
563 return rootbuf;
564}
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1229
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:108
Buffer _bt_allocbuf(Relation rel, Relation heaprel)
Definition: nbtpage.c:870
static BTMetaPageData * _bt_getmeta(Relation rel, Buffer metabuf)
Definition: nbtpage.c:143
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1071
#define BTREE_MIN_VERSION
Definition: nbtree.h:152
#define BTP_LEAF
Definition: nbtree.h:77
#define BTREE_MAGIC
Definition: nbtree.h:150
#define BTP_ROOT
Definition: nbtree.h:78
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:347
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:37
uint32 btm_last_cleanup_num_delpages
Definition: nbtree.h:115
uint32 btm_level
Definition: nbtree.h:109
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:117
BlockNumber btm_fastroot
Definition: nbtree.h:110
uint32 btm_version
Definition: nbtree.h:107
uint32 btm_magic
Definition: nbtree.h:106
bool btm_allequalimage
Definition: nbtree.h:119
uint32 btm_fastlevel
Definition: nbtree.h:111
BlockNumber btpo_prev
Definition: nbtree.h:65
void * rd_amcache
Definition: rel.h:229
MemoryContext rd_indexcxt
Definition: rel.h:204
uint32 level
Definition: nbtxlog.h:50
uint32 version
Definition: nbtxlog.h:48
bool allequalimage
Definition: nbtxlog.h:54
BlockNumber fastroot
Definition: nbtxlog.h:51
uint32 fastlevel
Definition: nbtxlog.h:52
BlockNumber root
Definition: nbtxlog.h:49
uint32 last_cleanup_num_delpages
Definition: nbtxlog.h:53
uint32 level
Definition: nbtxlog.h:344
BlockNumber rootblk
Definition: nbtxlog.h:343
#define REGBUF_WILL_INIT
Definition: xloginsert.h:34

References _bt_allocbuf(), _bt_getbuf(), _bt_getmeta(), _bt_getroot(), _bt_lockbuf(), _bt_relandgetbuf(), _bt_relbuf(), _bt_unlockbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert(), BT_READ, BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_LEAF, BTP_ROOT, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, BufferGetBlockNumber(), BufferGetPage(), elog, END_CRIT_SECTION, ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, InvalidBuffer, xl_btree_metadata::last_cleanup_num_delpages, xl_btree_metadata::level, xl_btree_newroot::level, MarkBufferDirty(), MemoryContextAlloc(), P_IGNORE, P_LEFTMOST, P_NONE, P_RIGHTMOST, PageSetLSN(), pfree(), RelationData::rd_amcache, RelationData::rd_indexcxt, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, xl_btree_metadata::root, xl_btree_newroot::rootblk, SizeOfBtreeNewroot, START_CRIT_SECTION, xl_btree_metadata::version, XLOG_BTREE_NEWROOT, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_get_endpoint(), _bt_getroot(), and _bt_search().
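
Sketch of the read-only caller pattern (compare _bt_search() and _bt_get_endpoint()): with BT_READ, heaprel may be NULL, and an invalid buffer return means no root has been created yet, i.e. the index is empty.

    Buffer      rootbuf;

    rootbuf = _bt_getroot(rel, NULL, BT_READ);
    if (!BufferIsValid(rootbuf))
        return InvalidBuffer;               /* empty index, nothing to descend */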

◆ _bt_getrootheight()

int _bt_getrootheight ( Relation  rel)

Definition at line 676 of file nbtpage.c.

677{
678 BTMetaPageData *metad;
679
680 if (rel->rd_amcache == NULL)
681 {
682 Buffer metabuf;
683
684 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
685 metad = _bt_getmeta(rel, metabuf);
686
687 /*
688 * If there's no root page yet, _bt_getroot() doesn't expect a cache
689 * to be made, so just stop here and report the index height is zero.
690 * (XXX perhaps _bt_getroot() should be changed to allow this case.)
691 */
692 if (metad->btm_root == P_NONE)
693 {
694 _bt_relbuf(rel, metabuf);
695 return 0;
696 }
697
698 /*
699 * Cache the metapage data for next time
700 */
701 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
702 sizeof(BTMetaPageData));
703 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
704 _bt_relbuf(rel, metabuf);
705 }
706
707 /* Get cached page */
708 metad = (BTMetaPageData *) rel->rd_amcache;
709 /* We shouldn't have cached it if any of these fail */
710 Assert(metad->btm_magic == BTREE_MAGIC);
711 Assert(metad->btm_version >= BTREE_MIN_VERSION);
712 Assert(metad->btm_version <= BTREE_VERSION);
713 Assert(!metad->btm_allequalimage ||
714 metad->btm_version > BTREE_NOVAC_VERSION);
715 Assert(metad->btm_fastroot != P_NONE);
716
717 return metad->btm_fastlevel;
718}

References _bt_getbuf(), _bt_getmeta(), _bt_relbuf(), Assert(), BT_READ, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, MemoryContextAlloc(), P_NONE, RelationData::rd_amcache, and RelationData::rd_indexcxt.

Referenced by _bt_insertonpg(), and btgettreeheight().
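
A minimal usage sketch (not taken from the source): a caller that only needs the tree height, assuming indexRel is a placeholder for an already-opened and locked nbtree index Relation inside a backend transaction.

    #include "access/nbtree.h"

    /* Hypothetical helper: report the B-Tree height (0 for an empty index). */
    static int
    report_btree_height(Relation indexRel)
    {
        /* Uses rd_amcache, so repeated calls avoid re-reading the metapage */
        return _bt_getrootheight(indexRel);
    }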

◆ _bt_getstackbuf()

Buffer _bt_getstackbuf ( Relation  rel,
Relation  heaprel,
BTStack  stack,
BlockNumber  child 
)

Definition at line 2335 of file nbtinsert.c.

2336{
2337 BlockNumber blkno;
 2338 OffsetNumber start;
 2339
2340 blkno = stack->bts_blkno;
2341 start = stack->bts_offset;
2342
2343 for (;;)
2344 {
2345 Buffer buf;
2346 Page page;
2347 BTPageOpaque opaque;
2348
2349 buf = _bt_getbuf(rel, blkno, BT_WRITE);
2350 page = BufferGetPage(buf);
2351 opaque = BTPageGetOpaque(page);
2352
2353 Assert(heaprel != NULL);
2354 if (P_INCOMPLETE_SPLIT(opaque))
2355 {
2356 _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
2357 continue;
2358 }
2359
2360 if (!P_IGNORE(opaque))
2361 {
2362 OffsetNumber offnum,
2363 minoff,
2364 maxoff;
2365 ItemId itemid;
2366 IndexTuple item;
2367
2368 minoff = P_FIRSTDATAKEY(opaque);
2369 maxoff = PageGetMaxOffsetNumber(page);
2370
2371 /*
2372 * start = InvalidOffsetNumber means "search the whole page". We
2373 * need this test anyway due to possibility that page has a high
2374 * key now when it didn't before.
2375 */
2376 if (start < minoff)
2377 start = minoff;
2378
2379 /*
2380 * Need this check too, to guard against possibility that page
2381 * split since we visited it originally.
2382 */
2383 if (start > maxoff)
2384 start = OffsetNumberNext(maxoff);
2385
2386 /*
2387 * These loops will check every item on the page --- but in an
2388 * order that's attuned to the probability of where it actually
2389 * is. Scan to the right first, then to the left.
2390 */
2391 for (offnum = start;
2392 offnum <= maxoff;
2393 offnum = OffsetNumberNext(offnum))
2394 {
2395 itemid = PageGetItemId(page, offnum);
2396 item = (IndexTuple) PageGetItem(page, itemid);
2397
2398 if (BTreeTupleGetDownLink(item) == child)
2399 {
2400 /* Return accurate pointer to where link is now */
2401 stack->bts_blkno = blkno;
2402 stack->bts_offset = offnum;
2403 return buf;
2404 }
2405 }
2406
2407 for (offnum = OffsetNumberPrev(start);
2408 offnum >= minoff;
2409 offnum = OffsetNumberPrev(offnum))
2410 {
2411 itemid = PageGetItemId(page, offnum);
2412 item = (IndexTuple) PageGetItem(page, itemid);
2413
2414 if (BTreeTupleGetDownLink(item) == child)
2415 {
2416 /* Return accurate pointer to where link is now */
2417 stack->bts_blkno = blkno;
2418 stack->bts_offset = offnum;
2419 return buf;
2420 }
2421 }
2422 }
2423
2424 /*
2425 * The item we're looking for moved right at least one page.
2426 *
2427 * Lehman and Yao couple/chain locks when moving right here, which we
2428 * can avoid. See nbtree/README.
2429 */
2430 if (P_RIGHTMOST(opaque))
2431 {
2432 _bt_relbuf(rel, buf);
2433 return InvalidBuffer;
2434 }
 2435 blkno = opaque->btpo_next;
 2436 start = InvalidOffsetNumber;
2437 _bt_relbuf(rel, buf);
2438 }
2439}
void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:2256
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
BlockNumber bts_blkno
Definition: nbtree.h:745
OffsetNumber bts_offset
Definition: nbtree.h:746

References _bt_finish_split(), _bt_getbuf(), _bt_relbuf(), Assert(), BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, buf, BufferGetPage(), InvalidBuffer, InvalidOffsetNumber, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and start.

Referenced by _bt_insert_parent(), and _bt_lock_subtree_parent().
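
A hedged sketch of the typical calling pattern, simplified from what an insert/split caller does: re-find the parent page whose downlink points at childblkno, treat InvalidBuffer as corruption, and release the write-locked buffer when done. rel, heaprel, stack, and childblkno are placeholders supplied by the caller.

    Buffer      pbuf;

    pbuf = _bt_getstackbuf(rel, heaprel, stack, childblkno);
    if (pbuf == InvalidBuffer)
        elog(ERROR, "failed to re-find parent key in index \"%s\"",
             RelationGetRelationName(rel));
    /* stack->bts_blkno/bts_offset now point at the downlink's location */
    /* ... modify the parent page under the write lock ... */
    _bt_relbuf(rel, pbuf);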

◆ _bt_gettrueroot()

Buffer _bt_gettrueroot ( Relation  rel)

Definition at line 581 of file nbtpage.c.

582{
583 Buffer metabuf;
584 Page metapg;
585 BTPageOpaque metaopaque;
586 Buffer rootbuf;
587 Page rootpage;
588 BTPageOpaque rootopaque;
589 BlockNumber rootblkno;
590 uint32 rootlevel;
591 BTMetaPageData *metad;
592
593 /*
594 * We don't try to use cached metapage data here, since (a) this path is
595 * not performance-critical, and (b) if we are here it suggests our cache
596 * is out-of-date anyway. In light of point (b), it's probably safest to
597 * actively flush any cached metapage info.
598 */
599 if (rel->rd_amcache)
600 pfree(rel->rd_amcache);
601 rel->rd_amcache = NULL;
602
603 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
604 metapg = BufferGetPage(metabuf);
605 metaopaque = BTPageGetOpaque(metapg);
606 metad = BTPageGetMeta(metapg);
607
608 if (!P_ISMETA(metaopaque) ||
609 metad->btm_magic != BTREE_MAGIC)
 610 ereport(ERROR,
 611 (errcode(ERRCODE_INDEX_CORRUPTED),
 612 errmsg("index \"%s\" is not a btree",
 613 RelationGetRelationName(rel))));
 614
615 if (metad->btm_version < BTREE_MIN_VERSION ||
616 metad->btm_version > BTREE_VERSION)
 617 ereport(ERROR,
 618 (errcode(ERRCODE_INDEX_CORRUPTED),
 619 errmsg("version mismatch in index \"%s\": file version %d, "
 620 "current version %d, minimal supported version %d",
 621 RelationGetRelationName(rel),
 622 metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
 623
624 /* if no root page initialized yet, fail */
625 if (metad->btm_root == P_NONE)
626 {
627 _bt_relbuf(rel, metabuf);
628 return InvalidBuffer;
629 }
630
631 rootblkno = metad->btm_root;
632 rootlevel = metad->btm_level;
633
634 /*
635 * We are done with the metapage; arrange to release it via first
636 * _bt_relandgetbuf call
637 */
638 rootbuf = metabuf;
639
640 for (;;)
641 {
642 rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
643 rootpage = BufferGetPage(rootbuf);
644 rootopaque = BTPageGetOpaque(rootpage);
645
646 if (!P_IGNORE(rootopaque))
647 break;
648
649 /* it's dead, Jim. step right one page */
650 if (P_RIGHTMOST(rootopaque))
651 elog(ERROR, "no live root page found in index \"%s\"",
653 rootblkno = rootopaque->btpo_next;
654 }
655
656 if (rootopaque->btpo_level != rootlevel)
657 elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
658 rootblkno, RelationGetRelationName(rel),
659 rootopaque->btpo_level, rootlevel);
660
661 return rootbuf;
662}
#define P_ISMETA(opaque)
Definition: nbtree.h:224

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_VERSION, BufferGetPage(), elog, ereport, errcode(), errmsg(), ERROR, InvalidBuffer, P_IGNORE, P_ISMETA, P_NONE, P_RIGHTMOST, pfree(), RelationData::rd_amcache, and RelationGetRelationName.

Referenced by _bt_get_endpoint().
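
A brief sketch of how a caller such as _bt_get_endpoint() might use this when it needs the true root rather than the fast root; the returned buffer is read-locked and must be released, and InvalidBuffer means the index has no root yet. rel is a placeholder.

    Buffer      rootbuf = _bt_gettrueroot(rel);

    if (!BufferIsValid(rootbuf))
    {
        /* empty index: no root page exists yet */
    }
    else
    {
        /* ... descend from the true root, then release it ... */
        _bt_relbuf(rel, rootbuf);
    }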

◆ _bt_initmetapage()

void _bt_initmetapage ( Page  page,
BlockNumber  rootbknum,
uint32  level,
bool  allequalimage 
)

Definition at line 68 of file nbtpage.c.

70{
71 BTMetaPageData *metad;
72 BTPageOpaque metaopaque;
73
74 _bt_pageinit(page, BLCKSZ);
75
76 metad = BTPageGetMeta(page);
 77 metad->btm_magic = BTREE_MAGIC;
 78 metad->btm_version = BTREE_VERSION;
 79 metad->btm_root = rootbknum;
 80 metad->btm_level = level;
 81 metad->btm_fastroot = rootbknum;
 82 metad->btm_fastlevel = level;
 83 metad->btm_last_cleanup_num_delpages = 0;
 84 metad->btm_last_cleanup_num_heap_tuples = -1.0;
 85 metad->btm_allequalimage = allequalimage;
86
87 metaopaque = BTPageGetOpaque(page);
88 metaopaque->btpo_flags = BTP_META;
89
90 /*
91 * Set pd_lower just past the end of the metadata. This is essential,
92 * because without doing so, metadata will be lost if xlog.c compresses
93 * the page.
94 */
95 ((PageHeader) page)->pd_lower =
96 ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
97}
PageHeaderData * PageHeader
Definition: bufpage.h:173
#define BTP_META
Definition: nbtree.h:80

References _bt_pageinit(), BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_META, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, BTREE_MAGIC, and BTREE_VERSION.

Referenced by _bt_uppershutdown(), and btbuildempty().
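
A minimal sketch in the spirit of btbuildempty(): build a metapage image for an empty index in local memory before it is written out and WAL-logged. allequalimage is a placeholder for the value the caller computed (e.g. via _bt_allequalimage()).

    Page        metapage = (Page) palloc(BLCKSZ);

    /* Empty index: no root page yet, so root block is P_NONE at level 0 */
    _bt_initmetapage(metapage, P_NONE, 0, allequalimage);
    /* caller then writes this image to block 0 (BTREE_METAPAGE) and logs it */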

◆ _bt_keep_natts_fast()

int _bt_keep_natts_fast ( Relation  rel,
IndexTuple  lastleft,
IndexTuple  firstright 
)

Definition at line 917 of file nbtutils.c.

918{
 919 TupleDesc itupdesc = RelationGetDescr(rel);
 920 int keysz = IndexRelationGetNumberOfKeyAttributes(rel);
921 int keepnatts;
922
923 keepnatts = 1;
924 for (int attnum = 1; attnum <= keysz; attnum++)
925 {
926 Datum datum1,
927 datum2;
928 bool isNull1,
929 isNull2;
930 CompactAttribute *att;
931
932 datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
933 datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
934 att = TupleDescCompactAttr(itupdesc, attnum - 1);
935
936 if (isNull1 != isNull2)
937 break;
938
939 if (!isNull1 &&
940 !datum_image_eq(datum1, datum2, att->attbyval, att->attlen))
941 break;
942
943 keepnatts++;
944 }
945
946 return keepnatts;
947}
bool datum_image_eq(Datum value1, Datum value2, bool typByVal, int typLen)
Definition: datum.c:266
int16 attnum
Definition: pg_attribute.h:74
int16 attlen
Definition: tupdesc.h:71
static CompactAttribute * TupleDescCompactAttr(TupleDesc tupdesc, int i)
Definition: tupdesc.h:175

References CompactAttribute::attbyval, CompactAttribute::attlen, attnum, datum_image_eq(), index_getattr(), IndexRelationGetNumberOfKeyAttributes, RelationGetDescr, and TupleDescCompactAttr().

Referenced by _bt_afternewitemoff(), _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_do_singleval(), _bt_keep_natts(), _bt_load(), _bt_set_startikey(), _bt_split_penalty(), and _bt_strategy().
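
A hedged sketch of the split-time use of this helper: estimate how many leading attributes a new pivot tuple must keep so that it still separates the last tuple on the left half from the first tuple on the right half. rel, lastleft, and firstright are placeholders.

    int     keepnatts = _bt_keep_natts_fast(rel, lastleft, firstright);
    int     nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);

    if (keepnatts <= nkeyatts)
    {
        /* suffix truncation can drop attributes after the first keepnatts */
    }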

◆ _bt_killitems()

void _bt_killitems ( IndexScanDesc  scan)

Definition at line 205 of file nbtutils.c.

206{
207 Relation rel = scan->indexRelation;
208 BTScanOpaque so = (BTScanOpaque) scan->opaque;
209 Page page;
210 BTPageOpaque opaque;
211 OffsetNumber minoff;
212 OffsetNumber maxoff;
213 int numKilled = so->numKilled;
214 bool killedsomething = false;
215 Buffer buf;
216
 217 Assert(numKilled > 0);
 218 Assert(BTScanPosIsValid(so->currPos));
219 Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */
220
221 /* Always invalidate so->killedItems[] before leaving so->currPos */
222 so->numKilled = 0;
223
224 /*
225 * We need to iterate through so->killedItems[] in leaf page order; the
226 * loop below expects this (when marking posting list tuples, at least).
227 * so->killedItems[] is now in whatever order the scan returned items in.
228 * Scrollable cursor scans might have even saved the same item/TID twice.
229 *
230 * Sort and unique-ify so->killedItems[] to deal with all this.
231 */
232 if (numKilled > 1)
233 {
234 qsort(so->killedItems, numKilled, sizeof(int), _bt_compare_int);
 235 numKilled = qunique(so->killedItems, numKilled, sizeof(int),
 236 _bt_compare_int);
237 }
238
239 if (!so->dropPin)
240 {
241 /*
242 * We have held the pin on this page since we read the index tuples,
243 * so all we need to do is lock it. The pin will have prevented
244 * concurrent VACUUMs from recycling any of the TIDs on the page.
 245 */
 246 Assert(BTScanPosIsPinned(so->currPos));
247 buf = so->currPos.buf;
248 _bt_lockbuf(rel, buf, BT_READ);
249 }
250 else
251 {
252 XLogRecPtr latestlsn;
253
257
258 latestlsn = BufferGetLSNAtomic(buf);
259 Assert(so->currPos.lsn <= latestlsn);
260 if (so->currPos.lsn != latestlsn)
261 {
262 /* Modified, give up on hinting */
263 _bt_relbuf(rel, buf);
264 return;
265 }
266
267 /* Unmodified, hinting is safe */
268 }
269
270 page = BufferGetPage(buf);
271 opaque = BTPageGetOpaque(page);
272 minoff = P_FIRSTDATAKEY(opaque);
273 maxoff = PageGetMaxOffsetNumber(page);
274
275 /* Iterate through so->killedItems[] in leaf page order */
276 for (int i = 0; i < numKilled; i++)
277 {
278 int itemIndex = so->killedItems[i];
279 BTScanPosItem *kitem = &so->currPos.items[itemIndex];
280 OffsetNumber offnum = kitem->indexOffset;
281
282 Assert(itemIndex >= so->currPos.firstItem &&
283 itemIndex <= so->currPos.lastItem);
284 Assert(i == 0 ||
285 offnum >= so->currPos.items[so->killedItems[i - 1]].indexOffset);
286
287 if (offnum < minoff)
288 continue; /* pure paranoia */
289 while (offnum <= maxoff)
290 {
291 ItemId iid = PageGetItemId(page, offnum);
292 IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
293 bool killtuple = false;
294
295 if (BTreeTupleIsPosting(ituple))
296 {
297 int pi = i + 1;
298 int nposting = BTreeTupleGetNPosting(ituple);
299 int j;
300
301 /*
302 * Note that the page may have been modified in almost any way
303 * since we first read it (in the !so->dropPin case), so it's
304 * possible that this posting list tuple wasn't a posting list
305 * tuple when we first encountered its heap TIDs.
306 */
307 for (j = 0; j < nposting; j++)
308 {
309 ItemPointer item = BTreeTupleGetPostingN(ituple, j);
310
311 if (!ItemPointerEquals(item, &kitem->heapTid))
312 break; /* out of posting list loop */
313
314 /*
315 * kitem must have matching offnum when heap TIDs match,
316 * though only in the common case where the page can't
317 * have been concurrently modified
318 */
319 Assert(kitem->indexOffset == offnum || !so->dropPin);
320
321 /*
322 * Read-ahead to later kitems here.
323 *
324 * We rely on the assumption that not advancing kitem here
325 * will prevent us from considering the posting list tuple
326 * fully dead by not matching its next heap TID in next
327 * loop iteration.
328 *
329 * If, on the other hand, this is the final heap TID in
330 * the posting list tuple, then tuple gets killed
331 * regardless (i.e. we handle the case where the last
332 * kitem is also the last heap TID in the last index tuple
333 * correctly -- posting tuple still gets killed).
334 */
335 if (pi < numKilled)
336 kitem = &so->currPos.items[so->killedItems[pi++]];
337 }
338
339 /*
340 * Don't bother advancing the outermost loop's int iterator to
341 * avoid processing killed items that relate to the same
342 * offnum/posting list tuple. This micro-optimization hardly
343 * seems worth it. (Further iterations of the outermost loop
344 * will fail to match on this same posting list's first heap
345 * TID instead, so we'll advance to the next offnum/index
346 * tuple pretty quickly.)
347 */
348 if (j == nposting)
349 killtuple = true;
350 }
351 else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
352 killtuple = true;
353
354 /*
355 * Mark index item as dead, if it isn't already. Since this
356 * happens while holding a buffer lock possibly in shared mode,
357 * it's possible that multiple processes attempt to do this
358 * simultaneously, leading to multiple full-page images being sent
359 * to WAL (if wal_log_hints or data checksums are enabled), which
360 * is undesirable.
361 */
362 if (killtuple && !ItemIdIsDead(iid))
363 {
364 /* found the item/all posting list items */
365 ItemIdMarkDead(iid);
366 killedsomething = true;
367 break; /* out of inner search loop */
368 }
369 offnum = OffsetNumberNext(offnum);
370 }
371 }
372
373 /*
374 * Since this can be redone later if needed, mark as dirty hint.
375 *
376 * Whenever we mark anything LP_DEAD, we also set the page's
377 * BTP_HAS_GARBAGE flag, which is likewise just a hint. (Note that we
378 * only rely on the page-level flag in !heapkeyspace indexes.)
379 */
380 if (killedsomething)
381 {
 382 opaque->btpo_flags |= BTP_HAS_GARBAGE;
 383 MarkBufferDirtyHint(buf, true);
384 }
385
386 if (!so->dropPin)
387 _bt_unlockbuf(rel, buf);
388 else
389 _bt_relbuf(rel, buf);
390}
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:4499
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:5430
int j
Definition: isn.c:78
#define ItemIdMarkDead(itemId)
Definition: itemid.h:179
#define BTP_HAS_GARBAGE
Definition: nbtree.h:83
static int _bt_compare_int(const void *va, const void *vb)
Definition: nbtutils.c:167
static size_t qunique(void *array, size_t elements, size_t width, int(*compare)(const void *, const void *))
Definition: qunique.h:21
int * killedItems
Definition: nbtree.h:1071
BlockNumber currPage
Definition: nbtree.h:967
int firstItem
Definition: nbtree.h:995
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:999
XLogRecPtr lsn
Definition: nbtree.h:970
ItemPointerData heapTid
Definition: nbtree.h:957
OffsetNumber indexOffset
Definition: nbtree.h:958
Relation heapRelation
Definition: relscan.h:138

References _bt_compare_int(), _bt_getbuf(), _bt_lockbuf(), _bt_relbuf(), _bt_unlockbuf(), Assert(), BT_READ, BTP_HAS_GARBAGE, BTPageGetOpaque, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), BTScanPosIsPinned, BTScanPosIsValid, buf, BTScanPosData::buf, BufferGetLSNAtomic(), BufferGetPage(), BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanOpaqueData::dropPin, BTScanPosData::firstItem, IndexScanDescData::heapRelation, BTScanPosItem::heapTid, i, BTScanPosItem::indexOffset, IndexScanDescData::indexRelation, ItemIdIsDead, ItemIdMarkDead, ItemPointerEquals(), BTScanPosData::items, j, BTScanOpaqueData::killedItems, BTScanPosData::lsn, MarkBufferDirtyHint(), BTScanOpaqueData::numKilled, OffsetNumberNext, IndexScanDescData::opaque, P_FIRSTDATAKEY, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), qsort, qunique(), RelationNeedsWAL, and IndexTupleData::t_tid.

Referenced by _bt_steppage(), btendscan(), btrescan(), and btrestrpos().
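
A short sketch of the deferred-kill pattern used by the callers listed above: the scan records dead items in so->killedItems[] as it returns tuples, and flushes them here before abandoning the current leaf page position. scan and so are the usual scan descriptor and its BTScanOpaque state.

    /* before leaving so->currPos (e.g. when stepping to the next page) */
    if (so->numKilled > 0)
        _bt_killitems(scan);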

◆ _bt_lockbuf()

void _bt_lockbuf ( Relation  rel,
Buffer  buf,
int  access 
)

Definition at line 1040 of file nbtpage.c.

1041{
 1042 /* LockBuffer() asserts that pin is held by this backend */
 1043 LockBuffer(buf, access);
1044
1045 /*
1046 * It doesn't matter that _bt_unlockbuf() won't get called in the event of
1047 * an nbtree error (e.g. a unique violation error). That won't cause
1048 * Valgrind false positives.
1049 *
1050 * The nbtree client requests are superimposed on top of the bufmgr.c
1051 * buffer pin client requests. In the event of an nbtree error the buffer
1052 * will certainly get marked as defined when the backend once again
1053 * acquires its first pin on the buffer. (Of course, if the backend never
1054 * touches the buffer again then it doesn't matter that it remains
1055 * non-accessible to Valgrind.)
1056 *
1057 * Note: When an IndexTuple C pointer gets computed using an ItemId read
1058 * from a page while a lock was held, the C pointer becomes unsafe to
1059 * dereference forever as soon as the lock is released. Valgrind can only
1060 * detect cases where the pointer gets dereferenced with no _current_
1061 * lock/pin held, though.
1062 */
 1063 if (!RelationUsesLocalBuffers(rel))
 1064 VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1065}
void LockBuffer(Buffer buffer, BufferLockMode mode)
Definition: bufmgr.c:5604

References buf, BufferGetPage(), LockBuffer(), RelationUsesLocalBuffers, and VALGRIND_MAKE_MEM_DEFINED.

Referenced by _bt_getbuf(), _bt_getroot(), _bt_killitems(), _bt_moveright(), _bt_pagedel(), _bt_relandgetbuf(), _bt_search(), _bt_set_cleanup_info(), _bt_unlink_halfdead_page(), and btvacuumpage().
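
A minimal sketch of the pin-then-lock pairing used throughout nbtree: the caller already holds a pin on buf and brackets page access with _bt_lockbuf()/_bt_unlockbuf() rather than calling LockBuffer() directly, so the Valgrind and local-buffer handling stays consistent.

    _bt_lockbuf(rel, buf, BT_READ);
    /* ... examine the page while the shared lock is held ... */
    _bt_unlockbuf(rel, buf);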

◆ _bt_metaversion()

void _bt_metaversion ( Relation  rel,
bool *  heapkeyspace,
bool *  allequalimage 
)

Definition at line 740 of file nbtpage.c.

741{
742 BTMetaPageData *metad;
743
744 if (rel->rd_amcache == NULL)
745 {
746 Buffer metabuf;
747
748 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
749 metad = _bt_getmeta(rel, metabuf);
750
751 /*
752 * If there's no root page yet, _bt_getroot() doesn't expect a cache
753 * to be made, so just stop here. (XXX perhaps _bt_getroot() should
754 * be changed to allow this case.)
755 */
756 if (metad->btm_root == P_NONE)
757 {
758 *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
759 *allequalimage = metad->btm_allequalimage;
760
761 _bt_relbuf(rel, metabuf);
762 return;
763 }
764
765 /*
766 * Cache the metapage data for next time
767 *
768 * An on-the-fly version upgrade performed by _bt_upgrademetapage()
769 * can change the nbtree version for an index without invalidating any
770 * local cache. This is okay because it can only happen when moving
771 * from version 2 to version 3, both of which are !heapkeyspace
772 * versions.
773 */
 774 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
 775 sizeof(BTMetaPageData));
776 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
777 _bt_relbuf(rel, metabuf);
778 }
779
780 /* Get cached page */
781 metad = (BTMetaPageData *) rel->rd_amcache;
782 /* We shouldn't have cached it if any of these fail */
 783 Assert(metad->btm_magic == BTREE_MAGIC);
 784 Assert(metad->btm_version >= BTREE_MIN_VERSION);
 785 Assert(metad->btm_version <= BTREE_VERSION);
 786 Assert(!metad->btm_allequalimage ||
 787 metad->btm_version > BTREE_NOVAC_VERSION);
 788 Assert(metad->btm_fastroot != P_NONE);
789
790 *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
791 *allequalimage = metad->btm_allequalimage;
792}

References _bt_getbuf(), _bt_getmeta(), _bt_relbuf(), Assert(), BT_READ, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, MemoryContextAlloc(), P_NONE, RelationData::rd_amcache, and RelationData::rd_indexcxt.

Referenced by _bt_first(), _bt_mkscankey(), and bt_index_check_callback().
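
A brief sketch of the typical call, as made from _bt_mkscankey() and _bt_first(): fetch the two metapage facts needed to interpret the index. rel is a placeholder for the open index relation.

    bool        heapkeyspace;
    bool        allequalimage;

    _bt_metaversion(rel, &heapkeyspace, &allequalimage);
    /* heapkeyspace: version 4+ indexes treat heap TID as a key column */
    /* allequalimage: every key column's opclass is deduplication-safe */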

◆ _bt_mkscankey()

BTScanInsert _bt_mkscankey ( Relation  rel,
IndexTuple  itup 
)

Definition at line 59 of file nbtutils.c.

 60{
 61 BTScanInsert key;
62 ScanKey skey;
63 TupleDesc itupdesc;
64 int indnkeyatts;
65 int16 *indoption;
66 int tupnatts;
67 int i;
68
 69 itupdesc = RelationGetDescr(rel);
 70 indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
71 indoption = rel->rd_indoption;
72 tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0;
73
 74 Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel));
 75
76 /*
77 * We'll execute search using scan key constructed on key columns.
78 * Truncated attributes and non-key attributes are omitted from the final
79 * scan key.
80 */
81 key = palloc(offsetof(BTScanInsertData, scankeys) +
82 sizeof(ScanKeyData) * indnkeyatts);
83 if (itup)
84 _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage);
85 else
86 {
87 /* Utility statement callers can set these fields themselves */
88 key->heapkeyspace = true;
89 key->allequalimage = false;
90 }
91 key->anynullkeys = false; /* initial assumption */
92 key->nextkey = false; /* usual case, required by btinsert */
93 key->backward = false; /* usual case, required by btinsert */
94 key->keysz = Min(indnkeyatts, tupnatts);
95 key->scantid = key->heapkeyspace && itup ?
96 BTreeTupleGetHeapTID(itup) : NULL;
97 skey = key->scankeys;
98 for (i = 0; i < indnkeyatts; i++)
99 {
100 FmgrInfo *procinfo;
101 Datum arg;
102 bool null;
103 int flags;
104
105 /*
106 * We can use the cached (default) support procs since no cross-type
107 * comparison can be needed.
108 */
109 procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
110
111 /*
112 * Key arguments built from truncated attributes (or when caller
113 * provides no tuple) are defensively represented as NULL values. They
114 * should never be used.
115 */
116 if (i < tupnatts)
117 arg = index_getattr(itup, i + 1, itupdesc, &null);
118 else
119 {
120 arg = (Datum) 0;
121 null = true;
122 }
123 flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
 124 ScanKeyEntryInitializeWithInfo(&skey[i],
 125 flags,
 126 (AttrNumber) (i + 1),
 127 InvalidStrategy,
 128 InvalidOid,
129 rel->rd_indcollation[i],
130 procinfo,
131 arg);
132 /* Record if any key attribute is NULL (or truncated) */
133 if (null)
134 key->anynullkeys = true;
135 }
136
137 /*
138 * In NULLS NOT DISTINCT mode, we pretend that there are no null keys, so
139 * that full uniqueness check is done.
140 */
141 if (rel->rd_index->indnullsnotdistinct)
142 key->anynullkeys = false;
143
144 return key;
145}
#define SK_BT_INDOPTION_SHIFT
Definition: nbtree.h:1115
int16 * rd_indoption
Definition: rel.h:211
Form_pg_index rd_index
Definition: rel.h:192

References _bt_metaversion(), arg, Assert(), BTORDER_PROC, BTreeTupleGetHeapTID(), BTreeTupleGetNAtts, i, index_getattr(), index_getprocinfo(), IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, InvalidOid, InvalidStrategy, sort-test::key, Min, palloc(), RelationData::rd_indcollation, RelationData::rd_index, RelationData::rd_indoption, RelationGetDescr, ScanKeyEntryInitializeWithInfo(), SK_BT_INDOPTION_SHIFT, and SK_ISNULL.

Referenced by _bt_doinsert(), _bt_leafbuild(), _bt_pagedel(), bt_mkscankey_pivotsearch(), bt_rootdescend(), tuplesort_begin_cluster(), and tuplesort_begin_index_btree().
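
A hedged sketch, loosely following _bt_doinsert(): build an insertion scan key from an index tuple, descend to the target leaf page, and clean up. rel, heaprel, and itup are placeholders supplied by the caller.

    BTScanInsert itup_key = _bt_mkscankey(rel, itup);
    Buffer       lbuf;
    BTStack      stack;

    stack = _bt_search(rel, heaprel, itup_key, &lbuf, BT_WRITE);
    /* ... find the insert location on lbuf and insert the tuple ... */
    _bt_relbuf(rel, lbuf);
    _bt_freestack(stack);
    pfree(itup_key);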

◆ _bt_next()

bool _bt_next ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 1585 of file nbtsearch.c.

1586{
1587 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1588
 1589 Assert(BTScanPosIsValid(so->currPos));
 1590
1591 /*
1592 * Advance to next tuple on current page; or if there's no more, try to
1593 * step to the next page with data.
1594 */
1595 if (ScanDirectionIsForward(dir))
1596 {
1597 if (++so->currPos.itemIndex > so->currPos.lastItem)
1598 {
1599 if (!_bt_steppage(scan, dir))
1600 return false;
1601 }
1602 }
1603 else
1604 {
1605 if (--so->currPos.itemIndex < so->currPos.firstItem)
1606 {
1607 if (!_bt_steppage(scan, dir))
1608 return false;
1609 }
1610 }
1611
1612 _bt_returnitem(scan, so);
1613 return true;
1614}
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1646
int lastItem
Definition: nbtree.h:996
int itemIndex
Definition: nbtree.h:997

References _bt_returnitem(), _bt_steppage(), Assert(), BTScanPosIsValid, BTScanOpaqueData::currPos, BTScanPosData::firstItem, BTScanPosData::itemIndex, BTScanPosData::lastItem, IndexScanDescData::opaque, and ScanDirectionIsForward.

Referenced by btgetbitmap(), and btgettuple().
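
A compact sketch of the driving loop in a btgettuple()-style caller: _bt_first() returns the first match, and _bt_next() then advances one tuple at a time until the scan is exhausted.

    if (_bt_first(scan, ForwardScanDirection))
    {
        do
        {
            /* scan->xs_heaptid holds the current match's heap TID */
        } while (_bt_next(scan, ForwardScanDirection));
    }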

◆ _bt_pagedel()

void _bt_pagedel ( Relation  rel,
Buffer  leafbuf,
BTVacState vstate 
)

Definition at line 1801 of file nbtpage.c.

1802{
1803 BlockNumber rightsib;
1804 bool rightsib_empty;
1805 Page page;
1806 BTPageOpaque opaque;
1807
1808 /*
1809 * Save original leafbuf block number from caller. Only deleted blocks
1810 * that are <= scanblkno are added to bulk delete stat's pages_deleted
1811 * count.
1812 */
1813 BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
1814
1815 /*
1816 * "stack" is a search stack leading (approximately) to the target page.
1817 * It is initially NULL, but when iterating, we keep it to avoid
1818 * duplicated search effort.
1819 *
1820 * Also, when "stack" is not NULL, we have already checked that the
1821 * current page is not the right half of an incomplete split, i.e. the
1822 * left sibling does not have its INCOMPLETE_SPLIT flag set, including
1823 * when the current target page is to the right of caller's initial page
1824 * (the scanblkno page).
1825 */
1826 BTStack stack = NULL;
1827
1828 for (;;)
1829 {
1830 page = BufferGetPage(leafbuf);
1831 opaque = BTPageGetOpaque(page);
1832
1833 /*
1834 * Internal pages are never deleted directly, only as part of deleting
1835 * the whole subtree all the way down to leaf level.
1836 *
1837 * Also check for deleted pages here. Caller never passes us a fully
1838 * deleted page. Only VACUUM can delete pages, so there can't have
1839 * been a concurrent deletion. Assume that we reached any deleted
1840 * page encountered here by following a sibling link, and that the
1841 * index is corrupt.
1842 */
1843 Assert(!P_ISDELETED(opaque));
1844 if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
1845 {
1846 /*
1847 * Pre-9.4 page deletion only marked internal pages as half-dead,
1848 * but now we only use that flag on leaf pages. The old algorithm
1849 * was never supposed to leave half-dead pages in the tree, it was
1850 * just a transient state, but it was nevertheless possible in
1851 * error scenarios. We don't know how to deal with them here. They
1852 * are harmless as far as searches are considered, but inserts
1853 * into the deleted keyspace could add out-of-order downlinks in
1854 * the upper levels. Log a notice, hopefully the admin will notice
1855 * and reindex.
1856 */
1857 if (P_ISHALFDEAD(opaque))
1858 ereport(LOG,
1859 (errcode(ERRCODE_INDEX_CORRUPTED),
1860 errmsg("index \"%s\" contains a half-dead internal page",
1862 errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
1863
1864 if (P_ISDELETED(opaque))
1865 ereport(LOG,
1866 (errcode(ERRCODE_INDEX_CORRUPTED),
1867 errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
1868 BufferGetBlockNumber(leafbuf),
 1869 scanblkno,
 1870 RelationGetRelationName(rel))));
1871
1872 _bt_relbuf(rel, leafbuf);
1873 return;
1874 }
1875
1876 /*
1877 * We can never delete rightmost pages nor root pages. While at it,
1878 * check that page is empty, since it's possible that the leafbuf page
1879 * was empty a moment ago, but has since had some inserts.
1880 *
1881 * To keep the algorithm simple, we also never delete an incompletely
1882 * split page (they should be rare enough that this doesn't make any
1883 * meaningful difference to disk usage):
1884 *
1885 * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
1886 * left half of an incomplete split, but ensuring that it's not the
1887 * right half is more complicated. For that, we have to check that
1888 * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
1889 * _bt_leftsib_splitflag(). On the first iteration, we temporarily
1890 * release the lock on scanblkno/leafbuf, check the left sibling, and
1891 * construct a search stack to scanblkno. On subsequent iterations,
1892 * we know we stepped right from a page that passed these tests, so
1893 * it's OK.
1894 */
1895 if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
1896 P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
1897 P_INCOMPLETE_SPLIT(opaque))
1898 {
1899 /* Should never fail to delete a half-dead page */
1900 Assert(!P_ISHALFDEAD(opaque));
1901
1902 _bt_relbuf(rel, leafbuf);
1903 return;
1904 }
1905
1906 /*
1907 * First, remove downlink pointing to the page (or a parent of the
1908 * page, if we are going to delete a taller subtree), and mark the
1909 * leafbuf page half-dead
1910 */
1911 if (!P_ISHALFDEAD(opaque))
1912 {
1913 /*
1914 * We need an approximate pointer to the page's parent page. We
1915 * use a variant of the standard search mechanism to search for
1916 * the page's high key; this will give us a link to either the
1917 * current parent or someplace to its left (if there are multiple
1918 * equal high keys, which is possible with !heapkeyspace indexes).
1919 *
1920 * Also check if this is the right-half of an incomplete split
1921 * (see comment above).
1922 */
1923 if (!stack)
1924 {
1925 BTScanInsert itup_key;
1926 ItemId itemid;
1927 IndexTuple targetkey;
1928 BlockNumber leftsib,
1929 leafblkno;
1930 Buffer sleafbuf;
1931
1932 itemid = PageGetItemId(page, P_HIKEY);
1933 targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
1934
1935 leftsib = opaque->btpo_prev;
1936 leafblkno = BufferGetBlockNumber(leafbuf);
1937
1938 /*
1939 * To avoid deadlocks, we'd better drop the leaf page lock
1940 * before going further.
1941 */
1942 _bt_unlockbuf(rel, leafbuf);
1943
1944 /*
1945 * Check that the left sibling of leafbuf (if any) is not
1946 * marked with INCOMPLETE_SPLIT flag before proceeding
1947 */
1948 Assert(leafblkno == scanblkno);
1949 if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
1950 {
1951 ReleaseBuffer(leafbuf);
1952 return;
1953 }
1954
1955 /*
1956 * We need an insertion scan key, so build one.
1957 *
1958 * _bt_search searches for the leaf page that contains any
1959 * matching non-pivot tuples, but we need it to "search" for
1960 * the high key pivot from the page that we're set to delete.
1961 * Compensate for the mismatch by having _bt_search locate the
1962 * last position < equal-to-untruncated-prefix non-pivots.
1963 */
1964 itup_key = _bt_mkscankey(rel, targetkey);
1965
1966 /* Set up a BTLessStrategyNumber-like insertion scan key */
1967 itup_key->nextkey = false;
1968 itup_key->backward = true;
1969 stack = _bt_search(rel, NULL, itup_key, &sleafbuf, BT_READ);
1970 /* won't need a second lock or pin on leafbuf */
1971 _bt_relbuf(rel, sleafbuf);
1972
1973 /*
1974 * Re-lock the leaf page, and start over to use our stack
1975 * within _bt_mark_page_halfdead. We must do it that way
1976 * because it's possible that leafbuf can no longer be
1977 * deleted. We need to recheck.
1978 *
1979 * Note: We can't simply hold on to the sleafbuf lock instead,
1980 * because it's barely possible that sleafbuf is not the same
1981 * page as leafbuf. This happens when leafbuf split after our
1982 * original lock was dropped, but before _bt_search finished
1983 * its descent. We rely on the assumption that we'll find
1984 * leafbuf isn't safe to delete anymore in this scenario.
1985 * (Page deletion can cope with the stack being to the left of
1986 * leafbuf, but not to the right of leafbuf.)
1987 */
1988 _bt_lockbuf(rel, leafbuf, BT_WRITE);
1989 continue;
1990 }
1991
1992 /*
1993 * See if it's safe to delete the leaf page, and determine how
1994 * many parent/internal pages above the leaf level will be
1995 * deleted. If it's safe then _bt_mark_page_halfdead will also
1996 * perform the first phase of deletion, which includes marking the
1997 * leafbuf page half-dead.
1998 */
1999 Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
2000 if (!_bt_mark_page_halfdead(rel, vstate->info->heaprel, leafbuf,
2001 stack))
2002 {
2003 _bt_relbuf(rel, leafbuf);
2004 return;
2005 }
2006 }
2007 else
2008 {
2009 INJECTION_POINT("nbtree-finish-half-dead-page-vacuum", NULL);
2010 }
2011
2012 /*
2013 * Then unlink it from its siblings. Each call to
2014 * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
2015 * making it shallower. Iterate until the leafbuf page is deleted.
2016 */
2017 rightsib_empty = false;
2018 Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
2019 while (P_ISHALFDEAD(opaque))
2020 {
2021 /* Check for interrupts in _bt_unlink_halfdead_page */
2022 if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
2023 &rightsib_empty, vstate))
2024 {
2025 /*
2026 * _bt_unlink_halfdead_page should never fail, since we
2027 * established that deletion is generally safe in
2028 * _bt_mark_page_halfdead -- index must be corrupt.
2029 *
2030 * Note that _bt_unlink_halfdead_page already released the
2031 * lock and pin on leafbuf for us.
2032 */
2033 Assert(false);
2034 return;
2035 }
2036 }
2037
2038 Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
2039
2040 rightsib = opaque->btpo_next;
2041
2042 _bt_relbuf(rel, leafbuf);
2043
2044 /*
2045 * Check here, as calling loops will have locks held, preventing
2046 * interrupts from being processed.
2047 */
 2048 CHECK_FOR_INTERRUPTS();
 2049
2050 /*
2051 * The page has now been deleted. If its right sibling is completely
2052 * empty, it's possible that the reason we haven't deleted it earlier
2053 * is that it was the rightmost child of the parent. Now that we
2054 * removed the downlink for this page, the right sibling might now be
2055 * the only child of the parent, and could be removed. It would be
2056 * picked up by the next vacuum anyway, but might as well try to
2057 * remove it now, so loop back to process the right sibling.
2058 *
2059 * Note: This relies on the assumption that _bt_getstackbuf() will be
2060 * able to reuse our original descent stack with a different child
2061 * block (provided that the child block is to the right of the
2062 * original leaf page reached by _bt_search()). It will even update
2063 * the descent stack each time we loop around, avoiding repeated work.
2064 */
2065 if (!rightsib_empty)
2066 break;
2067
2068 leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2069 }
2070}
#define LOG
Definition: elog.h:31
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:547
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:123
static bool _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
Definition: nbtpage.c:1694
static bool _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf, BTStack stack)
Definition: nbtpage.c:2091
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, bool *rightsib_empty, BTVacState *vstate)
Definition: nbtpage.c:2316
#define P_ISHALFDEAD(opaque)
Definition: nbtree.h:225
#define P_ISDELETED(opaque)
Definition: nbtree.h:223
#define P_ISROOT(opaque)
Definition: nbtree.h:222
IndexVacuumInfo * info
Definition: nbtree.h:333
Relation heaprel
Definition: genam.h:74

References _bt_getbuf(), _bt_leftsib_splitflag(), _bt_lockbuf(), _bt_mark_page_halfdead(), _bt_mkscankey(), _bt_relbuf(), _bt_search(), _bt_unlink_halfdead_page(), _bt_unlockbuf(), Assert(), BTScanInsertData::backward, BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BufferGetBlockNumber(), BufferGetPage(), CHECK_FOR_INTERRUPTS, CopyIndexTuple(), ereport, errcode(), errhint(), errmsg(), errmsg_internal(), IndexVacuumInfo::heaprel, BTVacState::info, INJECTION_POINT, LOG, BTScanInsertData::nextkey, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, P_ISROOT, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), RelationGetRelationName, and ReleaseBuffer().

Referenced by btvacuumpage().
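
A hedged sketch of the only call site, btvacuumpage(): when VACUUM leaves a leaf page empty it attempts deletion, handing over the write-locked buffer, which _bt_pagedel() releases itself. rel, buf, page, opaque, and vstate stand in for the caller's variables; the real caller's emptiness test is more involved.

    if (P_ISLEAF(opaque) &&
        PageGetMaxOffsetNumber(page) < P_FIRSTDATAKEY(opaque))
    {
        /* empty leaf page: try to delete it (buf is consumed either way) */
        _bt_pagedel(rel, buf, vstate);
    }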

◆ _bt_pageinit()

void _bt_pageinit ( Page  page,
Size  size 
)

Definition at line 1130 of file nbtpage.c.

1131{
1132 PageInit(page, size, sizeof(BTPageOpaqueData));
1133}
void PageInit(Page page, Size pageSize, Size specialSize)
Definition: bufpage.c:42

References PageInit().

Referenced by _bt_allocbuf(), _bt_blnewpage(), _bt_initmetapage(), _bt_restore_meta(), _bt_split(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), and btree_xlog_unlink_page().
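
A one-line sketch of the usual call, made whenever a page is allocated or reinitialized: zero the page and reserve special space for BTPageOpaqueData. page and buf are the caller's page pointer and pinned buffer.

    _bt_pageinit(page, BufferGetPageSize(buf));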

◆ _bt_parallel_build_main()

void _bt_parallel_build_main ( dsm_segment seg,
shm_toc toc 
)

Definition at line 1741 of file nbtsort.c.

1742{
1743 char *sharedquery;
1744 BTSpool *btspool;
1745 BTSpool *btspool2;
1746 BTShared *btshared;
1747 Sharedsort *sharedsort;
1748 Sharedsort *sharedsort2;
1749 Relation heapRel;
1750 Relation indexRel;
1751 LOCKMODE heapLockmode;
1752 LOCKMODE indexLockmode;
1753 WalUsage *walusage;
1754 BufferUsage *bufferusage;
1755 int sortmem;
1756
 1757#ifdef BTREE_BUILD_STATS
 1758 if (log_btree_build_stats)
1759 ResetUsage();
1760#endif /* BTREE_BUILD_STATS */
1761
1762 /*
1763 * The only possible status flag that can be set to the parallel worker is
1764 * PROC_IN_SAFE_IC.
1765 */
 1766 Assert((MyProc->statusFlags == 0) ||
 1767 (MyProc->statusFlags == PROC_IN_SAFE_IC));
1768
1769 /* Set debug_query_string for individual workers first */
1770 sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
1771 debug_query_string = sharedquery;
1772
1773 /* Report the query string from leader */
 1774 pgstat_report_activity(STATE_RUNNING, debug_query_string);
 1775
1776 /* Look up nbtree shared state */
1777 btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
1778
1779 /* Open relations using lock modes known to be obtained by index.c */
1780 if (!btshared->isconcurrent)
1781 {
1782 heapLockmode = ShareLock;
1783 indexLockmode = AccessExclusiveLock;
1784 }
1785 else
1786 {
1787 heapLockmode = ShareUpdateExclusiveLock;
1788 indexLockmode = RowExclusiveLock;
1789 }
1790
1791 /* Track query ID */
1792 pgstat_report_query_id(btshared->queryid, false);
1793
1794 /* Open relations within worker */
1795 heapRel = table_open(btshared->heaprelid, heapLockmode);
1796 indexRel = index_open(btshared->indexrelid, indexLockmode);
1797
1798 /* Initialize worker's own spool */
1799 btspool = palloc0_object(BTSpool);
1800 btspool->heap = heapRel;
1801 btspool->index = indexRel;
1802 btspool->isunique = btshared->isunique;
1803 btspool->nulls_not_distinct = btshared->nulls_not_distinct;
1804
1805 /* Look up shared state private to tuplesort.c */
1806 sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
1807 tuplesort_attach_shared(sharedsort, seg);
1808 if (!btshared->isunique)
1809 {
1810 btspool2 = NULL;
1811 sharedsort2 = NULL;
1812 }
1813 else
1814 {
1815 /* Allocate memory for worker's own private secondary spool */
1816 btspool2 = palloc0_object(BTSpool);
1817
1818 /* Initialize worker's own secondary spool */
1819 btspool2->heap = btspool->heap;
1820 btspool2->index = btspool->index;
1821 btspool2->isunique = false;
1822 /* Look up shared state private to tuplesort.c */
1823 sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
1824 tuplesort_attach_shared(sharedsort2, seg);
1825 }
1826
1827 /* Prepare to track buffer usage during parallel execution */
 1828 InstrStartParallelQuery();
 1829
1830 /* Perform sorting of spool, and possibly a spool2 */
1831 sortmem = maintenance_work_mem / btshared->scantuplesortstates;
1832 _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
1833 sharedsort2, sortmem, false);
1834
1835 /* Report WAL/buffer usage during parallel execution */
1836 bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
 1837 walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
 1838 InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
1839 &walusage[ParallelWorkerNumber]);
1840
 1841#ifdef BTREE_BUILD_STATS
 1842 if (log_btree_build_stats)
1843 {
1844 ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
1845 ResetUsage();
1846 }
1847#endif /* BTREE_BUILD_STATS */
1848
1849 index_close(indexRel, indexLockmode);
1850 table_close(heapRel, heapLockmode);
1851}
int ParallelWorkerNumber
Definition: parallel.c:115
void pgstat_report_query_id(int64 query_id, bool force)
void pgstat_report_activity(BackendState state, const char *cmd_str)
@ STATE_RUNNING
#define palloc0_object(type)
Definition: fe_memutils.h:75
int maintenance_work_mem
Definition: globals.c:133
bool log_btree_build_stats
Definition: guc_tables.c:525
void index_close(Relation relation, LOCKMODE lockmode)
Definition: indexam.c:177
Relation index_open(Oid relationId, LOCKMODE lockmode)
Definition: indexam.c:133
void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition: instrument.c:208
void InstrStartParallelQuery(void)
Definition: instrument.c:200
int LOCKMODE
Definition: lockdefs.h:26
#define AccessExclusiveLock
Definition: lockdefs.h:43
#define ShareUpdateExclusiveLock
Definition: lockdefs.h:39
#define ShareLock
Definition: lockdefs.h:40
#define RowExclusiveLock
Definition: lockdefs.h:38
#define PARALLEL_KEY_BUFFER_USAGE
Definition: nbtsort.c:67
#define PARALLEL_KEY_TUPLESORT_SPOOL2
Definition: nbtsort.c:64
static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, BTShared *btshared, Sharedsort *sharedsort, Sharedsort *sharedsort2, int sortmem, bool progress)
Definition: nbtsort.c:1866
#define PARALLEL_KEY_BTREE_SHARED
Definition: nbtsort.c:62
#define PARALLEL_KEY_TUPLESORT
Definition: nbtsort.c:63
#define PARALLEL_KEY_QUERY_TEXT
Definition: nbtsort.c:65
#define PARALLEL_KEY_WAL_USAGE
Definition: nbtsort.c:66
const char * debug_query_string
Definition: postgres.c:89
void ShowUsage(const char *title)
Definition: postgres.c:5068
void ResetUsage(void)
Definition: postgres.c:5061
#define PROC_IN_SAFE_IC
Definition: proc.h:59
void * shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
Definition: shm_toc.c:232
PGPROC * MyProc
Definition: proc.c:67
bool isconcurrent
Definition: nbtsort.c:105
Oid heaprelid
Definition: nbtsort.c:101
int64 queryid
Definition: nbtsort.c:109
bool isunique
Definition: nbtsort.c:103
int scantuplesortstates
Definition: nbtsort.c:106
Oid indexrelid
Definition: nbtsort.c:102
bool nulls_not_distinct
Definition: nbtsort.c:104
bool isunique
Definition: nbtsort.c:85
bool nulls_not_distinct
Definition: nbtsort.c:86
Relation heap
Definition: nbtsort.c:83
Relation index
Definition: nbtsort.c:84
uint8 statusFlags
Definition: proc.h:259
void table_close(Relation relation, LOCKMODE lockmode)
Definition: table.c:126
Relation table_open(Oid relationId, LOCKMODE lockmode)
Definition: table.c:40
void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg)
Definition: tuplesort.c:2945

References _bt_parallel_scan_and_sort(), AccessExclusiveLock, Assert(), debug_query_string, BTSpool::heap, BTShared::heaprelid, BTSpool::index, index_close(), index_open(), BTShared::indexrelid, InstrEndParallelQuery(), InstrStartParallelQuery(), BTShared::isconcurrent, BTSpool::isunique, BTShared::isunique, log_btree_build_stats, maintenance_work_mem, MyProc, BTSpool::nulls_not_distinct, BTShared::nulls_not_distinct, palloc0_object, PARALLEL_KEY_BTREE_SHARED, PARALLEL_KEY_BUFFER_USAGE, PARALLEL_KEY_QUERY_TEXT, PARALLEL_KEY_TUPLESORT, PARALLEL_KEY_TUPLESORT_SPOOL2, PARALLEL_KEY_WAL_USAGE, ParallelWorkerNumber, pgstat_report_activity(), pgstat_report_query_id(), PROC_IN_SAFE_IC, BTShared::queryid, ResetUsage(), RowExclusiveLock, BTShared::scantuplesortstates, ShareLock, ShareUpdateExclusiveLock, shm_toc_lookup(), ShowUsage(), STATE_RUNNING, PGPROC::statusFlags, table_close(), table_open(), and tuplesort_attach_shared().

◆ _bt_parallel_done()

void _bt_parallel_done ( IndexScanDesc  scan)

Definition at line 1050 of file nbtree.c.

1051{
1052 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1053 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1054 BTParallelScanDesc btscan;
1055 bool status_changed = false;
1056
 1057 Assert(!BTScanPosIsValid(so->currPos));
 1058
1059 /* Do nothing, for non-parallel scans */
1060 if (parallel_scan == NULL)
1061 return;
1062
1063 /*
1064 * Should not mark parallel scan done when there's still a pending
1065 * primitive index scan
1066 */
1067 if (so->needPrimScan)
1068 return;
1069
1070 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1071 parallel_scan->ps_offset_am);
1072
1073 /*
1074 * Mark the parallel scan as done, unless some other process did so
1075 * already
1076 */
1077 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1078 Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
1079 if (btscan->btps_pageStatus != BTPARALLEL_DONE)
1080 {
1081 btscan->btps_pageStatus = BTPARALLEL_DONE;
1082 status_changed = true;
1083 }
1084 LWLockRelease(&btscan->btps_lock);
1085
1086 /* wake up all the workers associated with this parallel scan */
1087 if (status_changed)
1088 ConditionVariableBroadcast(&btscan->btps_cv);
1089}
#define OffsetToPointer(base, offset)
Definition: c.h:785
void ConditionVariableBroadcast(ConditionVariable *cv)
@ BTPARALLEL_NEED_PRIMSCAN
Definition: nbtree.c:57
@ BTPARALLEL_DONE
Definition: nbtree.c:60
struct BTParallelScanDescData * BTParallelScanDesc
Definition: nbtree.c:93

References Assert(), BTPARALLEL_DONE, BTPARALLEL_NEED_PRIMSCAN, BTScanPosIsValid, ConditionVariableBroadcast(), BTScanOpaqueData::currPos, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTScanOpaqueData::needPrimScan, OffsetToPointer, IndexScanDescData::opaque, and IndexScanDescData::parallel_scan.

Referenced by _bt_endpoint(), _bt_first(), _bt_parallel_seize(), _bt_readnextpage(), and _bt_start_prim_scan().

◆ _bt_parallel_primscan_schedule()

void _bt_parallel_primscan_schedule ( IndexScanDesc  scan,
BlockNumber  curr_page 
)

Definition at line 1100 of file nbtree.c.

1101{
1102 Relation rel = scan->indexRelation;
1103 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1104 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1105 BTParallelScanDesc btscan;
1106
1107 Assert(so->numArrayKeys);
1108
1109 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1110 parallel_scan->ps_offset_am);
1111
1112 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1113 if (btscan->btps_lastCurrPage == curr_page &&
1114 btscan->btps_pageStatus == BTPARALLEL_IDLE)
1115 {
1116 btscan->btps_nextScanPage = InvalidBlockNumber;
1117 btscan->btps_lastCurrPage = InvalidBlockNumber;
1118 btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
1119
1120 /* Serialize scan's current array keys */
1121 _bt_parallel_serialize_arrays(rel, btscan, so);
1122 }
1123 LWLockRelease(&btscan->btps_lock);
1124}
@ BTPARALLEL_IDLE
Definition: nbtree.c:59
static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
Definition: nbtree.c:732

References _bt_parallel_serialize_arrays(), Assert(), BTPARALLEL_IDLE, BTPARALLEL_NEED_PRIMSCAN, IndexScanDescData::indexRelation, InvalidBlockNumber, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTScanOpaqueData::numArrayKeys, OffsetToPointer, IndexScanDescData::opaque, and IndexScanDescData::parallel_scan.

Referenced by _bt_advance_array_keys(), and _bt_readpage().

◆ _bt_parallel_release()

void _bt_parallel_release ( IndexScanDesc  scan,
BlockNumber  next_scan_page,
BlockNumber  curr_page 
)

Definition at line 1023 of file nbtree.c.

1025{
1026 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1027 BTParallelScanDesc btscan;
1028
1029 Assert(BlockNumberIsValid(next_scan_page));
1030
1031 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1032 parallel_scan->ps_offset_am);
 1033
 1034 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1035 btscan->btps_nextScanPage = next_scan_page;
 1036 btscan->btps_lastCurrPage = curr_page;
 1037 btscan->btps_pageStatus = BTPARALLEL_IDLE;
 1038 LWLockRelease(&btscan->btps_lock);
 1039 ConditionVariableSignal(&btscan->btps_cv);
1040}
void ConditionVariableSignal(ConditionVariable *cv)
BTPS_State btps_pageStatus
Definition: nbtree.c:72
BlockNumber btps_lastCurrPage
Definition: nbtree.c:70
ConditionVariable btps_cv
Definition: nbtree.c:76
BlockNumber btps_nextScanPage
Definition: nbtree.c:69

References Assert(), BlockNumberIsValid(), BTPARALLEL_IDLE, BTParallelScanDescData::btps_cv, BTParallelScanDescData::btps_lastCurrPage, BTParallelScanDescData::btps_lock, BTParallelScanDescData::btps_nextScanPage, BTParallelScanDescData::btps_pageStatus, ConditionVariableSignal(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), OffsetToPointer, IndexScanDescData::parallel_scan, and ParallelIndexScanDescData::ps_offset_am.

Referenced by _bt_readnextpage(), and _bt_readpage().

◆ _bt_parallel_seize()

bool _bt_parallel_seize ( IndexScanDesc  scan,
BlockNumber next_scan_page,
BlockNumber last_curr_page,
bool  first 
)

Definition at line 885 of file nbtree.c.

887{
888 Relation rel = scan->indexRelation;
889 BTScanOpaque so = (BTScanOpaque) scan->opaque;
890 bool exit_loop = false,
891 status = true,
892 endscan = false;
893 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
894 BTParallelScanDesc btscan;
895
896 *next_scan_page = InvalidBlockNumber;
897 *last_curr_page = InvalidBlockNumber;
898
899 /*
900 * Reset so->currPos, and initialize moreLeft/moreRight such that the next
901 * call to _bt_readnextpage treats this backend similarly to a serial
902 * backend that steps from *last_curr_page to *next_scan_page (unless this
903 * backend's so->currPos is initialized by _bt_readfirstpage before then).
904 */
906 so->currPos.moreLeft = so->currPos.moreRight = true;
907
908 if (first)
909 {
910 /*
911 * Initialize array related state when called from _bt_first, assuming
912 * that this will be the first primitive index scan for the scan
913 */
914 so->needPrimScan = false;
915 so->scanBehind = false;
916 so->oppositeDirCheck = false;
917 }
918 else
919 {
920 /*
921 * Don't attempt to seize the scan when it requires another primitive
922 * index scan, since caller's backend cannot start it right now
923 */
924 if (so->needPrimScan)
925 return false;
926 }
927
928 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
929 parallel_scan->ps_offset_am);
930
931 while (1)
932 {
933 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
934
935 if (btscan->btps_pageStatus == BTPARALLEL_DONE)
936 {
937 /* We're done with this parallel index scan */
938 status = false;
939 }
940 else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
941 btscan->btps_nextScanPage == P_NONE)
942 {
943 /* End this parallel index scan */
944 status = false;
945 endscan = true;
946 }
947 else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
948 {
949 Assert(so->numArrayKeys);
950
951 if (first)
952 {
953 /* Can start scheduled primitive scan right away, so do so */
954 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
955
956 /* Restore scan's array keys from serialized values */
957 _bt_parallel_restore_arrays(rel, btscan, so);
958 exit_loop = true;
959 }
960 else
961 {
962 /*
963 * Don't attempt to seize the scan when it requires another
964 * primitive index scan, since caller's backend cannot start
965 * it right now
966 */
967 status = false;
968 }
969
970 /*
971 * Either way, update backend local state to indicate that a
972 * pending primitive scan is required
973 */
974 so->needPrimScan = true;
975 so->scanBehind = false;
976 so->oppositeDirCheck = false;
977 }
978 else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
979 {
980 /*
981 * We have successfully seized control of the scan for the purpose
982 * of advancing it to a new page!
983 */
984 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
985 Assert(btscan->btps_nextScanPage != P_NONE);
986 *next_scan_page = btscan->btps_nextScanPage;
987 *last_curr_page = btscan->btps_lastCurrPage;
988 exit_loop = true;
989 }
990 LWLockRelease(&btscan->btps_lock);
991 if (exit_loop || !status)
992 break;
993 ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
994 }
 995 ConditionVariableCancelSleep();
 996
997 /* When the scan has reached the rightmost (or leftmost) page, end it */
998 if (endscan)
999 _bt_parallel_done(scan);
1000
1001 return status;
1002}
bool ConditionVariableCancelSleep(void)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
@ BTPARALLEL_ADVANCING
Definition: nbtree.c:58
static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
Definition: nbtree.c:775
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:1027
bool oppositeDirCheck
Definition: nbtree.h:1065
bool moreRight
Definition: nbtree.h:986
bool moreLeft
Definition: nbtree.h:985

References _bt_parallel_done(), _bt_parallel_restore_arrays(), Assert(), BTPARALLEL_ADVANCING, BTPARALLEL_DONE, BTPARALLEL_IDLE, BTPARALLEL_NEED_PRIMSCAN, BTScanPosInvalidate, ConditionVariableCancelSleep(), ConditionVariableSleep(), BTScanOpaqueData::currPos, IndexScanDescData::indexRelation, InvalidBlockNumber, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, OffsetToPointer, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, P_NONE, IndexScanDescData::parallel_scan, and BTScanOpaqueData::scanBehind.

Referenced by _bt_first(), and _bt_readnextpage().
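
A hedged sketch of the seize/advance/release protocol shared with _bt_parallel_release() and _bt_parallel_done(): a worker seizes the scan, learns which page to read next, and releases the scan once that page's right sibling is known so other workers can proceed. scan and opaque are placeholders, and error handling is omitted.

    BlockNumber next_scan_page;
    BlockNumber last_curr_page;

    if (!_bt_parallel_seize(scan, &next_scan_page, &last_curr_page, false))
        return;     /* scan is done, or a new primitive scan is pending */

    /* read next_scan_page; once its page opaque data is available: */
    _bt_parallel_release(scan, opaque->btpo_next, next_scan_page);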

◆ _bt_pendingfsm_finalize()

void _bt_pendingfsm_finalize ( Relation  rel,
BTVacState vstate 
)

Definition at line 3000 of file nbtpage.c.

3001{
3002 IndexBulkDeleteResult *stats = vstate->stats;
3003 Relation heaprel = vstate->info->heaprel;
3004
3005 Assert(stats->pages_newly_deleted >= vstate->npendingpages);
3006 Assert(heaprel != NULL);
3007
3008 if (vstate->npendingpages == 0)
3009 {
3010 /* Just free memory when nothing to do */
3011 if (vstate->pendingpages)
3012 pfree(vstate->pendingpages);
3013
3014 return;
3015 }
3016
3017#ifdef DEBUG_BTREE_PENDING_FSM
3018
3019 /*
3020 * Debugging aid: Sleep for 5 seconds to greatly increase the chances of
3021 * placing pending pages in the FSM. Note that the optimization will
3022 * never be effective without some other backend concurrently consuming an
3023 * XID.
3024 */
3025 pg_usleep(5000000L);
3026#endif
3027
3028 /*
3029 * Recompute VACUUM XID boundaries.
3030 *
3031 * We don't actually care about the oldest non-removable XID. Computing
3032 * the oldest such XID has a useful side-effect that we rely on: it
3033 * forcibly updates the XID horizon state for this backend. This step is
3034 * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize
3035 * that it is now safe to recycle newly deleted pages without this step.
3036 */
 3037 (void) GetOldestNonRemovableTransactionId(heaprel);
 3038
3039 for (int i = 0; i < vstate->npendingpages; i++)
3040 {
3041 BlockNumber target = vstate->pendingpages[i].target;
3042 FullTransactionId safexid = vstate->pendingpages[i].safexid;
3043
3044 /*
3045 * Do the equivalent of checking BTPageIsRecyclable(), but without
3046 * accessing the page again a second time.
3047 *
3048 * Give up on finding the first non-recyclable page -- all later pages
3049 * must be non-recyclable too, since _bt_pendingfsm_add() adds pages
3050 * to the array in safexid order.
3051 */
3052 if (!GlobalVisCheckRemovableFullXid(heaprel, safexid))
3053 break;
3054
3055 RecordFreeIndexPage(rel, target);
3056 stats->pages_free++;
3057 }
3058
3059 pfree(vstate->pendingpages);
3060}
void RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
Definition: indexfsm.c:52
TransactionId GetOldestNonRemovableTransactionId(Relation rel)
Definition: procarray.c:1953
bool GlobalVisCheckRemovableFullXid(Relation rel, FullTransactionId fxid)
Definition: procarray.c:4246
void pg_usleep(long microsec)
Definition: signal.c:53
FullTransactionId safexid
Definition: nbtree.h:328
BlockNumber target
Definition: nbtree.h:327
IndexBulkDeleteResult * stats
Definition: nbtree.h:334
BTPendingFSM * pendingpages
Definition: nbtree.h:345
int npendingpages
Definition: nbtree.h:346
BlockNumber pages_newly_deleted
Definition: genam.h:108
BlockNumber pages_free
Definition: genam.h:110

References Assert(), GetOldestNonRemovableTransactionId(), GlobalVisCheckRemovableFullXid(), IndexVacuumInfo::heaprel, i, BTVacState::info, BTVacState::npendingpages, IndexBulkDeleteResult::pages_free, IndexBulkDeleteResult::pages_newly_deleted, BTVacState::pendingpages, pfree(), pg_usleep(), RecordFreeIndexPage(), BTPendingFSM::safexid, BTVacState::stats, and BTPendingFSM::target.

Referenced by btvacuumscan().

◆ _bt_pendingfsm_init()

void _bt_pendingfsm_init ( Relation  rel,
BTVacState vstate,
bool  cleanuponly 
)

Definition at line 2958 of file nbtpage.c.

2959{
2960 Size maxbufsize;
2961
2962 /*
2963 * Don't bother with optimization in cleanup-only case -- we don't expect
2964 * any newly deleted pages. Besides, cleanup-only calls to btvacuumscan()
2965 * can only take place because this optimization didn't work out during
2966 * the last VACUUM.
2967 */
2968 if (cleanuponly)
2969 return;
2970
2971 /*
2972 * Cap maximum size of array so that we always respect work_mem. Avoid
2973 * int overflow here.
2974 */
2975 vstate->bufsize = 256;
2976 maxbufsize = (work_mem * (Size) 1024) / sizeof(BTPendingFSM);
2977 maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM));
2978 /* BTVacState.maxbufsize has type int */
2979 maxbufsize = Min(maxbufsize, INT_MAX);
2980 /* Stay sane with small work_mem */
2981 maxbufsize = Max(maxbufsize, vstate->bufsize);
2982 vstate->maxbufsize = (int) maxbufsize;
2983
2984 /* Allocate buffer, indicate that there are currently 0 pending pages */
2985 vstate->pendingpages = palloc_array(BTPendingFSM, vstate->bufsize);
2986 vstate->npendingpages = 0;
2987}
#define MaxAllocSize
Definition: fe_memutils.h:22
int work_mem
Definition: globals.c:131
struct BTPendingFSM BTPendingFSM
int bufsize
Definition: nbtree.h:343
int maxbufsize
Definition: nbtree.h:344

References BTVacState::bufsize, Max, MaxAllocSize, BTVacState::maxbufsize, Min, BTVacState::npendingpages, palloc_array, BTVacState::pendingpages, and work_mem.

Referenced by btvacuumscan().
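
The two routines above are used as a pair by btvacuumscan(). A hedged sketch of that calling pattern (names such as vstate and cleanuponly simply follow the signatures shown above; _bt_pendingfsm_add() is a static helper in nbtpage.c):

    /* minimal sketch, modeled on btvacuumscan() */
    _bt_pendingfsm_init(rel, &vstate, cleanuponly);

    /* ... scan every page; _bt_pagedel() queues each page it deletes,
     * together with its safexid, via nbtpage.c's static _bt_pendingfsm_add() ... */

    /* place any queued pages that have since become recyclable in the FSM */
    _bt_pendingfsm_finalize(rel, &vstate);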

◆ _bt_preprocess_keys()

void _bt_preprocess_keys ( IndexScanDesc  scan)

Definition at line 203 of file nbtpreprocesskeys.c.

204{
205 BTScanOpaque so = (BTScanOpaque) scan->opaque;
206 int numberOfKeys = scan->numberOfKeys;
207 int16 *indoption = scan->indexRelation->rd_indoption;
208 int new_numberOfKeys;
209 int numberOfEqualCols;
210 ScanKey inkeys;
212 bool test_result,
213 redundant_key_kept = false;
214 AttrNumber attno;
215 ScanKey arrayKeyData;
216 int *keyDataMap = NULL;
217 int arrayidx = 0;
218
219 if (so->numberOfKeys > 0)
220 {
221 /*
222 * Only need to do preprocessing once per btrescan, at most. All
223 * calls after the first are handled as no-ops.
224 */
225 return;
226 }
227
228 /* initialize result variables */
229 so->qual_ok = true;
230 so->numberOfKeys = 0;
231
232 if (numberOfKeys < 1)
233 return; /* done if qual-less scan */
234
235 /* If any keys are SK_SEARCHARRAY type, set up array-key info */
236 arrayKeyData = _bt_preprocess_array_keys(scan, &numberOfKeys);
237 if (!so->qual_ok)
238 {
239 /* unmatchable array, so give up */
240 return;
241 }
242
243 /*
244 * Treat arrayKeyData[] (a partially preprocessed copy of scan->keyData[])
245 * as our input if _bt_preprocess_array_keys just allocated it, else just
246 * use scan->keyData[]
247 */
248 if (arrayKeyData)
249 {
250 inkeys = arrayKeyData;
251
252 /* Also maintain keyDataMap for remapping so->orderProcs[] later */
253 keyDataMap = MemoryContextAlloc(so->arrayContext,
254 numberOfKeys * sizeof(int));
255
256 /*
257 * Also enlarge output array when it might otherwise not have room for
258 * a skip array's scan key
259 */
260 if (numberOfKeys > scan->numberOfKeys)
261 so->keyData = repalloc(so->keyData,
262 numberOfKeys * sizeof(ScanKeyData));
263 }
264 else
265 inkeys = scan->keyData;
266
267 /* we check that input keys are correctly ordered */
268 if (inkeys[0].sk_attno < 1)
269 elog(ERROR, "btree index keys must be ordered by attribute");
270
271 /* We can short-circuit most of the work if there's just one key */
272 if (numberOfKeys == 1)
273 {
274 /* Apply indoption to scankey (might change sk_strategy!) */
275 if (!_bt_fix_scankey_strategy(&inkeys[0], indoption))
276 so->qual_ok = false;
277 memcpy(&so->keyData[0], &inkeys[0], sizeof(ScanKeyData));
278 so->numberOfKeys = 1;
279 /* We can mark the qual as required if it's for first index col */
280 if (inkeys[0].sk_attno == 1)
281 _bt_mark_scankey_required(&so->keyData[0]);
282 if (arrayKeyData)
283 {
284 /*
285 * Don't call _bt_preprocess_array_keys_final in this fast path
286 * (we'll miss out on the single value array transformation, but
287 * that's not nearly as important when there's only one scan key)
288 */
291 (so->arrayKeys[0].scan_key == 0 &&
292 !(so->keyData[0].sk_flags & SK_BT_SKIP) &&
293 OidIsValid(so->orderProcs[0].fn_oid)));
294 }
295
296 return;
297 }
298
299 /*
300 * Otherwise, do the full set of pushups.
301 */
302 new_numberOfKeys = 0;
303 numberOfEqualCols = 0;
304
305 /*
306 * Initialize for processing of keys for attr 1.
307 *
308 * xform[i] points to the currently best scan key of strategy type i+1; it
309 * is NULL if we haven't yet found such a key for this attr.
310 */
311 attno = 1;
312 memset(xform, 0, sizeof(xform));
313
314 /*
315 * Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to
316 * handle after-last-key processing. Actual exit from the loop is at the
317 * "break" statement below.
318 */
319 for (int i = 0;; i++)
320 {
321 ScanKey inkey = inkeys + i;
322 int j;
323
324 if (i < numberOfKeys)
325 {
326 /* Apply indoption to scankey (might change sk_strategy!) */
327 if (!_bt_fix_scankey_strategy(inkey, indoption))
328 {
329 /* NULL can't be matched, so give up */
330 so->qual_ok = false;
331 return;
332 }
333 }
334
335 /*
336 * If we are at the end of the keys for a particular attr, finish up
337 * processing and emit the cleaned-up keys.
338 */
339 if (i == numberOfKeys || inkey->sk_attno != attno)
340 {
341 int priorNumberOfEqualCols = numberOfEqualCols;
342
343 /* check input keys are correctly ordered */
344 if (i < numberOfKeys && inkey->sk_attno < attno)
345 elog(ERROR, "btree index keys must be ordered by attribute");
346
347 /*
348 * If = has been specified, all other keys can be eliminated as
349 * redundant. Note that this is no less true if the = key is
350 * SEARCHARRAY; the only real difference is that the inequality
351 * key _becomes_ redundant by making _bt_compare_scankey_args
352 * eliminate the subset of elements that won't need to be matched
353 * (with SAOP arrays and skip arrays alike).
354 *
355 * If we have a case like "key = 1 AND key > 2", we set qual_ok to
356 * false and abandon further processing. We'll do the same thing
357 * given a case like "key IN (0, 1) AND key > 2".
358 *
359 * We also have to deal with the case of "key IS NULL", which is
360 * unsatisfiable in combination with any other index condition. By
361 * the time we get here, that's been classified as an equality
362 * check, and we've rejected any combination of it with a regular
363 * equality condition; but not with other types of conditions.
364 */
365 if (xform[BTEqualStrategyNumber - 1].inkey)
366 {
367 ScanKey eq = xform[BTEqualStrategyNumber - 1].inkey;
368 BTArrayKeyInfo *array = NULL;
369 FmgrInfo *orderproc = NULL;
370
371 if (arrayKeyData && (eq->sk_flags & SK_SEARCHARRAY))
372 {
373 int eq_in_ikey,
374 eq_arrayidx;
375
376 eq_in_ikey = xform[BTEqualStrategyNumber - 1].inkeyi;
377 eq_arrayidx = xform[BTEqualStrategyNumber - 1].arrayidx;
378 array = &so->arrayKeys[eq_arrayidx - 1];
379 orderproc = so->orderProcs + eq_in_ikey;
380
381 Assert(array->scan_key == eq_in_ikey);
382 Assert(OidIsValid(orderproc->fn_oid));
383 }
384
385 for (j = BTMaxStrategyNumber; --j >= 0;)
386 {
387 ScanKey chk = xform[j].inkey;
388
389 if (!chk || j == (BTEqualStrategyNumber - 1))
390 continue;
391
392 if (eq->sk_flags & SK_SEARCHNULL)
393 {
394 /* IS NULL is contradictory to anything else */
395 so->qual_ok = false;
396 return;
397 }
398
399 if (_bt_compare_scankey_args(scan, chk, eq, chk,
400 array, orderproc,
401 &test_result))
402 {
403 if (!test_result)
404 {
405 /* keys proven mutually contradictory */
406 so->qual_ok = false;
407 return;
408 }
409 /* else discard the redundant non-equality key */
410 xform[j].inkey = NULL;
411 xform[j].inkeyi = -1;
412 }
413 else
414 redundant_key_kept = true;
415 }
416 /* track number of attrs for which we have "=" keys */
417 numberOfEqualCols++;
418 }
419
420 /* try to keep only one of <, <= */
421 if (xform[BTLessStrategyNumber - 1].inkey &&
422 xform[BTLessEqualStrategyNumber - 1].inkey)
423 {
424 ScanKey lt = xform[BTLessStrategyNumber - 1].inkey;
425 ScanKey le = xform[BTLessEqualStrategyNumber - 1].inkey;
426
427 if (_bt_compare_scankey_args(scan, le, lt, le, NULL, NULL,
428 &test_result))
429 {
430 if (test_result)
431 xform[BTLessEqualStrategyNumber - 1].inkey = NULL;
432 else
433 xform[BTLessStrategyNumber - 1].inkey = NULL;
434 }
435 else
436 redundant_key_kept = true;
437 }
438
439 /* try to keep only one of >, >= */
440 if (xform[BTGreaterStrategyNumber - 1].inkey &&
441 xform[BTGreaterEqualStrategyNumber - 1].inkey)
442 {
443 ScanKey gt = xform[BTGreaterStrategyNumber - 1].inkey;
444 ScanKey ge = xform[BTGreaterEqualStrategyNumber - 1].inkey;
445
446 if (_bt_compare_scankey_args(scan, ge, gt, ge, NULL, NULL,
447 &test_result))
448 {
449 if (test_result)
450 xform[BTGreaterEqualStrategyNumber - 1].inkey = NULL;
451 else
452 xform[BTGreaterStrategyNumber - 1].inkey = NULL;
453 }
454 else
455 redundant_key_kept = true;
456 }
457
458 /*
459 * Emit the cleaned-up keys into the so->keyData[] array, and then
460 * mark them if they are required. They are required (possibly
461 * only in one direction) if all attrs before this one had "=".
462 *
463 * In practice we'll rarely output non-required scan keys here;
464 * typically, _bt_preprocess_array_keys has already added "=" keys
465 * sufficient to form an unbroken series of "=" constraints on all
466 * attrs prior to the attr from the final scan->keyData[] key.
467 */
468 for (j = BTMaxStrategyNumber; --j >= 0;)
469 {
470 if (xform[j].inkey)
471 {
472 ScanKey outkey = &so->keyData[new_numberOfKeys++];
473
474 memcpy(outkey, xform[j].inkey, sizeof(ScanKeyData));
475 if (arrayKeyData)
476 keyDataMap[new_numberOfKeys - 1] = xform[j].inkeyi;
477 if (priorNumberOfEqualCols == attno - 1)
478 _bt_mark_scankey_required(outkey);
479 }
480 }
481
482 /*
483 * Exit loop here if done.
484 */
485 if (i == numberOfKeys)
486 break;
487
488 /* Re-initialize for new attno */
489 attno = inkey->sk_attno;
490 memset(xform, 0, sizeof(xform));
491 }
492
493 /* check strategy this key's operator corresponds to */
494 j = inkey->sk_strategy - 1;
495
496 if (inkey->sk_strategy == BTEqualStrategyNumber &&
497 (inkey->sk_flags & SK_SEARCHARRAY))
498 {
499 /* must track how input scan keys map to arrays */
500 Assert(arrayKeyData);
501 arrayidx++;
502 }
503
504 /*
505 * have we seen a scan key for this same attribute and using this same
506 * operator strategy before now?
507 */
508 if (xform[j].inkey == NULL)
509 {
510 /* nope, so this scan key wins by default (at least for now) */
511 xform[j].inkey = inkey;
512 xform[j].inkeyi = i;
513 xform[j].arrayidx = arrayidx;
514 }
515 else
516 {
517 FmgrInfo *orderproc = NULL;
518 BTArrayKeyInfo *array = NULL;
519
520 /*
521 * Seen one of these before, so keep only the more restrictive key
522 * if possible
523 */
524 if (j == (BTEqualStrategyNumber - 1) && arrayKeyData)
525 {
526 /*
527 * Have to set up array keys
528 */
529 if (inkey->sk_flags & SK_SEARCHARRAY)
530 {
531 array = &so->arrayKeys[arrayidx - 1];
532 orderproc = so->orderProcs + i;
533
534 Assert(array->scan_key == i);
535 Assert(OidIsValid(orderproc->fn_oid));
536 Assert(!(inkey->sk_flags & SK_BT_SKIP));
537 }
538 else if (xform[j].inkey->sk_flags & SK_SEARCHARRAY)
539 {
540 array = &so->arrayKeys[xform[j].arrayidx - 1];
541 orderproc = so->orderProcs + xform[j].inkeyi;
542
543 Assert(array->scan_key == xform[j].inkeyi);
544 Assert(OidIsValid(orderproc->fn_oid));
545 Assert(!(xform[j].inkey->sk_flags & SK_BT_SKIP));
546 }
547
548 /*
549 * Both scan keys might have arrays, in which case we'll
550 * arbitrarily pass only one of the arrays. That won't
551 * matter, since _bt_compare_scankey_args is aware that two
552 * SEARCHARRAY scan keys mean that _bt_preprocess_array_keys
553 * failed to eliminate redundant arrays through array merging.
554 * _bt_compare_scankey_args just returns false when it sees
555 * this; it won't even try to examine either array.
556 */
557 }
558
559 if (_bt_compare_scankey_args(scan, inkey, inkey, xform[j].inkey,
560 array, orderproc, &test_result))
561 {
562 /* Have all we need to determine redundancy */
563 if (test_result)
564 {
565 /*
566 * New key is more restrictive, and so replaces old key...
567 */
568 if (j != (BTEqualStrategyNumber - 1) ||
569 !(xform[j].inkey->sk_flags & SK_SEARCHARRAY))
570 {
571 xform[j].inkey = inkey;
572 xform[j].inkeyi = i;
573 xform[j].arrayidx = arrayidx;
574 }
575 else
576 {
577 /*
578 * ...unless we have to keep the old key because it's
579 * an array that rendered the new key redundant. We
580 * need to make sure that we don't throw away an array
581 * scan key. _bt_preprocess_array_keys_final expects
582 * us to keep all of the arrays that weren't already
583 * eliminated by _bt_preprocess_array_keys earlier on.
584 */
585 Assert(!(inkey->sk_flags & SK_SEARCHARRAY));
586 }
587 }
588 else if (j == (BTEqualStrategyNumber - 1))
589 {
590 /* key == a && key == b, but a != b */
591 so->qual_ok = false;
592 return;
593 }
594 /* else old key is more restrictive, keep it */
595 }
596 else
597 {
598 /*
599 * We can't determine which key is more restrictive. Push
600 * xform[j] directly to the output array, then set xform[j] to
601 * the new scan key.
602 *
603 * Note: We do things this way around so that our arrays are
604 * always in the same order as their corresponding scan keys.
605 * _bt_preprocess_array_keys_final expects this.
606 */
607 ScanKey outkey = &so->keyData[new_numberOfKeys++];
608
609 memcpy(outkey, xform[j].inkey, sizeof(ScanKeyData));
610 if (arrayKeyData)
611 keyDataMap[new_numberOfKeys - 1] = xform[j].inkeyi;
612 if (numberOfEqualCols == attno - 1)
613 _bt_mark_scankey_required(outkey);
614 xform[j].inkey = inkey;
615 xform[j].inkeyi = i;
616 xform[j].arrayidx = arrayidx;
617 redundant_key_kept = true;
618 }
619 }
620 }
621
622 so->numberOfKeys = new_numberOfKeys;
623
624 /*
625 * Now that we've built a temporary mapping from so->keyData[] (output
626 * scan keys) to arrayKeyData[] (our input scan keys), fix array->scan_key
627 * references. Also consolidate the so->orderProcs[] array such that it
628 * can be subscripted using so->keyData[]-wise offsets.
629 */
630 if (arrayKeyData)
631 _bt_preprocess_array_keys_final(scan, keyDataMap);
632
633 /*
634 * If there are remaining redundant inequality keys, we must make sure
635 * that each index attribute has no more than one required >/>= key, and
636 * no more than one required </<= key. Attributes that have one or more
637 * required = keys now must keep only one required key (the first = key).
638 */
639 if (unlikely(redundant_key_kept) && so->qual_ok)
640 _bt_unmark_keys(scan, keyDataMap);
641
642 /* Could pfree arrayKeyData/keyDataMap now, but not worth the cycles */
643}
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1610
static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
static void _bt_unmark_keys(IndexScanDesc scan, int *keyDataMap)
static void _bt_mark_scankey_required(ScanKey skey)
static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys)
static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap)
static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, ScanKey leftarg, ScanKey rightarg, BTArrayKeyInfo *array, FmgrInfo *orderproc, bool *result)
#define SK_SEARCHNULL
Definition: skey.h:121
#define BTMaxStrategyNumber
Definition: stratnum.h:35
FmgrInfo * orderProcs
Definition: nbtree.h:1067
MemoryContext arrayContext
Definition: nbtree.h:1068
Oid fn_oid
Definition: fmgr.h:59
struct ScanKeyData * keyData
Definition: relscan.h:143

References _bt_compare_scankey_args(), _bt_fix_scankey_strategy(), _bt_mark_scankey_required(), _bt_preprocess_array_keys(), _bt_preprocess_array_keys_final(), _bt_unmark_keys(), BTScanOpaqueData::arrayContext, BTScanOpaqueData::arrayKeys, Assert(), BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTMaxStrategyNumber, elog, ERROR, FmgrInfo::fn_oid, i, if(), IndexScanDescData::indexRelation, j, BTScanOpaqueData::keyData, IndexScanDescData::keyData, MemoryContextAlloc(), BTScanOpaqueData::numberOfKeys, IndexScanDescData::numberOfKeys, OidIsValid, IndexScanDescData::opaque, BTScanOpaqueData::orderProcs, BTScanOpaqueData::qual_ok, RelationData::rd_indoption, repalloc(), BTArrayKeyInfo::scan_key, ScanKeyData::sk_attno, SK_BT_SKIP, ScanKeyData::sk_flags, SK_SEARCHARRAY, SK_SEARCHNULL, ScanKeyData::sk_strategy, and unlikely.

Referenced by _bt_first().
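
A minimal sketch of the calling convention, loosely modeled on _bt_first() (simplified, not the exact code): preprocessing is a no-op after the first call per btrescan, and callers must check so->qual_ok before descending the tree.

    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* build so->keyData[] from scan->keyData[], eliminating redundant keys */
    _bt_preprocess_keys(scan);

    if (!so->qual_ok)
    {
        /* the preprocessed qual is unsatisfiable; the scan returns nothing */
        return false;
    }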

◆ _bt_readpage()

bool _bt_readpage ( IndexScanDesc  scan,
ScanDirection  dir,
OffsetNumber  offnum,
bool  firstpage 
)

Definition at line 134 of file nbtreadpage.c.

136{
137 Relation rel = scan->indexRelation;
138 BTScanOpaque so = (BTScanOpaque) scan->opaque;
139 Page page;
140 BTPageOpaque opaque;
141 OffsetNumber minoff;
142 OffsetNumber maxoff;
143 BTReadPageState pstate;
144 bool arrayKeys,
145 ignore_killed_tuples = scan->ignore_killed_tuples;
146 int itemIndex,
147 indnatts;
148
149 /* save the page/buffer block number, along with its sibling links */
150 page = BufferGetPage(so->currPos.buf);
151 opaque = BTPageGetOpaque(page);
152 so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
153 so->currPos.prevPage = opaque->btpo_prev;
154 so->currPos.nextPage = opaque->btpo_next;
155 /* delay setting so->currPos.lsn until _bt_drop_lock_and_maybe_pin */
156 pstate.dir = so->currPos.dir = dir;
157 so->currPos.nextTupleOffset = 0;
158
159 /* either moreRight or moreLeft should be set now (may be unset later) */
160 Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :
161 so->currPos.moreLeft);
162 Assert(!P_IGNORE(opaque));
163 Assert(BTScanPosIsPinned(so->currPos));
164 Assert(!so->needPrimScan);
165
166 /* initialize local variables */
167 indnatts = IndexRelationGetNumberOfAttributes(rel);
168 arrayKeys = so->numArrayKeys != 0;
169 minoff = P_FIRSTDATAKEY(opaque);
170 maxoff = PageGetMaxOffsetNumber(page);
171
172 /* initialize page-level state that we'll pass to _bt_checkkeys */
173 pstate.minoff = minoff;
174 pstate.maxoff = maxoff;
175 pstate.finaltup = NULL;
176 pstate.page = page;
177 pstate.firstpage = firstpage;
178 pstate.forcenonrequired = false;
179 pstate.startikey = 0;
180 pstate.offnum = InvalidOffsetNumber;
181 pstate.skip = InvalidOffsetNumber;
182 pstate.continuescan = true; /* default assumption */
183 pstate.rechecks = 0;
184 pstate.targetdistance = 0;
185 pstate.nskipadvances = 0;
186
187 if (scan->parallel_scan)
188 {
189 /* allow next/prev page to be read by other worker without delay */
190 if (ScanDirectionIsForward(dir))
191 _bt_parallel_release(scan, so->currPos.nextPage,
192 so->currPos.currPage);
193 else
194 _bt_parallel_release(scan, so->currPos.prevPage,
195 so->currPos.currPage);
196 }
197
198 PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot);
199
200 if (ScanDirectionIsForward(dir))
201 {
202 /* SK_SEARCHARRAY forward scans must provide high key up front */
203 if (arrayKeys)
204 {
205 if (!P_RIGHTMOST(opaque))
206 {
207 ItemId iid = PageGetItemId(page, P_HIKEY);
208
209 pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
210
211 if (unlikely(so->scanBehind) &&
212 !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
213 {
214 /* Schedule another primitive index scan after all */
215 so->currPos.moreRight = false;
216 so->needPrimScan = true;
217 if (scan->parallel_scan)
218 _bt_parallel_primscan_schedule(scan,
219 so->currPos.currPage);
220 return false;
221 }
222 }
223
224 so->scanBehind = so->oppositeDirCheck = false; /* reset */
225 }
226
227 /*
228 * Consider pstate.startikey optimization once the ongoing primitive
229 * index scan has already read at least one page
230 */
231 if (!pstate.firstpage && minoff < maxoff)
232 _bt_set_startikey(scan, &pstate);
233
234 /* load items[] in ascending order */
235 itemIndex = 0;
236
237 offnum = Max(offnum, minoff);
238
239 while (offnum <= maxoff)
240 {
241 ItemId iid = PageGetItemId(page, offnum);
242 IndexTuple itup;
243 bool passes_quals;
244
245 /*
246 * If the scan specifies not to return killed tuples, then we
247 * treat a killed tuple as not passing the qual
248 */
249 if (ignore_killed_tuples && ItemIdIsDead(iid))
250 {
251 offnum = OffsetNumberNext(offnum);
252 continue;
253 }
254
255 itup = (IndexTuple) PageGetItem(page, iid);
256 Assert(!BTreeTupleIsPivot(itup));
257
258 pstate.offnum = offnum;
259 passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
260 itup, indnatts);
261
262 /*
263 * Check if we need to skip ahead to a later tuple (only possible
264 * when the scan uses array keys)
265 */
266 if (arrayKeys && OffsetNumberIsValid(pstate.skip))
267 {
268 Assert(!passes_quals && pstate.continuescan);
269 Assert(offnum < pstate.skip);
270 Assert(!pstate.forcenonrequired);
271
272 offnum = pstate.skip;
273 pstate.skip = InvalidOffsetNumber;
274 continue;
275 }
276
277 if (passes_quals)
278 {
279 /* tuple passes all scan key conditions */
280 if (!BTreeTupleIsPosting(itup))
281 {
282 /* Remember it */
283 _bt_saveitem(so, itemIndex, offnum, itup);
284 itemIndex++;
285 }
286 else
287 {
288 int tupleOffset;
289
290 /* Set up posting list state (and remember first TID) */
291 tupleOffset =
292 _bt_setuppostingitems(so, itemIndex, offnum,
293 BTreeTupleGetPostingN(itup, 0),
294 itup);
295 itemIndex++;
296
297 /* Remember all later TIDs (must be at least one) */
298 for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
299 {
300 _bt_savepostingitem(so, itemIndex, offnum,
301 BTreeTupleGetPostingN(itup, i),
302 tupleOffset);
303 itemIndex++;
304 }
305 }
306 }
307 /* When !continuescan, there can't be any more matches, so stop */
308 if (!pstate.continuescan)
309 break;
310
311 offnum = OffsetNumberNext(offnum);
312 }
313
314 /*
315 * We don't need to visit page to the right when the high key
316 * indicates that no more matches will be found there.
317 *
318 * Checking the high key like this works out more often than you might
319 * think. Leaf page splits pick a split point between the two most
320 * dissimilar tuples (this is weighed against the need to evenly share
321 * free space). Leaf pages with high key attribute values that can
322 * only appear on non-pivot tuples on the right sibling page are
323 * common.
324 */
325 if (pstate.continuescan && !so->scanBehind && !P_RIGHTMOST(opaque))
326 {
327 ItemId iid = PageGetItemId(page, P_HIKEY);
328 IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
329 int truncatt;
330
331 /* Reset arrays, per _bt_set_startikey contract */
332 if (pstate.forcenonrequired)
333 _bt_start_array_keys(scan, dir);
334 pstate.forcenonrequired = false;
335 pstate.startikey = 0; /* _bt_set_startikey ignores P_HIKEY */
336
337 truncatt = BTreeTupleGetNAtts(itup, rel);
338 _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
339 }
340
341 if (!pstate.continuescan)
342 so->currPos.moreRight = false;
343
344 Assert(itemIndex <= MaxTIDsPerBTreePage);
345 so->currPos.firstItem = 0;
346 so->currPos.lastItem = itemIndex - 1;
347 so->currPos.itemIndex = 0;
348 }
349 else
350 {
351 /* SK_SEARCHARRAY backward scans must provide final tuple up front */
352 if (arrayKeys)
353 {
354 if (minoff <= maxoff && !P_LEFTMOST(opaque))
355 {
356 ItemId iid = PageGetItemId(page, minoff);
357
358 pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
359
360 if (unlikely(so->scanBehind) &&
361 !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
362 {
363 /* Schedule another primitive index scan after all */
364 so->currPos.moreLeft = false;
365 so->needPrimScan = true;
366 if (scan->parallel_scan)
367 _bt_parallel_primscan_schedule(scan,
368 so->currPos.currPage);
369 return false;
370 }
371 }
372
373 so->scanBehind = so->oppositeDirCheck = false; /* reset */
374 }
375
376 /*
377 * Consider pstate.startikey optimization once the ongoing primitive
378 * index scan has already read at least one page
379 */
380 if (!pstate.firstpage && minoff < maxoff)
381 _bt_set_startikey(scan, &pstate);
382
383 /* load items[] in descending order */
384 itemIndex = MaxTIDsPerBTreePage;
385
386 offnum = Min(offnum, maxoff);
387
388 while (offnum >= minoff)
389 {
390 ItemId iid = PageGetItemId(page, offnum);
391 IndexTuple itup;
392 bool tuple_alive;
393 bool passes_quals;
394
395 /*
396 * If the scan specifies not to return killed tuples, then we
397 * treat a killed tuple as not passing the qual. Most of the
398 * time, it's a win to not bother examining the tuple's index
399 * keys, but just skip to the next tuple (previous, actually,
400 * since we're scanning backwards). However, if this is the first
401 * tuple on the page, we do check the index keys, to prevent
402 * uselessly advancing to the page to the left. This is similar
403 * to the high key optimization used by forward scans.
404 */
405 if (ignore_killed_tuples && ItemIdIsDead(iid))
406 {
407 if (offnum > minoff)
408 {
409 offnum = OffsetNumberPrev(offnum);
410 continue;
411 }
412
413 tuple_alive = false;
414 }
415 else
416 tuple_alive = true;
417
418 itup = (IndexTuple) PageGetItem(page, iid);
419 Assert(!BTreeTupleIsPivot(itup));
420
421 pstate.offnum = offnum;
422 if (arrayKeys && offnum == minoff && pstate.forcenonrequired)
423 {
424 /* Reset arrays, per _bt_set_startikey contract */
425 pstate.forcenonrequired = false;
426 pstate.startikey = 0;
427 _bt_start_array_keys(scan, dir);
428 }
429 passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
430 itup, indnatts);
431
432 if (arrayKeys && so->scanBehind)
433 {
434 /*
435 * Done scanning this page, but not done with the current
436 * primscan.
437 *
438 * Note: Forward scans don't check this explicitly, since they
439 * prefer to reuse pstate.skip for this instead.
440 */
441 Assert(!passes_quals && pstate.continuescan);
442 Assert(!pstate.forcenonrequired);
443
444 break;
445 }
446
447 /*
448 * Check if we need to skip ahead to a later tuple (only possible
449 * when the scan uses array keys)
450 */
451 if (arrayKeys && OffsetNumberIsValid(pstate.skip))
452 {
453 Assert(!passes_quals && pstate.continuescan);
454 Assert(offnum > pstate.skip);
455 Assert(!pstate.forcenonrequired);
456
457 offnum = pstate.skip;
458 pstate.skip = InvalidOffsetNumber;
459 continue;
460 }
461
462 if (passes_quals && tuple_alive)
463 {
464 /* tuple passes all scan key conditions */
465 if (!BTreeTupleIsPosting(itup))
466 {
467 /* Remember it */
468 itemIndex--;
469 _bt_saveitem(so, itemIndex, offnum, itup);
470 }
471 else
472 {
473 int nitems = BTreeTupleGetNPosting(itup);
474 int tupleOffset;
475
476 /* Set up posting list state (and remember last TID) */
477 itemIndex--;
478 tupleOffset =
479 _bt_setuppostingitems(so, itemIndex, offnum,
480 BTreeTupleGetPostingN(itup, nitems - 1),
481 itup);
482
483 /* Remember all prior TIDs (must be at least one) */
484 for (int i = nitems - 2; i >= 0; i--)
485 {
486 itemIndex--;
487 _bt_savepostingitem(so, itemIndex, offnum,
488 BTreeTupleGetPostingN(itup, i),
489 tupleOffset);
490 }
491 }
492 }
493 /* When !continuescan, there can't be any more matches, so stop */
494 if (!pstate.continuescan)
495 break;
496
497 offnum = OffsetNumberPrev(offnum);
498 }
499
500 /*
501 * We don't need to visit page to the left when no more matches will
502 * be found there
503 */
504 if (!pstate.continuescan)
505 so->currPos.moreLeft = false;
506
507 Assert(itemIndex >= 0);
508 so->currPos.firstItem = itemIndex;
509 so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
510 so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
511 }
512
513 /*
514 * If _bt_set_startikey told us to temporarily treat the scan's keys as
515 * nonrequired (possible only during scans with array keys), there must be
516 * no lasting consequences for the scan's array keys. The scan's arrays
517 * should now have exactly the same elements as they would have had if the
518 * nonrequired behavior had never been used. (In general, a scan's arrays
519 * are expected to track its progress through the index's key space.)
520 *
521 * We are required (by _bt_set_startikey) to call _bt_checkkeys against
522 * pstate.finaltup with pstate.forcenonrequired=false to allow the scan's
523 * arrays to recover. Assert that that step hasn't been missed.
524 */
525 Assert(!pstate.forcenonrequired);
526
527 return (so->currPos.firstItem <= so->currPos.lastItem);
528}
#define nitems(x)
Definition: indent.h:31
static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
Definition: nbtreadpage.c:1032
static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, const ItemPointerData *heapTid, IndexTuple itup)
Definition: nbtreadpage.c:1062
static bool _bt_scanbehind_checkkeys(IndexScanDesc scan, ScanDirection dir, IndexTuple finaltup)
Definition: nbtreadpage.c:952
static void _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset)
Definition: nbtreadpage.c:1100
static void _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate)
Definition: nbtreadpage.c:593
static bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, IndexTuple tuple, int tupnatts)
Definition: nbtreadpage.c:1149
void _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
Definition: nbtree.c:1100
void _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
Definition: nbtree.c:1023
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2597
int nextTupleOffset
Definition: nbtree.h:979
BlockNumber prevPage
Definition: nbtree.h:968
BlockNumber nextPage
Definition: nbtree.h:969
ScanDirection dir
Definition: nbtree.h:973
bool ignore_killed_tuples
Definition: relscan.h:150

References _bt_checkkeys(), _bt_parallel_primscan_schedule(), _bt_parallel_release(), _bt_saveitem(), _bt_savepostingitem(), _bt_scanbehind_checkkeys(), _bt_set_startikey(), _bt_setuppostingitems(), _bt_start_array_keys(), Assert(), BTPageGetOpaque, BTreeTupleGetNAtts, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTScanPosIsPinned, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetPage(), BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanPosData::dir, BTScanPosData::firstItem, i, IndexScanDescData::ignore_killed_tuples, IndexScanDescData::indexRelation, IndexRelationGetNumberOfAttributes, InvalidOffsetNumber, ItemIdIsDead, BTScanPosData::itemIndex, BTScanPosData::lastItem, Max, MaxTIDsPerBTreePage, Min, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::needPrimScan, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, nitems, BTScanOpaqueData::numArrayKeys, OffsetNumberIsValid, OffsetNumberNext, OffsetNumberPrev, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_LEFTMOST, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), IndexScanDescData::parallel_scan, PredicateLockPage(), BTScanPosData::prevPage, BTScanOpaqueData::scanBehind, ScanDirectionIsForward, unlikely, and IndexScanDescData::xs_snapshot.

Referenced by _bt_readfirstpage(), and _bt_readnextpage().
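
Callers hold a pin and read lock on so->currPos.buf, then use the return value to decide whether to continue with a sibling page. A rough sketch of that pattern (not the exact _bt_readfirstpage() code):

    /* so->currPos.buf is pinned and read-locked by the caller */
    if (_bt_readpage(scan, dir, offnum, true))
    {
        /* matches were saved into so->currPos.items[]; drop the lock
         * (and possibly the pin), then return tuples to the executor */
    }
    else
    {
        /* nothing matched here; release the page and continue with
         * so->currPos.nextPage or so->currPos.prevPage as appropriate */
    }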

◆ _bt_relandgetbuf()

Buffer _bt_relandgetbuf ( Relation  rel,
Buffer  obuf,
BlockNumber  blkno,
int  access 
)

Definition at line 1004 of file nbtpage.c.

1005{
1006 Buffer buf;
1007
1008 Assert(BlockNumberIsValid(blkno));
1009 if (BufferIsValid(obuf))
1010 _bt_unlockbuf(rel, obuf);
1011 buf = ReleaseAndReadBuffer(obuf, rel, blkno);
1012 _bt_lockbuf(rel, buf, access);
1013
1014 _bt_checkpage(rel, buf);
1015 return buf;
1016}
Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum)
Definition: bufmgr.c:3008

References _bt_checkpage(), _bt_lockbuf(), _bt_unlockbuf(), Assert(), BlockNumberIsValid(), buf, BufferIsValid(), and ReleaseAndReadBuffer().

Referenced by _bt_check_unique(), _bt_get_endpoint(), _bt_getroot(), _bt_gettrueroot(), _bt_lock_and_validate_left(), _bt_moveright(), _bt_search(), and _bt_stepright().

◆ _bt_relbuf()

◆ _bt_search()

BTStack _bt_search ( Relation  rel,
Relation  heaprel,
BTScanInsert  key,
Buffer bufP,
int  access 
)

Definition at line 97 of file nbtsearch.c.

99{
100 BTStack stack_in = NULL;
101 int page_access = BT_READ;
102
103 /* heaprel must be set whenever _bt_allocbuf is reachable */
104 Assert(access == BT_READ || access == BT_WRITE);
105 Assert(access == BT_READ || heaprel != NULL);
106
107 /* Get the root page to start with */
108 *bufP = _bt_getroot(rel, heaprel, access);
109
110 /* If index is empty and access = BT_READ, no root page is created. */
111 if (!BufferIsValid(*bufP))
112 return (BTStack) NULL;
113
114 /* Loop iterates once per level descended in the tree */
115 for (;;)
116 {
117 Page page;
118 BTPageOpaque opaque;
119 OffsetNumber offnum;
120 ItemId itemid;
121 IndexTuple itup;
122 BlockNumber child;
123 BTStack new_stack;
124
125 /*
126 * Race -- the page we just grabbed may have split since we read its
127 * downlink in its parent page (or the metapage). If it has, we may
128 * need to move right to its new sibling. Do that.
129 *
130 * In write-mode, allow _bt_moveright to finish any incomplete splits
131 * along the way. Strictly speaking, we'd only need to finish an
132 * incomplete split on the leaf page we're about to insert to, not on
133 * any of the upper levels (internal pages with incomplete splits are
134 * also taken care of in _bt_getstackbuf). But this is a good
135 * opportunity to finish splits of internal pages too.
136 */
137 *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
138 stack_in, page_access);
139
140 /* if this is a leaf page, we're done */
141 page = BufferGetPage(*bufP);
142 opaque = BTPageGetOpaque(page);
143 if (P_ISLEAF(opaque))
144 break;
145
146 /*
147 * Find the appropriate pivot tuple on this page. Its downlink points
148 * to the child page that we're about to descend to.
149 */
150 offnum = _bt_binsrch(rel, key, *bufP);
151 itemid = PageGetItemId(page, offnum);
152 itup = (IndexTuple) PageGetItem(page, itemid);
153 Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
154 child = BTreeTupleGetDownLink(itup);
155
156 /*
157 * We need to save the location of the pivot tuple we chose in a new
158 * stack entry for this page/level. If caller ends up splitting a
159 * page one level down, it usually ends up inserting a new pivot
160 * tuple/downlink immediately after the location recorded here.
161 */
162 new_stack = (BTStack) palloc_object(BTStackData);
163 new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
164 new_stack->bts_offset = offnum;
165 new_stack->bts_parent = stack_in;
166
167 /*
168 * Page level 1 is lowest non-leaf page level prior to leaves. So, if
169 * we're on the level 1 and asked to lock leaf page in write mode,
170 * then lock next page in write mode, because it must be a leaf.
171 */
172 if (opaque->btpo_level == 1 && access == BT_WRITE)
173 page_access = BT_WRITE;
174
175 /* drop the read lock on the page, then acquire one on its child */
176 *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
177
178 /* okay, all set to move down a level */
179 stack_in = new_stack;
180 }
181
182 /*
183 * If we're asked to lock leaf in write mode, but didn't manage to, then
184 * relock. This should only happen when the root page is a leaf page (and
185 * the only page in the index other than the metapage).
186 */
187 if (access == BT_WRITE && page_access == BT_READ)
188 {
189 /* trade in our read lock for a write lock */
190 _bt_unlockbuf(rel, *bufP);
191 _bt_lockbuf(rel, *bufP, BT_WRITE);
192
193 /*
194 * Race -- the leaf page may have split after we dropped the read lock
195 * but before we acquired a write lock. If it has, we may need to
196 * move right to its new sibling. Do that.
197 */
198 *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
199 }
200
201 return stack_in;
202}
BTStackData * BTStack
Definition: nbtree.h:750
static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access)
Definition: nbtsearch.c:236

References _bt_binsrch(), _bt_getroot(), _bt_lockbuf(), _bt_moveright(), _bt_relandgetbuf(), _bt_unlockbuf(), Assert(), BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTreeTupleGetDownLink(), BTreeTupleIsPivot(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), sort-test::key, P_ISLEAF, PageGetItem(), PageGetItemId(), and palloc_object.

Referenced by _bt_first(), _bt_pagedel(), _bt_search_insert(), and bt_rootdescend().
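
A hedged sketch of a read-only descent to a leaf page. _bt_mkscankey(), _bt_binsrch_insert(), _bt_freestack() and _bt_relbuf() are other nbtree routines assumed here; passing heaprel = NULL is acceptable for BT_READ, per the Assert above.

    BTScanInsert itup_key = _bt_mkscankey(rel, itup);
    BTStack      stack;
    Buffer       buf;

    stack = _bt_search(rel, NULL, itup_key, &buf, BT_READ);
    if (BufferIsValid(buf))
    {
        /* buf is now a read-locked leaf page covering itup_key's range;
         * e.g. locate a position within it using _bt_binsrch_insert() */
        _bt_relbuf(rel, buf);
    }
    _bt_freestack(stack);
    pfree(itup_key);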

◆ _bt_set_cleanup_info()

void _bt_set_cleanup_info ( Relation  rel,
BlockNumber  num_delpages 
)

Definition at line 233 of file nbtpage.c.

234{
235 Buffer metabuf;
236 Page metapg;
237 BTMetaPageData *metad;
238
239 /*
240 * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
241 * field started out as a TransactionId field called btm_oldest_btpo_xact.
242 * Both "versions" are just uint32 fields. It was convenient to repurpose
243 * the field when we began to use 64-bit XIDs in deleted pages.
244 *
245 * It's possible that a pg_upgrade'd database will contain an XID value in
246 * what is now recognized as the metapage's btm_last_cleanup_num_delpages
247 * field. _bt_vacuum_needs_cleanup() may even believe that this value
248 * indicates that there are lots of pages that it needs to recycle, when
249 * in reality there are only one or two. The worst that can happen is
250 * that there will be a call to btvacuumscan a little earlier, which will
251 * set btm_last_cleanup_num_delpages to a sane value when we're called.
252 *
253 * Note also that the metapage's btm_last_cleanup_num_heap_tuples field is
254 * no longer used as of PostgreSQL 14. We set it to -1.0 on rewrite, just
255 * to be consistent.
256 */
257 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
258 metapg = BufferGetPage(metabuf);
259 metad = BTPageGetMeta(metapg);
260
261 /* Don't miss chance to upgrade index/metapage when BTREE_MIN_VERSION */
262 if (metad->btm_version >= BTREE_NOVAC_VERSION &&
263 metad->btm_last_cleanup_num_delpages == num_delpages)
264 {
265 /* Usually means index continues to have num_delpages of 0 */
266 _bt_relbuf(rel, metabuf);
267 return;
268 }
269
270 /* trade in our read lock for a write lock */
271 _bt_unlockbuf(rel, metabuf);
272 _bt_lockbuf(rel, metabuf, BT_WRITE);
273
274 START_CRIT_SECTION();
275
276 /* upgrade meta-page if needed */
277 if (metad->btm_version < BTREE_NOVAC_VERSION)
278 _bt_upgrademetapage(metapg);
279
280 /* update cleanup-related information */
281 metad->btm_last_cleanup_num_delpages = num_delpages;
282 metad->btm_last_cleanup_num_heap_tuples = -1.0;
283 MarkBufferDirty(metabuf);
284
285 /* write wal record if needed */
286 if (RelationNeedsWAL(rel))
287 {
288 xl_btree_metadata md;
289 XLogRecPtr recptr;
290
291 XLogBeginInsert();
292 XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
293
294 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
295 md.version = metad->btm_version;
296 md.root = metad->btm_root;
297 md.level = metad->btm_level;
298 md.fastroot = metad->btm_fastroot;
299 md.fastlevel = metad->btm_fastlevel;
300 md.last_cleanup_num_delpages = num_delpages;
301 md.allequalimage = metad->btm_allequalimage;
302
303 XLogRegisterBufData(0, &md, sizeof(xl_btree_metadata));
304
305 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
306
307 PageSetLSN(metapg, recptr);
308 }
309
310 END_CRIT_SECTION();
311
312 _bt_relbuf(rel, metabuf);
313}
#define XLOG_BTREE_META_CLEANUP
Definition: nbtxlog.h:41

References _bt_getbuf(), _bt_lockbuf(), _bt_relbuf(), _bt_unlockbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert(), BT_READ, BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BufferGetPage(), END_CRIT_SECTION, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, xl_btree_metadata::last_cleanup_num_delpages, xl_btree_metadata::level, MarkBufferDirty(), PageSetLSN(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationNeedsWAL, xl_btree_metadata::root, START_CRIT_SECTION, xl_btree_metadata::version, XLOG_BTREE_META_CLEANUP, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), and XLogRegisterBuffer().

Referenced by btvacuumcleanup().
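
Sketch of the expected caller, btvacuumcleanup() (simplified): the count passed here is the number of deleted-but-not-yet-recycled pages left behind, which _bt_vacuum_needs_cleanup() compares against the index size during the next VACUUM.

    /* deleted pages that could not yet be recycled into the FSM */
    num_delpages = stats->pages_deleted - stats->pages_free;
    _bt_set_cleanup_info(info->index, num_delpages);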

◆ _bt_start_array_keys()

void _bt_start_array_keys ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 537 of file nbtreadpage.c.

538{
539 Relation rel = scan->indexRelation;
540 BTScanOpaque so = (BTScanOpaque) scan->opaque;
541
542 Assert(so->numArrayKeys);
543 Assert(so->qual_ok);
544
545 for (int i = 0; i < so->numArrayKeys; i++)
546 {
547 BTArrayKeyInfo *array = &so->arrayKeys[i];
548 ScanKey skey = &so->keyData[array->scan_key];
549
550 Assert(skey->sk_flags & SK_SEARCHARRAY);
551
552 _bt_array_set_low_or_high(rel, skey, array,
553 ScanDirectionIsForward(dir));
554 }
555 so->scanBehind = so->oppositeDirCheck = false; /* reset */
556}
static void _bt_array_set_low_or_high(Relation rel, ScanKey skey, BTArrayKeyInfo *array, bool low_not_high)
Definition: nbtreadpage.c:3203

References _bt_array_set_low_or_high(), BTScanOpaqueData::arrayKeys, Assert(), i, IndexScanDescData::indexRelation, BTScanOpaqueData::keyData, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, BTScanOpaqueData::qual_ok, BTArrayKeyInfo::scan_key, BTScanOpaqueData::scanBehind, ScanDirectionIsForward, ScanKeyData::sk_flags, and SK_SEARCHARRAY.

Referenced by _bt_advance_array_keys_increment(), _bt_first(), _bt_readpage(), and btrestrpos().

◆ _bt_start_vacuum()

BTCycleId _bt_start_vacuum ( Relation  rel)

Definition at line 469 of file nbtutils.c.

470{
471 BTCycleId result;
472 int i;
473 BTOneVacInfo *vac;
474
475 LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
476
477 /*
478 * Assign the next cycle ID, being careful to avoid zero as well as the
479 * reserved high values.
480 */
481 result = ++(btvacinfo->cycle_ctr);
482 if (result == 0 || result > MAX_BT_CYCLE_ID)
483 result = btvacinfo->cycle_ctr = 1;
484
485 /* Let's just make sure there's no entry already for this index */
486 for (i = 0; i < btvacinfo->num_vacuums; i++)
487 {
488 vac = &btvacinfo->vacuums[i];
489 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
490 vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
491 {
492 /*
493 * Unlike most places in the backend, we have to explicitly
494 * release our LWLock before throwing an error. This is because
495 * we expect _bt_end_vacuum() to be called before transaction
496 * abort cleanup can run to release LWLocks.
497 */
498 LWLockRelease(BtreeVacuumLock);
499 elog(ERROR, "multiple active vacuums for index \"%s\"",
500 RelationGetRelationName(rel));
501 }
502 }
503
504 /* OK, add an entry */
505 if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
506 {
507 LWLockRelease(BtreeVacuumLock);
508 elog(ERROR, "out of btvacinfo slots");
509 }
510 vac = &btvacinfo->vacuums[btvacinfo->num_vacuums];
511 vac->relid = rel->rd_lockInfo.lockRelId;
512 vac->cycleid = result;
513 btvacinfo->num_vacuums++;
514
515 LWLockRelease(BtreeVacuumLock);
516 return result;
517}
#define MAX_BT_CYCLE_ID
Definition: nbtree.h:94
uint16 BTCycleId
Definition: nbtree.h:30
BTCycleId cycleid
Definition: nbtutils.c:411
BTCycleId cycle_ctr
Definition: nbtutils.c:416
int max_vacuums
Definition: nbtutils.c:418

References btvacinfo, BTVacInfo::cycle_ctr, BTOneVacInfo::cycleid, LockRelId::dbId, elog, ERROR, i, LockInfoData::lockRelId, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), MAX_BT_CYCLE_ID, BTVacInfo::max_vacuums, BTVacInfo::num_vacuums, RelationData::rd_lockInfo, RelationGetRelationName, BTOneVacInfo::relid, LockRelId::relId, and BTVacInfo::vacuums.

Referenced by btbulkdelete().
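
The returned cycle ID must always be released again, even on error, so btbulkdelete() wraps the call in PG_ENSURE_ERROR_CLEANUP with _bt_end_vacuum_callback(). Roughly (simplified sketch, not the exact code):

    /* establish the vacuum cycle ID to use for this scan */
    PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    {
        cycleid = _bt_start_vacuum(rel);

        btvacuumscan(info, stats, callback, callback_state, cycleid);
    }
    PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    _bt_end_vacuum(rel);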

◆ _bt_swap_posting()

IndexTuple _bt_swap_posting ( IndexTuple  newitem,
IndexTuple  oposting,
int  postingoff 
)

Definition at line 1020 of file nbtdedup.c.

1021{
1022 int nhtids;
1023 char *replacepos;
1024 char *replaceposright;
1025 Size nmovebytes;
1026 IndexTuple nposting;
1027
1028 nhtids = BTreeTupleGetNPosting(oposting);
1029 Assert(_bt_posting_valid(oposting));
1030
1031 /*
1032 * The postingoff argument originated as a _bt_binsrch_posting() return
1033 * value. It will be 0 in the event of corruption that makes a leaf page
1034 * contain a non-pivot tuple that's somehow identical to newitem (no two
1035 * non-pivot tuples should ever have the same TID). This has been known
1036 * to happen in the field from time to time.
1037 *
1038 * Perform a basic sanity check to catch this case now.
1039 */
1040 if (!(postingoff > 0 && postingoff < nhtids))
1041 elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
1042 nhtids, postingoff);
1043
1044 /*
1045 * Move item pointers in posting list to make a gap for the new item's
1046 * heap TID. We shift TIDs one place to the right, losing original
1047 * rightmost TID. (nmovebytes must not include TIDs to the left of
1048 * postingoff, nor the existing rightmost/max TID that gets overwritten.)
1049 */
1050 nposting = CopyIndexTuple(oposting);
1051 replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
1052 replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
1053 nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
1054 memmove(replaceposright, replacepos, nmovebytes);
1055
1056 /* Fill the gap at postingoff with TID of new item (original new TID) */
1057 Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
1058 ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
1059
1060 /* Now copy oposting's rightmost/max TID into new item (final new TID) */
1061 ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);
1062
1063 Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
1064 BTreeTupleGetHeapTID(newitem)) < 0);
1065 Assert(_bt_posting_valid(nposting));
1066
1067 return nposting;
1068}

References Assert(), BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), CopyIndexTuple(), elog, ERROR, ItemPointerCompare(), ItemPointerCopy(), and IndexTupleData::t_tid.

Referenced by _bt_insertonpg(), btree_xlog_insert(), and btree_xlog_split().
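
Roughly how _bt_insertonpg() uses the result during a posting list split (hedged sketch; WAL logging and page modification details omitted):

    if (postingoff != 0)
    {
        /* keep the original incoming tuple around, then rearrange heap TIDs */
        origitup = CopyIndexTuple(itup);
        nposting = _bt_swap_posting(itup, oposting, postingoff);

        /* oposting is then overwritten in place with nposting, and itup
         * (which now carries oposting's old rightmost TID) is inserted
         * immediately after it */
    }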

◆ _bt_truncate()

IndexTuple _bt_truncate ( Relation  rel,
IndexTuple  lastleft,
IndexTuple  firstright,
BTScanInsert  itup_key 
)

Definition at line 698 of file nbtutils.c.

700{
701 TupleDesc itupdesc = RelationGetDescr(rel);
702 int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
703 int keepnatts;
704 IndexTuple pivot;
705 IndexTuple tidpivot;
706 ItemPointer pivotheaptid;
707 Size newsize;
708
709 /*
710 * We should only ever truncate non-pivot tuples from leaf pages. It's
711 * never okay to truncate when splitting an internal page.
712 */
713 Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright));
714
715 /* Determine how many attributes must be kept in truncated tuple */
716 keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key);
717
718#ifdef DEBUG_NO_TRUNCATE
719 /* Force truncation to be ineffective for testing purposes */
720 keepnatts = nkeyatts + 1;
721#endif
722
723 pivot = index_truncate_tuple(itupdesc, firstright,
724 Min(keepnatts, nkeyatts));
725
726 if (BTreeTupleIsPosting(pivot))
727 {
728 /*
729 * index_truncate_tuple() just returns a straight copy of firstright
730 * when it has no attributes to truncate. When that happens, we may
731 * need to truncate away a posting list here instead.
732 */
733 Assert(keepnatts == nkeyatts || keepnatts == nkeyatts + 1);
735 pivot->t_info &= ~INDEX_SIZE_MASK;
736 pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright));
737 }
738
739 /*
740 * If there is a distinguishing key attribute within pivot tuple, we're
741 * done
742 */
743 if (keepnatts <= nkeyatts)
744 {
745 BTreeTupleSetNAtts(pivot, keepnatts, false);
746 return pivot;
747 }
748
749 /*
750 * We have to store a heap TID in the new pivot tuple, since no non-TID
751 * key attribute value in firstright distinguishes the right side of the
752 * split from the left side. nbtree conceptualizes this case as an
753 * inability to truncate away any key attributes, since heap TID is
754 * treated as just another key attribute (despite lacking a pg_attribute
755 * entry).
756 *
757 * Use enlarged space that holds a copy of pivot. We need the extra space
758 * to store a heap TID at the end (using the special pivot tuple
759 * representation). Note that the original pivot already has firstright's
760 * possible posting list/non-key attribute values removed at this point.
761 */
762 newsize = MAXALIGN(IndexTupleSize(pivot)) + MAXALIGN(sizeof(ItemPointerData));
763 tidpivot = palloc0(newsize);
764 memcpy(tidpivot, pivot, MAXALIGN(IndexTupleSize(pivot)));
765 /* Cannot leak memory here */
766 pfree(pivot);
767
768 /*
769 * Store all of firstright's key attribute values plus a tiebreaker heap
770 * TID value in enlarged pivot tuple
771 */
772 tidpivot->t_info &= ~INDEX_SIZE_MASK;
773 tidpivot->t_info |= newsize;
774 BTreeTupleSetNAtts(tidpivot, nkeyatts, true);
775 pivotheaptid = BTreeTupleGetHeapTID(tidpivot);
776
777 /*
778 * Lehman & Yao use lastleft as the leaf high key in all cases, but don't
779 * consider suffix truncation. It seems like a good idea to follow that
780 * example in cases where no truncation takes place -- use lastleft's heap
781 * TID. (This is also the closest value to negative infinity that's
782 * legally usable.)
783 */
784 ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid);
785
786 /*
787 * We're done. Assert() that heap TID invariants hold before returning.
788 *
789 * Lehman and Yao require that the downlink to the right page, which is to
790 * be inserted into the parent page in the second phase of a page split be
791 * a strict lower bound on items on the right page, and a non-strict upper
792 * bound for items on the left page. Assert that heap TIDs follow these
793 * invariants, since a heap TID value is apparently needed as a
794 * tiebreaker.
795 */
796#ifndef DEBUG_NO_TRUNCATE
797 Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft),
798 BTreeTupleGetHeapTID(firstright)) < 0);
799 Assert(ItemPointerCompare(pivotheaptid,
800 BTreeTupleGetHeapTID(lastleft)) >= 0);
801 Assert(ItemPointerCompare(pivotheaptid,
802 BTreeTupleGetHeapTID(firstright)) < 0);
803#else
804
805 /*
806 * Those invariants aren't guaranteed to hold for lastleft + firstright
807 * heap TID attribute values when they're considered here only because
808 * DEBUG_NO_TRUNCATE is defined (a heap TID is probably not actually
809 * needed as a tiebreaker). DEBUG_NO_TRUNCATE must therefore use a heap
810 * TID value that always works as a strict lower bound for items to the
811 * right. In particular, it must avoid using firstright's leading key
812 * attribute values along with lastleft's heap TID value when lastleft's
813 * TID happens to be greater than firstright's TID.
814 */
815 ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid);
816
817 /*
818 * Pivot heap TID should never be fully equal to firstright. Note that
819 * the pivot heap TID will still end up equal to lastleft's heap TID when
820 * that's the only usable value.
821 */
822 ItemPointerSetOffsetNumber(pivotheaptid,
823 OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid)));
824 Assert(ItemPointerCompare(pivotheaptid,
825 BTreeTupleGetHeapTID(firstright)) < 0);
826#endif
827
828 return tidpivot;
829}
IndexTuple index_truncate_tuple(TupleDesc sourceDescriptor, IndexTuple source, int leavenatts)
Definition: indextuple.c:576
static void ItemPointerSetOffsetNumber(ItemPointerData *pointer, OffsetNumber offsetNumber)
Definition: itemptr.h:158
static void BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid)
Definition: nbtree.h:596
static int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
Definition: nbtutils.c:843

References _bt_keep_natts(), Assert(), BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTreeTupleSetNAtts(), index_truncate_tuple(), IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, IndexTupleSize(), ItemPointerCompare(), ItemPointerCopy(), ItemPointerGetOffsetNumber(), ItemPointerSetOffsetNumber(), MAXALIGN, Min, OffsetNumberPrev, palloc0(), pfree(), RelationGetDescr, and IndexTupleData::t_info.

Referenced by _bt_buildadd(), and _bt_split().
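
Sketch of the main caller, _bt_split() (simplified; variable names merely follow that code): the new high key for the left half of a leaf split is a truncated copy of the first tuple that will go to the right half.

    /* lastleft = last tuple staying on the left page,
     * firstright = first tuple going to the new right page */
    lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
    itemsz = IndexTupleSize(lefthighkey);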

◆ _bt_unlockbuf()

void _bt_unlockbuf ( Relation  rel,
Buffer  buf 
)

Definition at line 1071 of file nbtpage.c.

1072{
1073 /*
1074 * Buffer is pinned and locked, which means that it is expected to be
1075 * defined and addressable. Check that proactively.
1076 */
1077 VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1078
1079 /* LockBuffer() asserts that pin is held by this backend */
1080 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1081
1082 if (!RelationUsesLocalBuffers(rel))
1083 VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
1084}
@ BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:205
#define VALGRIND_CHECK_MEM_IS_DEFINED(addr, size)
Definition: memdebug.h:23
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition: memdebug.h:27

References buf, BUFFER_LOCK_UNLOCK, BufferGetPage(), LockBuffer(), RelationUsesLocalBuffers, VALGRIND_CHECK_MEM_IS_DEFINED, and VALGRIND_MAKE_MEM_NOACCESS.

Referenced by _bt_drop_lock_and_maybe_pin(), _bt_getroot(), _bt_killitems(), _bt_moveright(), _bt_pagedel(), _bt_readfirstpage(), _bt_relandgetbuf(), _bt_relbuf(), _bt_search(), _bt_set_cleanup_info(), and _bt_unlink_halfdead_page().
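
Typical pairing with _bt_lockbuf(): the buffer pin is retained across the unlock, which is what lets callers such as _bt_killitems() revisit the page later. A minimal sketch:

    /* inspect a pinned page under a share lock, then unlock but keep the pin */
    _bt_lockbuf(rel, buf, BT_READ);
    page = BufferGetPage(buf);
    /* ... examine the page while the content lock is held ... */
    _bt_unlockbuf(rel, buf);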

◆ _bt_update_posting()

void _bt_update_posting ( BTVacuumPosting  vacposting)

Definition at line 922 of file nbtdedup.c.

923{
924 IndexTuple origtuple = vacposting->itup;
925 uint32 keysize,
926 newsize;
927 IndexTuple itup;
928 int nhtids;
929 int ui,
930 d;
931 ItemPointer htids;
932
933 nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
934
935 Assert(_bt_posting_valid(origtuple));
936 Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
937
938 /*
939 * Determine final size of new tuple.
940 *
941 * This calculation needs to match the code used within _bt_form_posting()
942 * for new posting list tuples. We avoid calling _bt_form_posting() here
943 * to save ourselves a second memory allocation for a htids workspace.
944 */
945 keysize = BTreeTupleGetPostingOffset(origtuple);
946 if (nhtids > 1)
947 newsize = MAXALIGN(keysize +
948 nhtids * sizeof(ItemPointerData));
949 else
950 newsize = keysize;
951
952 Assert(newsize <= INDEX_SIZE_MASK);
953 Assert(newsize == MAXALIGN(newsize));
954
955 /* Allocate memory using palloc0() (matches index_form_tuple()) */
956 itup = palloc0(newsize);
957 memcpy(itup, origtuple, keysize);
958 itup->t_info &= ~INDEX_SIZE_MASK;
959 itup->t_info |= newsize;
960
961 if (nhtids > 1)
962 {
963 /* Form posting list tuple */
964 BTreeTupleSetPosting(itup, nhtids, keysize);
965 htids = BTreeTupleGetPosting(itup);
966 }
967 else
968 {
969 /* Form standard non-pivot tuple */
970 itup->t_info &= ~INDEX_ALT_TID_MASK;
971 htids = &itup->t_tid;
972 }
973
974 ui = 0;
975 d = 0;
976 for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
977 {
978 if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
979 {
980 d++;
981 continue;
982 }
983 htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
984 }
985 Assert(ui == nhtids);
986 Assert(d == vacposting->ndeletedtids);
987 Assert(nhtids == 1 || _bt_posting_valid(itup));
988 Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));
989
990 /* vacposting arg's itup will now point to updated version */
991 vacposting->itup = itup;
992}

References Assert(), BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleGetPostingN(), BTreeTupleGetPostingOffset(), BTreeTupleSetPosting(), BTVacuumPostingData::deletetids, i, INDEX_SIZE_MASK, ItemPointerIsValid(), BTVacuumPostingData::itup, MAXALIGN, BTVacuumPostingData::ndeletedtids, palloc0(), IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_delitems_update(), and btree_xlog_updates().
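
Sketch of the _bt_delitems_update() loop (simplified): each BTVacuumPosting entry names one posting list tuple plus the TIDs to delete from it, and vacposting->itup ends up pointing at the rebuilt, smaller tuple.

    for (int i = 0; i < nupdatable; i++)
    {
        BTVacuumPosting vacposting = updatable[i];

        /* replace vacposting->itup with a copy lacking the dead TIDs */
        _bt_update_posting(vacposting);
    }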

◆ _bt_upgradelockbufcleanup()

void _bt_upgradelockbufcleanup ( Relation  rel,
Buffer  buf 
)

Definition at line 1110 of file nbtpage.c.

1111{
1112 /*
1113 * Buffer is pinned and locked, which means that it is expected to be
1114 * defined and addressable. Check that proactively.
1115 */
1116 VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1117
1118 /* LockBuffer() asserts that pin is held by this backend */
1119 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1120 LockBufferForCleanup(buf);
1121}
void LockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:5684

References buf, BUFFER_LOCK_UNLOCK, BufferGetPage(), LockBuffer(), LockBufferForCleanup(), and VALGRIND_CHECK_MEM_IS_DEFINED.

Referenced by btvacuumpage().

◆ _bt_upgrademetapage()

void _bt_upgrademetapage ( Page  page)

Definition at line 108 of file nbtpage.c.

109{
110 BTMetaPageData *metad;
111 BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;
112
113 metad = BTPageGetMeta(page);
114 metaopaque = BTPageGetOpaque(page);
115
116 /* It must be really a meta page of upgradable version */
117 Assert(metaopaque->btpo_flags & BTP_META);
118 Assert(metad->btm_version < BTREE_NOVAC_VERSION);
119 Assert(metad->btm_version >= BTREE_MIN_VERSION);
120
121 /* Set version number and fill extra fields added into version 3 */
122 metad->btm_version = BTREE_NOVAC_VERSION;
123 metad->btm_last_cleanup_num_delpages = 0;
124 metad->btm_last_cleanup_num_heap_tuples = -1.0;
125 /* Only a REINDEX can set this field */
126 Assert(!metad->btm_allequalimage);
127 metad->btm_allequalimage = false;
128
129 /* Adjust pd_lower (see _bt_initmetapage() for details) */
130 ((PageHeader) page)->pd_lower =
131 ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
132}

References Assert(), BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_version, BTP_META, BTPageGetMeta, BTPageGetOpaque, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, and PG_USED_FOR_ASSERTS_ONLY.

Referenced by _bt_getroot(), _bt_insertonpg(), _bt_newlevel(), _bt_set_cleanup_info(), and _bt_unlink_halfdead_page().

◆ _bt_vacuum_cycleid()

BTCycleId _bt_vacuum_cycleid ( Relation  rel)

Definition at line 435 of file nbtutils.c.

436{
437 BTCycleId result = 0;
438 int i;
439
440 /* Share lock is enough since this is a read-only operation */
441 LWLockAcquire(BtreeVacuumLock, LW_SHARED);
442
443 for (i = 0; i < btvacinfo->num_vacuums; i++)
444 {
445 BTOneVacInfo *vac = &btvacinfo->vacuums[i];
446
447 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
448 vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
449 {
450 result = vac->cycleid;
451 break;
452 }
453 }
454
455 LWLockRelease(BtreeVacuumLock);
456 return result;
457}
@ LW_SHARED
Definition: lwlock.h:113

References btvacinfo, BTOneVacInfo::cycleid, LockRelId::dbId, i, LockInfoData::lockRelId, LW_SHARED, LWLockAcquire(), LWLockRelease(), BTVacInfo::num_vacuums, RelationData::rd_lockInfo, BTOneVacInfo::relid, LockRelId::relId, and BTVacInfo::vacuums.

Referenced by _bt_split().

◆ _bt_vacuum_needs_cleanup()

bool _bt_vacuum_needs_cleanup ( Relation  rel)

Definition at line 180 of file nbtpage.c.

181{
182 Buffer metabuf;
183 Page metapg;
184 BTMetaPageData *metad;
185 uint32 btm_version;
186 BlockNumber prev_num_delpages;
187
188 /*
189 * Copy details from metapage to local variables quickly.
190 *
 191 * Note that we deliberately avoid using the cached version of the metapage here.
192 */
193 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
194 metapg = BufferGetPage(metabuf);
195 metad = BTPageGetMeta(metapg);
196 btm_version = metad->btm_version;
197
198 if (btm_version < BTREE_NOVAC_VERSION)
199 {
200 /*
201 * Metapage needs to be dynamically upgraded to store fields that are
202 * only present when btm_version >= BTREE_NOVAC_VERSION
203 */
204 _bt_relbuf(rel, metabuf);
205 return true;
206 }
207
208 prev_num_delpages = metad->btm_last_cleanup_num_delpages;
209 _bt_relbuf(rel, metabuf);
210
211 /*
212 * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
213 * total size of the index. We can reasonably expect (though are not
214 * guaranteed) to be able to recycle this many pages if we decide to do a
215 * btvacuumscan call during the ongoing btvacuumcleanup. For further
216 * details see the nbtree/README section on placing deleted pages in the
217 * FSM.
218 */
219 if (prev_num_delpages > 0 &&
220 prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20)
221 return true;
222
223 return false;
224}
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:294

References _bt_getbuf(), _bt_relbuf(), BT_READ, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_version, BTPageGetMeta, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BufferGetPage(), and RelationGetNumberOfBlocks.

Referenced by btvacuumcleanup().
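The trigger condition boils down to a simple ratio test; a small sketch of just that arithmetic (illustrative helper, not part of nbtpage.c). For example, a 10,000-block index requests cleanup once the previous VACUUM left more than 10,000 / 20 = 500 deleted-but-unrecycled pages behind:

static bool
cleanup_threshold_exceeded(BlockNumber prev_num_delpages, BlockNumber nblocks)
{
    /* trigger when leftover deleted pages exceed 5% of the index */
    return prev_num_delpages > 0 && prev_num_delpages > nblocks / 20;
}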

◆ btadjustmembers()

void btadjustmembers ( Oid  opfamilyoid,
Oid  opclassoid,
List *  operators,
List *  functions 
)

Definition at line 288 of file nbtvalidate.c.

292{
293 Oid opcintype;
294 ListCell *lc;
295
296 /*
297 * Btree operators and comparison support functions are always "loose"
298 * members of the opfamily if they are cross-type. If they are not
299 * cross-type, we prefer to tie them to the appropriate opclass ... but if
300 * the user hasn't created one, we can't do that, and must fall back to
301 * using the opfamily dependency. (We mustn't force creation of an
302 * opclass in such a case, as leaving an incomplete opclass laying about
303 * would be bad. Throwing an error is another undesirable alternative.)
304 *
305 * This behavior results in a bit of a dump/reload hazard, in that the
306 * order of restoring objects could affect what dependencies we end up
307 * with. pg_dump's existing behavior will preserve the dependency choices
308 * in most cases, but not if a cross-type operator has been bound tightly
309 * into an opclass. That's a mistake anyway, so silently "fixing" it
310 * isn't awful.
311 *
312 * Optional support functions are always "loose" family members.
313 *
314 * To avoid repeated lookups, we remember the most recently used opclass's
315 * input type.
316 */
317 if (OidIsValid(opclassoid))
318 {
319 /* During CREATE OPERATOR CLASS, need CCI to see the pg_opclass row */
321 opcintype = get_opclass_input_type(opclassoid);
322 }
323 else
324 opcintype = InvalidOid;
325
326 /*
327 * We handle operators and support functions almost identically, so rather
328 * than duplicate this code block, just join the lists.
329 */
330 foreach(lc, list_concat_copy(operators, functions))
331 {
333
334 if (op->is_func && op->number != BTORDER_PROC)
335 {
336 /* Optional support proc, so always a soft family dependency */
337 op->ref_is_hard = false;
338 op->ref_is_family = true;
339 op->refobjid = opfamilyoid;
340 }
341 else if (op->lefttype != op->righttype)
342 {
343 /* Cross-type, so always a soft family dependency */
344 op->ref_is_hard = false;
345 op->ref_is_family = true;
346 op->refobjid = opfamilyoid;
347 }
348 else
349 {
350 /* Not cross-type; is there a suitable opclass? */
351 if (op->lefttype != opcintype)
352 {
353 /* Avoid repeating this expensive lookup, even if it fails */
354 opcintype = op->lefttype;
355 opclassoid = opclass_for_family_datatype(BTREE_AM_OID,
356 opfamilyoid,
357 opcintype);
358 }
359 if (OidIsValid(opclassoid))
360 {
361 /* Hard dependency on opclass */
362 op->ref_is_hard = true;
363 op->ref_is_family = false;
364 op->refobjid = opclassoid;
365 }
366 else
367 {
368 /* We're stuck, so make a soft dependency on the opfamily */
369 op->ref_is_hard = false;
370 op->ref_is_family = true;
371 op->refobjid = opfamilyoid;
372 }
373 }
374 }
375}
Oid opclass_for_family_datatype(Oid amoid, Oid opfamilyoid, Oid datatypeoid)
Definition: amvalidate.c:236
List * list_concat_copy(const List *list1, const List *list2)
Definition: list.c:598
Oid get_opclass_input_type(Oid opclass)
Definition: lsyscache.c:1329
#define lfirst(lc)
Definition: pg_list.h:172
static const struct fns functions
Definition: regcomp.c:358
Oid refobjid
Definition: amapi.h:98
Oid lefttype
Definition: amapi.h:93
bool ref_is_family
Definition: amapi.h:97
Oid righttype
Definition: amapi.h:94
int number
Definition: amapi.h:92
bool is_func
Definition: amapi.h:90
bool ref_is_hard
Definition: amapi.h:96
void CommandCounterIncrement(void)
Definition: xact.c:1101

References BTORDER_PROC, CommandCounterIncrement(), functions, get_opclass_input_type(), InvalidOid, OpFamilyMember::is_func, OpFamilyMember::lefttype, lfirst, list_concat_copy(), OpFamilyMember::number, OidIsValid, opclass_for_family_datatype(), OpFamilyMember::ref_is_family, OpFamilyMember::ref_is_hard, OpFamilyMember::refobjid, and OpFamilyMember::righttype.

Referenced by bthandler().

◆ btbeginscan()

IndexScanDesc btbeginscan ( Relation  rel,
int  nkeys,
int  norderbys 
)

Definition at line 336 of file nbtree.c.

337{
338 IndexScanDesc scan;
339 BTScanOpaque so;
340
341 /* no order by operators allowed */
342 Assert(norderbys == 0);
343
344 /* get the scan */
345 scan = RelationGetIndexScan(rel, nkeys, norderbys);
346
347 /* allocate private workspace */
351 if (scan->numberOfKeys > 0)
352 so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
353 else
354 so->keyData = NULL;
355
356 so->skipScan = false;
357 so->needPrimScan = false;
358 so->scanBehind = false;
359 so->oppositeDirCheck = false;
360 so->arrayKeys = NULL;
361 so->orderProcs = NULL;
362 so->arrayContext = NULL;
363
364 so->killedItems = NULL; /* until needed */
365 so->numKilled = 0;
366
367 /*
368 * We don't know yet whether the scan will be index-only, so we do not
369 * allocate the tuple workspace arrays until btrescan. However, we set up
370 * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
371 */
372 so->currTuples = so->markTuples = NULL;
373
374 scan->xs_itupdesc = RelationGetDescr(rel);
375
376 scan->opaque = so;
377
378 return scan;
379}
IndexScanDesc RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
Definition: genam.c:80
char * markTuples
Definition: nbtree.h:1081
char * currTuples
Definition: nbtree.h:1080
BTScanPosData markPos
Definition: nbtree.h:1094
struct TupleDescData * xs_itupdesc
Definition: relscan.h:170

References BTScanOpaqueData::arrayContext, BTScanOpaqueData::arrayKeys, Assert(), BTScanPosInvalidate, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanOpaqueData::keyData, BTScanOpaqueData::killedItems, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanOpaqueData::needPrimScan, IndexScanDescData::numberOfKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, BTScanOpaqueData::orderProcs, palloc(), palloc_object, RelationGetDescr, RelationGetIndexScan(), BTScanOpaqueData::scanBehind, BTScanOpaqueData::skipScan, and IndexScanDescData::xs_itupdesc.

Referenced by bthandler().
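Like most of the functions on this page, btbeginscan() is not called directly; bthandler() installs it and the other callbacks into an IndexAmRoutine. A much-abbreviated sketch of that wiring, assuming the IndexAmRoutine field names from amapi.h (see nbtree.c for the complete routine):

Datum
bthandler_sketch(PG_FUNCTION_ARGS)
{
    IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

    /* only a handful of the callbacks are shown here */
    amroutine->ambuild = btbuild;
    amroutine->ambuildempty = btbuildempty;
    amroutine->aminsert = btinsert;
    amroutine->ambulkdelete = btbulkdelete;
    amroutine->amvacuumcleanup = btvacuumcleanup;
    amroutine->ambeginscan = btbeginscan;
    amroutine->amrescan = btrescan;
    amroutine->amgettuple = btgettuple;
    amroutine->amgetbitmap = btgetbitmap;
    amroutine->amendscan = btendscan;

    PG_RETURN_POINTER(amroutine);
}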

◆ btbuild()

IndexBuildResult * btbuild ( Relation  heap,
Relation  index,
struct IndexInfo *  indexInfo 
)

Definition at line 296 of file nbtsort.c.

297{
298 IndexBuildResult *result;
299 BTBuildState buildstate;
300 double reltuples;
301
302#ifdef BTREE_BUILD_STATS
304 ResetUsage();
305#endif /* BTREE_BUILD_STATS */
306
307 buildstate.isunique = indexInfo->ii_Unique;
308 buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
309 buildstate.havedead = false;
310 buildstate.heap = heap;
311 buildstate.spool = NULL;
312 buildstate.spool2 = NULL;
313 buildstate.indtuples = 0;
314 buildstate.btleader = NULL;
315
316 /*
317 * We expect to be called exactly once for any index relation. If that's
318 * not the case, big trouble's what we have.
319 */
321 elog(ERROR, "index \"%s\" already contains data",
323
324 reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
325
326 /*
327 * Finish the build by (1) completing the sort of the spool file, (2)
328 * inserting the sorted tuples into btree pages and (3) building the upper
329 * levels. Finally, it may also be necessary to end use of parallelism.
330 */
331 _bt_leafbuild(buildstate.spool, buildstate.spool2);
332 _bt_spooldestroy(buildstate.spool);
333 if (buildstate.spool2)
334 _bt_spooldestroy(buildstate.spool2);
335 if (buildstate.btleader)
336 _bt_end_parallel(buildstate.btleader);
337
339
340 result->heap_tuples = reltuples;
341 result->index_tuples = buildstate.indtuples;
342
343#ifdef BTREE_BUILD_STATS
345 {
346 ShowUsage("BTREE BUILD STATS");
347 ResetUsage();
348 }
349#endif /* BTREE_BUILD_STATS */
350
351 return result;
352}
static void _bt_end_parallel(BTLeader *btleader)
Definition: nbtsort.c:1608
static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
Definition: nbtsort.c:539
static double _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, IndexInfo *indexInfo)
Definition: nbtsort.c:366
static void _bt_spooldestroy(BTSpool *btspool)
Definition: nbtsort.c:518
bool isunique
Definition: nbtsort.c:207
BTSpool * spool
Definition: nbtsort.c:211
BTLeader * btleader
Definition: nbtsort.c:225
bool nulls_not_distinct
Definition: nbtsort.c:208
bool havedead
Definition: nbtsort.c:209
Relation heap
Definition: nbtsort.c:210
BTSpool * spool2
Definition: nbtsort.c:217
double indtuples
Definition: nbtsort.c:218
double heap_tuples
Definition: genam.h:59
double index_tuples
Definition: genam.h:60
bool ii_Unique
Definition: execnodes.h:200
bool ii_NullsNotDistinct
Definition: execnodes.h:202
Definition: type.h:96

References _bt_end_parallel(), _bt_leafbuild(), _bt_spooldestroy(), _bt_spools_heapscan(), BTBuildState::btleader, elog, ERROR, BTBuildState::havedead, BTBuildState::heap, IndexBuildResult::heap_tuples, IndexInfo::ii_NullsNotDistinct, IndexInfo::ii_Unique, IndexBuildResult::index_tuples, BTBuildState::indtuples, BTBuildState::isunique, log_btree_build_stats, BTBuildState::nulls_not_distinct, palloc_object, RelationGetNumberOfBlocks, RelationGetRelationName, ResetUsage(), ShowUsage(), BTBuildState::spool, and BTBuildState::spool2.

Referenced by bthandler().

◆ btbuildempty()

void btbuildempty ( Relation  index)

Definition at line 180 of file nbtree.c.

181{
182 bool allequalimage = _bt_allequalimage(index, false);
183 BulkWriteState *bulkstate;
184 BulkWriteBuffer metabuf;
185
 186 bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
 187
188 /* Construct metapage. */
189 metabuf = smgr_bulk_get_buf(bulkstate);
190 _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
191 smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
192
193 smgr_bulk_finish(bulkstate);
194}
BulkWriteState * smgr_bulk_start_rel(Relation rel, ForkNumber forknum)
Definition: bulk_write.c:87
void smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std)
Definition: bulk_write.c:323
BulkWriteBuffer smgr_bulk_get_buf(BulkWriteState *bulkstate)
Definition: bulk_write.c:347
void smgr_bulk_finish(BulkWriteState *bulkstate)
Definition: bulk_write.c:130
void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
Definition: nbtpage.c:68
bool _bt_allequalimage(Relation rel, bool debugmessage)
Definition: nbtutils.c:1181
@ INIT_FORKNUM
Definition: relpath.h:61

References _bt_allequalimage(), _bt_initmetapage(), BTREE_METAPAGE, INIT_FORKNUM, P_NONE, smgr_bulk_finish(), smgr_bulk_get_buf(), smgr_bulk_start_rel(), and smgr_bulk_write().

Referenced by bthandler().

◆ btbuildphasename()

char * btbuildphasename ( int64  phasenum)

Definition at line 650 of file nbtutils.c.

651{
652 switch (phasenum)
653 {
655 return "initializing";
657 return "scanning table";
659 return "sorting live tuples";
661 return "sorting dead tuples";
663 return "loading tuples in tree";
664 default:
665 return NULL;
666 }
667}
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2
Definition: nbtree.h:1148
#define PROGRESS_BTREE_PHASE_LEAF_LOAD
Definition: nbtree.h:1149
#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN
Definition: nbtree.h:1146
#define PROGRESS_BTREE_PHASE_PERFORMSORT_1
Definition: nbtree.h:1147
#define PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE
Definition: progress.h:126

References PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN, PROGRESS_BTREE_PHASE_LEAF_LOAD, PROGRESS_BTREE_PHASE_PERFORMSORT_1, PROGRESS_BTREE_PHASE_PERFORMSORT_2, and PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE.

Referenced by bthandler().

◆ btbulkdelete()

IndexBulkDeleteResult * btbulkdelete ( IndexVacuumInfo *  info,
IndexBulkDeleteResult *  stats,
IndexBulkDeleteCallback  callback,
void *  callback_state 
)

Definition at line 1134 of file nbtree.c.

1136{
1137 Relation rel = info->index;
1138 BTCycleId cycleid;
1139
1140 /* allocate stats if first time through, else re-use existing struct */
1141 if (stats == NULL)
1143
1144 /* Establish the vacuum cycle ID to use for this scan */
1145 /* The ENSURE stuff ensures we clean up shared memory on failure */
1147 {
1148 cycleid = _bt_start_vacuum(rel);
1149
1150 btvacuumscan(info, stats, callback, callback_state, cycleid);
1151 }
1153 _bt_end_vacuum(rel);
1154
1155 return stats;
1156}
#define PG_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition: ipc.h:47
#define PG_END_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition: ipc.h:52
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid)
Definition: nbtree.c:1252
void _bt_end_vacuum_callback(int code, Datum arg)
Definition: nbtutils.c:554
BTCycleId _bt_start_vacuum(Relation rel)
Definition: nbtutils.c:469
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:332
Relation index
Definition: genam.h:73
static void callback(struct sockaddr *addr, struct sockaddr *mask, void *unused)
Definition: test_ifaddrs.c:46

References _bt_end_vacuum(), _bt_end_vacuum_callback(), _bt_start_vacuum(), btvacuumscan(), callback(), IndexVacuumInfo::index, palloc0_object, PG_END_ENSURE_ERROR_CLEANUP, PG_ENSURE_ERROR_CLEANUP, and PointerGetDatum().

Referenced by bthandler().

◆ btcanreturn()

bool btcanreturn ( Relation  index,
int  attno 
)

Definition at line 1814 of file nbtree.c.

1815{
1816 return true;
1817}

Referenced by bthandler().

◆ btendscan()

void btendscan ( IndexScanDesc  scan)

Definition at line 470 of file nbtree.c.

471{
472 BTScanOpaque so = (BTScanOpaque) scan->opaque;
473
474 /* we aren't holding any read locks, but gotta drop the pins */
476 {
477 /* Before leaving current page, deal with any killed items */
478 if (so->numKilled > 0)
479 _bt_killitems(scan);
481 }
482
483 so->markItemIndex = -1;
485
486 /* No need to invalidate positions, the RAM is about to be freed. */
487
488 /* Release storage */
489 if (so->keyData != NULL)
490 pfree(so->keyData);
491 /* so->arrayKeys and so->orderProcs are in arrayContext */
492 if (so->arrayContext != NULL)
494 if (so->killedItems != NULL)
495 pfree(so->killedItems);
496 if (so->currTuples != NULL)
497 pfree(so->currTuples);
498 /* so->markTuples should not be pfree'd, see btrescan */
499 pfree(so);
500}
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:469
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:1015
void _bt_killitems(IndexScanDesc scan)
Definition: nbtutils.c:205

References _bt_killitems(), BTScanOpaqueData::arrayContext, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, if(), BTScanOpaqueData::keyData, BTScanOpaqueData::killedItems, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, MemoryContextDelete(), BTScanOpaqueData::numKilled, IndexScanDescData::opaque, and pfree().

Referenced by bthandler().

◆ btestimateparallelscan()

Size btestimateparallelscan ( Relation  rel,
int  nkeys,
int  norderbys 
)

Definition at line 590 of file nbtree.c.

591{
593 Size estnbtreeshared,
594 genericattrspace;
595
596 /*
597 * Pessimistically assume that every input scan key will be output with
598 * its own SAOP array
599 */
600 estnbtreeshared = offsetof(BTParallelScanDescData, btps_arrElems) +
601 sizeof(int) * nkeys;
602
603 /* Single column indexes cannot possibly use a skip array */
604 if (nkeyatts == 1)
605 return estnbtreeshared;
606
607 /*
608 * Pessimistically assume that all attributes prior to the least
609 * significant attribute require a skip array (and an associated key)
610 */
611 genericattrspace = datumEstimateSpace((Datum) 0, false, true,
612 sizeof(Datum));
613 for (int attnum = 1; attnum < nkeyatts; attnum++)
614 {
615 CompactAttribute *attr;
616
617 /*
618 * We make the conservative assumption that every index column will
619 * also require a skip array.
620 *
621 * Every skip array must have space to store its scan key's sk_flags.
622 */
623 estnbtreeshared = add_size(estnbtreeshared, sizeof(int));
624
625 /* Consider space required to store a datum of opclass input type */
626 attr = TupleDescCompactAttr(rel->rd_att, attnum - 1);
627 if (attr->attbyval)
628 {
629 /* This index attribute stores pass-by-value datums */
630 Size estfixed = datumEstimateSpace((Datum) 0, false,
631 true, attr->attlen);
632
633 estnbtreeshared = add_size(estnbtreeshared, estfixed);
634 continue;
635 }
636
637 /*
638 * This index attribute stores pass-by-reference datums.
639 *
640 * Assume that serializing this array will use just as much space as a
641 * pass-by-value datum, in addition to space for the largest possible
642 * whole index tuple (this is not just a per-datum portion of the
643 * largest possible tuple because that'd be almost as large anyway).
644 *
645 * This is quite conservative, but it's not clear how we could do much
646 * better. The executor requires an up-front storage request size
647 * that reliably covers the scan's high watermark memory usage. We
648 * can't be sure of the real high watermark until the scan is over.
649 */
650 estnbtreeshared = add_size(estnbtreeshared, genericattrspace);
651 estnbtreeshared = add_size(estnbtreeshared, BTMaxItemSize);
652 }
653
654 return estnbtreeshared;
655}
Size datumEstimateSpace(Datum value, bool isnull, bool typByVal, int typLen)
Definition: datum.c:412
Size add_size(Size s1, Size s2)
Definition: shmem.c:495
TupleDesc rd_att
Definition: rel.h:112

References add_size(), CompactAttribute::attbyval, CompactAttribute::attlen, attnum, BTMaxItemSize, BTParallelScanDescData::btps_arrElems, datumEstimateSpace(), IndexRelationGetNumberOfKeyAttributes, RelationData::rd_att, and TupleDescCompactAttr().

Referenced by bthandler().

◆ btgetbitmap()

int64 btgetbitmap ( IndexScanDesc  scan,
TIDBitmap *  tbm 
)

Definition at line 288 of file nbtree.c.

289{
290 BTScanOpaque so = (BTScanOpaque) scan->opaque;
291 int64 ntids = 0;
292 ItemPointer heapTid;
293
294 Assert(scan->heapRelation == NULL);
295
296 /* Each loop iteration performs another primitive index scan */
297 do
298 {
299 /* Fetch the first page & tuple */
301 {
302 /* Save tuple ID, and continue scanning */
303 heapTid = &scan->xs_heaptid;
304 tbm_add_tuples(tbm, heapTid, 1, false);
305 ntids++;
306
307 for (;;)
308 {
309 /*
310 * Advance to next tuple within page. This is the same as the
311 * easy case in _bt_next().
312 */
313 if (++so->currPos.itemIndex > so->currPos.lastItem)
314 {
315 /* let _bt_next do the heavy lifting */
316 if (!_bt_next(scan, ForwardScanDirection))
317 break;
318 }
319
320 /* Save tuple ID, and continue scanning */
321 heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
322 tbm_add_tuples(tbm, heapTid, 1, false);
323 ntids++;
324 }
325 }
326 /* Now see if we need another primitive index scan */
327 } while (so->numArrayKeys && _bt_start_prim_scan(scan));
328
329 return ntids;
330}
int64_t int64
Definition: c.h:549
static bool _bt_start_prim_scan(IndexScanDesc scan)
Definition: nbtree.c:668
bool _bt_first(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:877
bool _bt_next(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1585
@ ForwardScanDirection
Definition: sdir.h:28
ItemPointerData xs_heaptid
Definition: relscan.h:174
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointerData *tids, int ntids, bool recheck)
Definition: tidbitmap.c:367

References _bt_first(), _bt_next(), _bt_start_prim_scan(), Assert(), BTScanOpaqueData::currPos, ForwardScanDirection, IndexScanDescData::heapRelation, BTScanPosItem::heapTid, BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, tbm_add_tuples(), and IndexScanDescData::xs_heaptid.

Referenced by bthandler().

◆ btgettreeheight()

int btgettreeheight ( Relation  rel)

Definition at line 1823 of file nbtree.c.

1824{
1825 return _bt_getrootheight(rel);
1826}
int _bt_getrootheight(Relation rel)
Definition: nbtpage.c:676

References _bt_getrootheight().

Referenced by bthandler().

◆ btgettuple()

bool btgettuple ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 227 of file nbtree.c.

228{
229 BTScanOpaque so = (BTScanOpaque) scan->opaque;
230 bool res;
231
232 Assert(scan->heapRelation != NULL);
233
234 /* btree indexes are never lossy */
235 scan->xs_recheck = false;
236
237 /* Each loop iteration performs another primitive index scan */
238 do
239 {
240 /*
241 * If we've already initialized this scan, we can just advance it in
242 * the appropriate direction. If we haven't done so yet, we call
243 * _bt_first() to get the first item in the scan.
244 */
245 if (!BTScanPosIsValid(so->currPos))
246 res = _bt_first(scan, dir);
247 else
248 {
249 /*
250 * Check to see if we should kill the previously-fetched tuple.
251 */
252 if (scan->kill_prior_tuple)
253 {
254 /*
255 * Yes, remember it for later. (We'll deal with all such
256 * tuples at once right before leaving the index page.) The
257 * test for numKilled overrun is not just paranoia: if the
258 * caller reverses direction in the indexscan then the same
259 * item might get entered multiple times. It's not worth
260 * trying to optimize that, so we don't detect it, but instead
261 * just forget any excess entries.
262 */
263 if (so->killedItems == NULL)
266 so->killedItems[so->numKilled++] = so->currPos.itemIndex;
267 }
268
269 /*
270 * Now continue the scan.
271 */
272 res = _bt_next(scan, dir);
273 }
274
275 /* If we have a tuple, return it ... */
276 if (res)
277 break;
278 /* ... otherwise see if we need another primitive index scan */
279 } while (so->numArrayKeys && _bt_start_prim_scan(scan));
280
281 return res;
282}
bool kill_prior_tuple
Definition: relscan.h:149

References _bt_first(), _bt_next(), _bt_start_prim_scan(), Assert(), BTScanPosIsValid, BTScanOpaqueData::currPos, IndexScanDescData::heapRelation, BTScanPosData::itemIndex, IndexScanDescData::kill_prior_tuple, BTScanOpaqueData::killedItems, MaxTIDsPerBTreePage, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, palloc_array, and IndexScanDescData::xs_recheck.

Referenced by bthandler().
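Most backend code reaches btgettuple() indirectly, through the generic index-scan machinery. A hedged sketch of a typical equality lookup (the catalog, attribute number, and comparison procedure are illustrative; any btree-indexed column works the same way):

void
lookup_relname_by_oid_sketch(Relation pg_class_rel, Oid classoid_index,
                             Oid target)
{
    ScanKeyData key;
    SysScanDesc scan;
    HeapTuple   tup;

    /* equality qual on the oid column, served by the btree index */
    ScanKeyInit(&key,
                Anum_pg_class_oid,
                BTEqualStrategyNumber, F_OIDEQ,
                ObjectIdGetDatum(target));

    scan = systable_beginscan(pg_class_rel, classoid_index, true,
                              NULL, 1, &key);
    while (HeapTupleIsValid(tup = systable_getnext(scan)))
    {
        /* each returned tuple was located via btgettuple() */
    }
    systable_endscan(scan);
}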

◆ btinitparallelscan()

void btinitparallelscan ( void *  target)

Definition at line 826 of file nbtree.c.

827{
828 BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
829
830 LWLockInitialize(&bt_target->btps_lock,
831 LWTRANCHE_PARALLEL_BTREE_SCAN);
835 ConditionVariableInit(&bt_target->btps_cv);
836}
void ConditionVariableInit(ConditionVariable *cv)
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition: lwlock.c:698
@ BTPARALLEL_NOT_INITIALIZED
Definition: nbtree.c:56

References BTPARALLEL_NOT_INITIALIZED, BTParallelScanDescData::btps_cv, BTParallelScanDescData::btps_lastCurrPage, BTParallelScanDescData::btps_lock, BTParallelScanDescData::btps_nextScanPage, BTParallelScanDescData::btps_pageStatus, ConditionVariableInit(), InvalidBlockNumber, and LWLockInitialize().

Referenced by bthandler().

◆ btinsert()

bool btinsert ( Relation  rel,
Datum *  values,
bool *  isnull,
ItemPointer  ht_ctid,
Relation  heapRel,
IndexUniqueCheck  checkUnique,
bool  indexUnchanged,
struct IndexInfo *  indexInfo 
)

Definition at line 203 of file nbtree.c.

208{
209 bool result;
210 IndexTuple itup;
211
212 /* generate an index tuple */
213 itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
214 itup->t_tid = *ht_ctid;
215
216 result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
217
218 pfree(itup);
219
220 return result;
221}
static Datum values[MAXATTR]
Definition: bootstrap.c:153
IndexTuple index_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
Definition: indextuple.c:44
bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
Definition: nbtinsert.c:104

References _bt_doinsert(), index_form_tuple(), pfree(), RelationGetDescr, IndexTupleData::t_tid, and values.

Referenced by bthandler().

◆ btmarkpos()

void btmarkpos ( IndexScanDesc  scan)

Definition at line 506 of file nbtree.c.

507{
508 BTScanOpaque so = (BTScanOpaque) scan->opaque;
509
510 /* There may be an old mark with a pin (but no lock). */
512
513 /*
514 * Just record the current itemIndex. If we later step to next page
515 * before releasing the marked position, _bt_steppage makes a full copy of
516 * the currPos struct in markPos. If (as often happens) the mark is moved
517 * before we leave the page, we don't have to do that work.
518 */
519 if (BTScanPosIsValid(so->currPos))
521 else
522 {
524 so->markItemIndex = -1;
525 }
526}

References BTScanPosInvalidate, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanOpaqueData::currPos, BTScanPosData::itemIndex, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, and IndexScanDescData::opaque.

Referenced by bthandler().

◆ btoptions()

bytea * btoptions ( Datum  reloptions,
bool  validate 
)

Definition at line 604 of file nbtutils.c.

605{
606 static const relopt_parse_elt tab[] = {
607 {"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)},
608 {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
609 offsetof(BTOptions, vacuum_cleanup_index_scale_factor)},
610 {"deduplicate_items", RELOPT_TYPE_BOOL,
611 offsetof(BTOptions, deduplicate_items)}
612 };
613
614 return (bytea *) build_reloptions(reloptions, validate,
616 sizeof(BTOptions),
617 tab, lengthof(tab));
618}
static bool validate(Port *port, const char *auth)
Definition: auth-oauth.c:638
#define lengthof(array)
Definition: c.h:801
static int fillfactor
Definition: pgbench.c:188
void * build_reloptions(Datum reloptions, bool validate, relopt_kind kind, Size relopt_struct_size, const relopt_parse_elt *relopt_elems, int num_relopt_elems)
Definition: reloptions.c:1954
@ RELOPT_KIND_BTREE
Definition: reloptions.h:44
@ RELOPT_TYPE_INT
Definition: reloptions.h:32
@ RELOPT_TYPE_BOOL
Definition: reloptions.h:31
@ RELOPT_TYPE_REAL
Definition: reloptions.h:33
Definition: c.h:706

References build_reloptions(), fillfactor, lengthof, RELOPT_KIND_BTREE, RELOPT_TYPE_BOOL, RELOPT_TYPE_INT, RELOPT_TYPE_REAL, and validate().

Referenced by bthandler().
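The parsed options end up in rd_options as a BTOptions struct; a minimal sketch of reading the fill factor back out, assuming only the BTOptions fields named in the table above (the helper name is illustrative):

static int
btree_fillfactor_sketch(Relation rel)
{
    BTOptions  *opts = (BTOptions *) rel->rd_options;

    /* fall back to this header's default when no reloptions were set */
    if (opts == NULL)
        return BTREE_DEFAULT_FILLFACTOR;
    return opts->fillfactor;
}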

◆ BTPageGetDeleteXid()

static FullTransactionId BTPageGetDeleteXid ( Page  page)
inline static

Definition at line 261 of file nbtree.h.

262{
263 BTPageOpaque opaque;
264 BTDeletedPageData *contents;
265
266 /* We only expect to be called with a deleted page */
267 Assert(!PageIsNew(page));
268 opaque = BTPageGetOpaque(page);
269 Assert(P_ISDELETED(opaque));
270
271 /* pg_upgrade'd deleted page -- must be safe to recycle now */
272 if (!P_HAS_FULLXID(opaque))
 273 return FirstNormalFullTransactionId;
 274
275 /* Get safexid from deleted page */
276 contents = ((BTDeletedPageData *) PageGetContents(page));
277 return contents->safexid;
278}
static char * PageGetContents(Page page)
Definition: bufpage.h:257
#define P_HAS_FULLXID(opaque)
Definition: nbtree.h:229
FullTransactionId safexid
Definition: nbtree.h:236
#define FirstNormalFullTransactionId
Definition: transam.h:57

References Assert(), BTPageGetOpaque, FirstNormalFullTransactionId, P_HAS_FULLXID, P_ISDELETED, PageGetContents(), PageIsNew(), and BTDeletedPageData::safexid.

Referenced by _bt_allocbuf(), BTPageIsRecyclable(), and GetBTPageStatistics().

◆ BTPageIsRecyclable()

static bool BTPageIsRecyclable ( Page  page,
Relation  heaprel 
)
inline static

Definition at line 292 of file nbtree.h.

293{
294 BTPageOpaque opaque;
295
296 Assert(!PageIsNew(page));
297 Assert(heaprel != NULL);
298
299 /* Recycling okay iff page is deleted and safexid is old enough */
300 opaque = BTPageGetOpaque(page);
301 if (P_ISDELETED(opaque))
302 {
 303 FullTransactionId safexid = BTPageGetDeleteXid(page);
 304
305 /*
306 * The page was deleted, but when? If it was just deleted, a scan
307 * might have seen the downlink to it, and will read the page later.
308 * As long as that can happen, we must keep the deleted page around as
309 * a tombstone.
310 *
311 * For that check if the deletion XID could still be visible to
312 * anyone. If not, then no scan that's still in progress could have
313 * seen its downlink, and we can recycle it.
314 */
315 return GlobalVisCheckRemovableFullXid(heaprel, safexid);
316 }
317
318 return false;
319}

References Assert(), BTPageGetDeleteXid(), BTPageGetOpaque, GlobalVisCheckRemovableFullXid(), P_ISDELETED, and PageIsNew().

Referenced by _bt_allocbuf(), and btvacuumpage().

◆ BTPageSetDeleted()

static void BTPageSetDeleted ( Page  page,
FullTransactionId  safexid 
)
inline static

Definition at line 240 of file nbtree.h.

241{
242 BTPageOpaque opaque;
243 PageHeader header;
244 BTDeletedPageData *contents;
245
246 opaque = BTPageGetOpaque(page);
247 header = ((PageHeader) page);
248
249 opaque->btpo_flags &= ~BTP_HALF_DEAD;
 250 opaque->btpo_flags |= BTP_DELETED | BTP_HAS_FULLXID;
 251 header->pd_lower = MAXALIGN(SizeOfPageHeaderData) +
 252 sizeof(BTDeletedPageData);
253 header->pd_upper = header->pd_special;
254
255 /* Set safexid in deleted page */
256 contents = ((BTDeletedPageData *) PageGetContents(page));
257 contents->safexid = safexid;
258}
#define BTP_HAS_FULLXID
Definition: nbtree.h:85
struct BTDeletedPageData BTDeletedPageData
#define BTP_DELETED
Definition: nbtree.h:79
LocationIndex pd_special
Definition: bufpage.h:167
LocationIndex pd_upper
Definition: bufpage.h:166
LocationIndex pd_lower
Definition: bufpage.h:165

References BTP_DELETED, BTP_HAS_FULLXID, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, MAXALIGN, PageGetContents(), PageHeaderData::pd_lower, PageHeaderData::pd_special, PageHeaderData::pd_upper, BTDeletedPageData::safexid, and SizeOfPageHeaderData.

Referenced by _bt_unlink_halfdead_page(), and btree_xlog_unlink_page().
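BTPageSetDeleted(), BTPageGetDeleteXid(), and BTPageIsRecyclable() together describe the deleted-page lifecycle. A simplified sketch of how they fit together (the real callers are _bt_unlink_halfdead_page() and btvacuumpage()):

void
delete_then_maybe_recycle_sketch(Page page, FullTransactionId safexid,
                                 Relation heaprel)
{
    /* page deletion stamps the page with its safexid tombstone */
    BTPageSetDeleted(page, safexid);

    /* ... much later, typically during a subsequent VACUUM ... */
    if (BTPageIsRecyclable(page, heaprel))
    {
        /* safexid no longer visible to anyone: OK to put page in the FSM */
    }
}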

◆ btparallelrescan()

void btparallelrescan ( IndexScanDesc  scan)

Definition at line 842 of file nbtree.c.

843{
844 BTParallelScanDesc btscan;
845 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
846
847 Assert(parallel_scan);
848
849 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
850 parallel_scan->ps_offset_am);
851
852 /*
853 * In theory, we don't need to acquire the LWLock here, because there
854 * shouldn't be any other workers running at this point, but we do so for
855 * consistency.
856 */
861 LWLockRelease(&btscan->btps_lock);
862}

References Assert(), BTPARALLEL_NOT_INITIALIZED, BTParallelScanDescData::btps_lastCurrPage, BTParallelScanDescData::btps_lock, BTParallelScanDescData::btps_nextScanPage, BTParallelScanDescData::btps_pageStatus, InvalidBlockNumber, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), OffsetToPointer, IndexScanDescData::parallel_scan, and ParallelIndexScanDescData::ps_offset_am.

Referenced by bthandler().

◆ btproperty()

bool btproperty ( Oid  index_oid,
int  attno,
IndexAMProperty  prop,
const char *  propname,
bool *  res,
bool *  isnull 
)

Definition at line 627 of file nbtutils.c.

630{
631 switch (prop)
632 {
634 /* answer only for columns, not AM or whole index */
635 if (attno == 0)
636 return false;
637 /* otherwise, btree can always return data */
638 *res = true;
639 return true;
640
641 default:
642 return false; /* punt to generic code */
643 }
644}
@ AMPROP_RETURNABLE
Definition: amapi.h:47

References AMPROP_RETURNABLE.

Referenced by bthandler().

◆ BTreeShmemInit()

void BTreeShmemInit ( void  )

Definition at line 576 of file nbtutils.c.

577{
578 bool found;
579
580 btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State",
582 &found);
583
585 {
586 /* Initialize shared memory area */
587 Assert(!found);
588
589 /*
590 * It doesn't really matter what the cycle counter starts at, but
591 * having it always start the same doesn't seem good. Seed with
592 * low-order bits of time() instead.
593 */
594 btvacinfo->cycle_ctr = (BTCycleId) time(NULL);
595
598 }
599 else
600 Assert(found);
601}
bool IsUnderPostmaster
Definition: globals.c:120
int MaxBackends
Definition: globals.c:146
Size BTreeShmemSize(void)
Definition: nbtutils.c:563
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:389

References Assert(), BTreeShmemSize(), btvacinfo, BTVacInfo::cycle_ctr, IsUnderPostmaster, BTVacInfo::max_vacuums, MaxBackends, BTVacInfo::num_vacuums, and ShmemInitStruct().

Referenced by CreateOrAttachShmemStructs().

◆ BTreeShmemSize()

Size BTreeShmemSize ( void  )

Definition at line 563 of file nbtutils.c.

564{
565 Size size;
566
567 size = offsetof(BTVacInfo, vacuums);
568 size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo)));
569 return size;
570}
Size mul_size(Size s1, Size s2)
Definition: shmem.c:510

References add_size(), MaxBackends, and mul_size().

Referenced by BTreeShmemInit(), and CalculateShmemSize().
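The arithmetic reserves one per-VACUUM slot per backend behind a fixed header. The assumed layout is declared in nbtutils.c, not in this header; the sketch below (with illustrative "Sketch" names) is shown only to motivate the offsetof/mul_size calculation:

typedef struct BTOneVacInfoSketch
{
    LockRelId   relid;          /* identity of the index being vacuumed */
    BTCycleId   cycleid;        /* its active vacuum cycle ID */
} BTOneVacInfoSketch;

typedef struct BTVacInfoSketch
{
    BTCycleId   cycle_ctr;      /* most recently assigned cycle ID */
    int         num_vacuums;    /* number of currently active VACUUMs */
    int         max_vacuums;    /* allocated length of vacuums[] */
    BTOneVacInfoSketch vacuums[FLEXIBLE_ARRAY_MEMBER];
} BTVacInfoSketch;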

◆ BTreeTupleGetDownLink()

static BlockNumber BTreeTupleGetDownLink ( IndexTuple  pivot)
inline static

◆ BTreeTupleGetHeapTID()

static ItemPointer BTreeTupleGetHeapTID ( IndexTuple  itup)
inline static

Definition at line 639 of file nbtree.h.

640{
641 if (BTreeTupleIsPivot(itup))
642 {
 643 /* Pivot tuple heap TID representation? */
 644 if (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
 645 BT_PIVOT_HEAP_TID_ATTR)
 646 return (ItemPointer) ((char *) itup + IndexTupleSize(itup) -
647 sizeof(ItemPointerData));
648
649 /* Heap TID attribute was truncated */
650 return NULL;
651 }
652 else if (BTreeTupleIsPosting(itup))
653 return BTreeTupleGetPosting(itup);
654
655 return &itup->t_tid;
656}
struct ItemPointerData ItemPointerData

References BT_PIVOT_HEAP_TID_ATTR, BTreeTupleGetPosting(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), IndexTupleSize(), ItemPointerGetOffsetNumberNoCheck(), and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_finish_pending(), _bt_check_natts(), _bt_check_third_page(), _bt_compare(), _bt_delitems_delete_check(), _bt_mkscankey(), _bt_swap_posting(), _bt_truncate(), bt_entry_unique_check(), bt_page_print_tuples(), bt_target_page_check(), BTreeTupleGetHeapTIDCareful(), and BTreeTupleGetPointsToTID().

◆ BTreeTupleGetMaxHeapTID()

static ItemPointer BTreeTupleGetMaxHeapTID ( IndexTuple  itup)
inline static

◆ BTreeTupleGetNPosting()

◆ BTreeTupleGetPosting()

static ItemPointer BTreeTupleGetPosting ( IndexTuple  posting)
inline static

◆ BTreeTupleGetPostingN()

◆ BTreeTupleGetPostingOffset()

◆ BTreeTupleGetTopParent()

static BlockNumber BTreeTupleGetTopParent ( IndexTuple  leafhikey)
inline static

Definition at line 621 of file nbtree.h.

622{
623 return ItemPointerGetBlockNumberNoCheck(&leafhikey->t_tid);
624}

References ItemPointerGetBlockNumberNoCheck(), and IndexTupleData::t_tid.

Referenced by _bt_unlink_halfdead_page(), and bt_downlink_missing_check().

◆ BTreeTupleIsPivot()

◆ BTreeTupleIsPosting()

◆ BTreeTupleSetDownLink()

static void BTreeTupleSetDownLink ( IndexTuple  pivot,
BlockNumber  blkno 
)
inline static

Definition at line 563 of file nbtree.h.

564{
565 ItemPointerSetBlockNumber(&pivot->t_tid, blkno);
566}
static void ItemPointerSetBlockNumber(ItemPointerData *pointer, BlockNumber blockNumber)
Definition: itemptr.h:147

References ItemPointerSetBlockNumber(), and IndexTupleData::t_tid.

Referenced by _bt_buildadd(), _bt_insert_parent(), _bt_mark_page_halfdead(), _bt_newlevel(), _bt_uppershutdown(), and btree_xlog_mark_page_halfdead().

◆ BTreeTupleSetNAtts()

static void BTreeTupleSetNAtts ( IndexTuple  itup,
uint16  nkeyatts,
bool  heaptid 
)
inline static

Definition at line 596 of file nbtree.h.

597{
598 Assert(nkeyatts <= INDEX_MAX_KEYS);
599 Assert((nkeyatts & BT_STATUS_OFFSET_MASK) == 0);
600 Assert(!heaptid || nkeyatts > 0);
601 Assert(!BTreeTupleIsPivot(itup) || nkeyatts == 0);
602
603 itup->t_info |= INDEX_ALT_TID_MASK;
604
605 if (heaptid)
606 nkeyatts |= BT_PIVOT_HEAP_TID_ATTR;
607
608 /* BT_IS_POSTING bit is deliberately unset here */
609 ItemPointerSetOffsetNumber(&itup->t_tid, nkeyatts);
611}
#define BT_STATUS_OFFSET_MASK
Definition: nbtree.h:464

References Assert(), BT_PIVOT_HEAP_TID_ATTR, BT_STATUS_OFFSET_MASK, BTreeTupleIsPivot(), INDEX_ALT_TID_MASK, INDEX_MAX_KEYS, ItemPointerSetOffsetNumber(), IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_buildadd(), _bt_newlevel(), _bt_pgaddtup(), _bt_sortaddtup(), _bt_truncate(), and BTreeTupleSetTopParent().
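For pivot and posting-list tuples the offset-number word of t_tid is reused as a bit field. A small sketch of reading it back, the inverse of what BTreeTupleSetNAtts()/BTreeTupleSetPosting() store (plain leaf tuples keep a real heap TID there instead; the helper name is illustrative):

static uint16
alt_tid_count_sketch(IndexTuple itup)
{
    uint16      word;

    Assert((itup->t_info & INDEX_ALT_TID_MASK) != 0);
    word = ItemPointerGetOffsetNumberNoCheck(&itup->t_tid);

    if ((word & BT_IS_POSTING) != 0)
        return word & BT_OFFSET_MASK;   /* number of heap TIDs stored */

    /* pivot tuple; BT_PIVOT_HEAP_TID_ATTR may also be set in 'word' */
    return word & BT_OFFSET_MASK;       /* number of key attributes */
}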

◆ BTreeTupleSetPosting()

static void BTreeTupleSetPosting ( IndexTuple  itup,
uint16  nhtids,
int  postingoffset 
)
inline static

Definition at line 505 of file nbtree.h.

506{
507 Assert(nhtids > 1);
508 Assert((nhtids & BT_STATUS_OFFSET_MASK) == 0);
509 Assert((size_t) postingoffset == MAXALIGN(postingoffset));
510 Assert(postingoffset < INDEX_SIZE_MASK);
 511 Assert(!BTreeTupleIsPivot(itup));
 512
513 itup->t_info |= INDEX_ALT_TID_MASK;
 514 ItemPointerSetOffsetNumber(&itup->t_tid, (nhtids | BT_IS_POSTING));
 515 ItemPointerSetBlockNumber(&itup->t_tid, postingoffset);
516}

References Assert(), BT_IS_POSTING, BT_STATUS_OFFSET_MASK, BTreeTupleIsPivot(), INDEX_ALT_TID_MASK, INDEX_SIZE_MASK, ItemPointerSetBlockNumber(), ItemPointerSetOffsetNumber(), MAXALIGN, IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_form_posting(), and _bt_update_posting().

◆ BTreeTupleSetTopParent()

static void BTreeTupleSetTopParent ( IndexTuple  leafhikey,
BlockNumber  blkno 
)
inlinestatic

◆ btrescan()

void btrescan ( IndexScanDesc  scan,
ScanKey  scankey,
int  nscankeys,
ScanKey  orderbys,
int  norderbys 
)

Definition at line 385 of file nbtree.c.

387{
388 BTScanOpaque so = (BTScanOpaque) scan->opaque;
389
390 /* we aren't holding any read locks, but gotta drop the pins */
392 {
393 /* Before leaving current page, deal with any killed items */
394 if (so->numKilled > 0)
395 _bt_killitems(scan);
398 }
399
400 /*
401 * We prefer to eagerly drop leaf page pins before btgettuple returns.
402 * This avoids making VACUUM wait to acquire a cleanup lock on the page.
403 *
404 * We cannot safely drop leaf page pins during index-only scans due to a
405 * race condition involving VACUUM setting pages all-visible in the VM.
406 * It's also unsafe for plain index scans that use a non-MVCC snapshot.
407 *
408 * When we drop pins eagerly, the mechanism that marks so->killedItems[]
409 * index tuples LP_DEAD has to deal with concurrent TID recycling races.
410 * The scheme used to detect unsafe TID recycling won't work when scanning
411 * unlogged relations (since it involves saving an affected page's LSN).
412 * Opt out of eager pin dropping during unlogged relation scans for now
413 * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting).
414 *
415 * Also opt out of dropping leaf page pins eagerly during bitmap scans.
416 * Pins cannot be held for more than an instant during bitmap scans either
417 * way, so we might as well avoid wasting cycles on acquiring page LSNs.
418 *
419 * See nbtree/README section on making concurrent TID recycling safe.
420 *
421 * Note: so->dropPin should never change across rescans.
422 */
423 so->dropPin = (!scan->xs_want_itup &&
426 scan->heapRelation != NULL);
427
428 so->markItemIndex = -1;
429 so->needPrimScan = false;
430 so->scanBehind = false;
431 so->oppositeDirCheck = false;
434
435 /*
436 * Allocate tuple workspace arrays, if needed for an index-only scan and
437 * not already done in a previous rescan call. To save on palloc
438 * overhead, both workspaces are allocated as one palloc block; only this
439 * function and btendscan know that.
440 *
441 * NOTE: this data structure also makes it safe to return data from a
442 * "name" column, even though btree name_ops uses an underlying storage
443 * datatype of cstring. The risk there is that "name" is supposed to be
444 * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
445 * However, since we only return data out of tuples sitting in the
446 * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
447 * data out of the markTuples array --- running off the end of memory for
448 * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
449 * adding special-case treatment for name_ops elsewhere.
450 */
451 if (scan->xs_want_itup && so->currTuples == NULL)
452 {
453 so->currTuples = (char *) palloc(BLCKSZ * 2);
454 so->markTuples = so->currTuples + BLCKSZ;
455 }
456
457 /*
458 * Reset the scan keys
459 */
460 if (scankey && scan->numberOfKeys > 0)
461 memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
462 so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
463 so->numArrayKeys = 0; /* ditto */
464}
#define IsMVCCSnapshot(snapshot)
Definition: snapmgr.h:55

References _bt_killitems(), BTScanPosInvalidate, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanOpaqueData::dropPin, IndexScanDescData::heapRelation, if(), IndexScanDescData::indexRelation, IsMVCCSnapshot, IndexScanDescData::keyData, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numberOfKeys, IndexScanDescData::numberOfKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, palloc(), RelationNeedsWAL, BTScanOpaqueData::scanBehind, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by bthandler().

◆ btrestrpos()

void btrestrpos ( IndexScanDesc  scan)

Definition at line 532 of file nbtree.c.

533{
534 BTScanOpaque so = (BTScanOpaque) scan->opaque;
535
536 if (so->markItemIndex >= 0)
537 {
538 /*
539 * The scan has never moved to a new page since the last mark. Just
540 * restore the itemIndex.
541 *
542 * NB: In this case we can't count on anything in so->markPos to be
543 * accurate.
544 */
546 }
547 else
548 {
549 /*
550 * The scan moved to a new page after last mark or restore, and we are
551 * now restoring to the marked page. We aren't holding any read
552 * locks, but if we're still holding the pin for the current position,
553 * we must drop it.
554 */
555 if (BTScanPosIsValid(so->currPos))
556 {
557 /* Before leaving current page, deal with any killed items */
558 if (so->numKilled > 0)
559 _bt_killitems(scan);
561 }
562
563 if (BTScanPosIsValid(so->markPos))
564 {
565 /* bump pin on mark buffer for assignment to current buffer */
566 if (BTScanPosIsPinned(so->markPos))
568 memcpy(&so->currPos, &so->markPos,
569 offsetof(BTScanPosData, items[1]) +
570 so->markPos.lastItem * sizeof(BTScanPosItem));
571 if (so->currTuples)
572 memcpy(so->currTuples, so->markTuples,
574 /* Reset the scan's array keys (see _bt_steppage for why) */
575 if (so->numArrayKeys)
576 {
578 so->needPrimScan = false;
579 }
580 }
581 else
583 }
584}
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:5398
static ItemArray items
Definition: test_tidstore.c:48

References _bt_killitems(), _bt_start_array_keys(), BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanPosData::buf, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::dir, if(), IncrBufferRefCount(), BTScanPosData::itemIndex, items, BTScanPosData::lastItem, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanOpaqueData::needPrimScan, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numKilled, and IndexScanDescData::opaque.

Referenced by bthandler().

◆ bttranslatecmptype()

StrategyNumber bttranslatecmptype ( CompareType  cmptype,
Oid  opfamily 
)

Definition at line 1849 of file nbtree.c.

1850{
1851 switch (cmptype)
1852 {
1853 case COMPARE_LT:
1854 return BTLessStrategyNumber;
1855 case COMPARE_LE:
1857 case COMPARE_EQ:
1858 return BTEqualStrategyNumber;
1859 case COMPARE_GE:
1861 case COMPARE_GT:
1863 default:
1864 return InvalidStrategy;
1865 }
1866}
@ COMPARE_LE
Definition: cmptype.h:35
@ COMPARE_GT
Definition: cmptype.h:38
@ COMPARE_EQ
Definition: cmptype.h:36
@ COMPARE_GE
Definition: cmptype.h:37
@ COMPARE_LT
Definition: cmptype.h:34

References BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, COMPARE_EQ, COMPARE_GE, COMPARE_GT, COMPARE_LE, COMPARE_LT, and InvalidStrategy.

Referenced by bthandler().

◆ bttranslatestrategy()

CompareType bttranslatestrategy ( StrategyNumber  strategy,
Oid  opfamily 
)

Definition at line 1829 of file nbtree.c.

1830{
1831 switch (strategy)
1832 {
1834 return COMPARE_LT;
1836 return COMPARE_LE;
1838 return COMPARE_EQ;
1840 return COMPARE_GE;
1842 return COMPARE_GT;
1843 default:
1844 return COMPARE_INVALID;
1845 }
1846}
@ COMPARE_INVALID
Definition: cmptype.h:33

References BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, COMPARE_EQ, COMPARE_GE, COMPARE_GT, COMPARE_INVALID, COMPARE_LE, and COMPARE_LT.

Referenced by bthandler().
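bttranslatestrategy() and bttranslatecmptype() are inverses over the five valid btree strategy numbers; a small sketch of checking that property (illustrative helper):

static void
strategy_roundtrip_sketch(Oid opfamily)
{
    for (StrategyNumber strat = BTLessStrategyNumber;
         strat <= BTGreaterStrategyNumber;
         strat++)
    {
        CompareType cmp = bttranslatestrategy(strat, opfamily);

        Assert(cmp != COMPARE_INVALID);
        Assert(bttranslatecmptype(cmp, opfamily) == strat);
    }
}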

◆ btvacuumcleanup()

IndexBulkDeleteResult * btvacuumcleanup ( IndexVacuumInfo *  info,
IndexBulkDeleteResult *  stats 
)

Definition at line 1164 of file nbtree.c.

1165{
1166 BlockNumber num_delpages;
1167
1168 /* No-op in ANALYZE ONLY mode */
1169 if (info->analyze_only)
1170 return stats;
1171
1172 /*
1173 * If btbulkdelete was called, we need not do anything (we just maintain
1174 * the information used within _bt_vacuum_needs_cleanup() by calling
1175 * _bt_set_cleanup_info() below).
1176 *
1177 * If btbulkdelete was _not_ called, then we have a choice to make: we
1178 * must decide whether or not a btvacuumscan() call is needed now (i.e.
1179 * whether the ongoing VACUUM operation can entirely avoid a physical scan
1180 * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
1181 * now.
1182 */
1183 if (stats == NULL)
1184 {
1185 /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
1186 if (!_bt_vacuum_needs_cleanup(info->index))
1187 return NULL;
1188
1189 /*
1190 * Since we aren't going to actually delete any leaf items, there's no
1191 * need to go through all the vacuum-cycle-ID pushups here.
1192 *
1193 * Posting list tuples are a source of inaccuracy for cleanup-only
1194 * scans. btvacuumscan() will assume that the number of index tuples
1195 * from each page can be used as num_index_tuples, even though
1196 * num_index_tuples is supposed to represent the number of TIDs in the
1197 * index. This naive approach can underestimate the number of tuples
1198 * in the index significantly.
1199 *
1200 * We handle the problem by making num_index_tuples an estimate in
1201 * cleanup-only case.
1202 */
1204 btvacuumscan(info, stats, NULL, NULL, 0);
1205 stats->estimated_count = true;
1206 }
1207
1208 /*
1209 * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
1210 *
1211 * num_delpages is the number of deleted pages now in the index that were
1212 * not safe to place in the FSM to be recycled just yet. num_delpages is
1213 * greater than 0 only when _bt_pagedel() actually deleted pages during
1214 * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
1215 * have failed to place any newly deleted pages in the FSM just moments
1216 * ago. (Actually, there are edge cases where recycling of the current
1217 * VACUUM's newly deleted pages does not even become safe by the time the
1218 * next VACUUM comes around. See nbtree/README.)
1219 */
1220 Assert(stats->pages_deleted >= stats->pages_free);
1221 num_delpages = stats->pages_deleted - stats->pages_free;
1222 _bt_set_cleanup_info(info->index, num_delpages);
1223
1224 /*
1225 * It's quite possible for us to be fooled by concurrent page splits into
1226 * double-counting some index tuples, so disbelieve any total that exceeds
1227 * the underlying heap's count ... if we know that accurately. Otherwise
1228 * this might just make matters worse.
1229 */
1230 if (!info->estimated_count)
1231 {
1232 if (stats->num_index_tuples > info->num_heap_tuples)
1233 stats->num_index_tuples = info->num_heap_tuples;
1234 }
1235
1236 return stats;
1237}
void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
Definition: nbtpage.c:233
bool _bt_vacuum_needs_cleanup(Relation rel)
Definition: nbtpage.c:180
BlockNumber pages_deleted
Definition: genam.h:109
double num_index_tuples
Definition: genam.h:106
double num_heap_tuples
Definition: genam.h:79
bool analyze_only
Definition: genam.h:75
bool estimated_count
Definition: genam.h:77

References _bt_set_cleanup_info(), _bt_vacuum_needs_cleanup(), IndexVacuumInfo::analyze_only, Assert(), btvacuumscan(), IndexVacuumInfo::estimated_count, IndexBulkDeleteResult::estimated_count, IndexVacuumInfo::index, IndexVacuumInfo::num_heap_tuples, IndexBulkDeleteResult::num_index_tuples, IndexBulkDeleteResult::pages_deleted, IndexBulkDeleteResult::pages_free, and palloc0_object.

Referenced by bthandler().

◆ btvalidate()

bool btvalidate ( Oid  opclassoid)

Definition at line 40 of file nbtvalidate.c.

41{
42 bool result = true;
43 HeapTuple classtup;
44 Form_pg_opclass classform;
45 Oid opfamilyoid;
46 Oid opcintype;
47 char *opclassname;
48 char *opfamilyname;
49 CatCList *proclist,
50 *oprlist;
51 List *grouplist;
52 OpFamilyOpFuncGroup *opclassgroup;
53 List *familytypes;
54 int usefulgroups;
55 int i;
56 ListCell *lc;
57
58 /* Fetch opclass information */
59 classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid));
60 if (!HeapTupleIsValid(classtup))
61 elog(ERROR, "cache lookup failed for operator class %u", opclassoid);
62 classform = (Form_pg_opclass) GETSTRUCT(classtup);
63
64 opfamilyoid = classform->opcfamily;
65 opcintype = classform->opcintype;
66 opclassname = NameStr(classform->opcname);
67
68 /* Fetch opfamily information */
69 opfamilyname = get_opfamily_name(opfamilyoid, false);
70
71 /* Fetch all operators and support functions of the opfamily */
72 oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid));
73 proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid));
74
75 /* Check individual support functions */
76 for (i = 0; i < proclist->n_members; i++)
77 {
78 HeapTuple proctup = &proclist->members[i]->tuple;
79 Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup);
80 bool ok;
81
82 /* Check procedure numbers and function signatures */
83 switch (procform->amprocnum)
84 {
85 case BTORDER_PROC:
86 ok = check_amproc_signature(procform->amproc, INT4OID, true,
87 2, 2, procform->amproclefttype,
88 procform->amprocrighttype);
89 break;
91 ok = check_amproc_signature(procform->amproc, VOIDOID, true,
92 1, 1, INTERNALOID);
93 break;
94 case BTINRANGE_PROC:
95 ok = check_amproc_signature(procform->amproc, BOOLOID, true,
96 5, 5,
97 procform->amproclefttype,
98 procform->amproclefttype,
99 procform->amprocrighttype,
100 BOOLOID, BOOLOID);
101 break;
103 ok = check_amproc_signature(procform->amproc, BOOLOID, true,
104 1, 1, OIDOID);
105 break;
106 case BTOPTIONS_PROC:
107 ok = check_amoptsproc_signature(procform->amproc);
108 break;
110 ok = check_amproc_signature(procform->amproc, VOIDOID, true,
111 1, 1, INTERNALOID);
112 break;
113 default:
115 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
116 errmsg("operator family \"%s\" of access method %s contains function %s with invalid support number %d",
117 opfamilyname, "btree",
118 format_procedure(procform->amproc),
119 procform->amprocnum)));
120 result = false;
121 continue; /* don't want additional message */
122 }
123
124 if (!ok)
125 {
127 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
128 errmsg("operator family \"%s\" of access method %s contains function %s with wrong signature for support number %d",
129 opfamilyname, "btree",
130 format_procedure(procform->amproc),
131 procform->amprocnum)));
132 result = false;
133 }
134 }
135
136 /* Check individual operators */
137 for (i = 0; i < oprlist->n_members; i++)
138 {
139 HeapTuple oprtup = &oprlist->members[i]->tuple;
140 Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup);
141
142 /* Check that only allowed strategy numbers exist */
143 if (oprform->amopstrategy < 1 ||
144 oprform->amopstrategy > BTMaxStrategyNumber)
145 {
147 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
148 errmsg("operator family \"%s\" of access method %s contains operator %s with invalid strategy number %d",
149 opfamilyname, "btree",
150 format_operator(oprform->amopopr),
151 oprform->amopstrategy)));
152 result = false;
153 }
154
155 /* btree doesn't support ORDER BY operators */
156 if (oprform->amoppurpose != AMOP_SEARCH ||
157 OidIsValid(oprform->amopsortfamily))
158 {
160 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
161 errmsg("operator family \"%s\" of access method %s contains invalid ORDER BY specification for operator %s",
162 opfamilyname, "btree",
163 format_operator(oprform->amopopr))));
164 result = false;
165 }
166
167 /* Check operator signature --- same for all btree strategies */
168 if (!check_amop_signature(oprform->amopopr, BOOLOID,
169 oprform->amoplefttype,
170 oprform->amoprighttype))
171 {
173 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
174 errmsg("operator family \"%s\" of access method %s contains operator %s with wrong signature",
175 opfamilyname, "btree",
176 format_operator(oprform->amopopr))));
177 result = false;
178 }
179 }
180
181 /* Now check for inconsistent groups of operators/functions */
182 grouplist = identify_opfamily_groups(oprlist, proclist);
183 usefulgroups = 0;
184 opclassgroup = NULL;
185 familytypes = NIL;
186 foreach(lc, grouplist)
187 {
189
190 /*
191 * It is possible for an in_range support function to have a RHS type
192 * that is otherwise irrelevant to the opfamily --- for instance, SQL
193 * requires the datetime_ops opclass to have range support with an
194 * interval offset. So, if this group appears to contain only an
195 * in_range function, ignore it: it doesn't represent a pair of
196 * supported types.
197 */
198 if (thisgroup->operatorset == 0 &&
199 thisgroup->functionset == (1 << BTINRANGE_PROC))
200 continue;
201
202 /* Else count it as a relevant group */
203 usefulgroups++;
204
205 /* Remember the group exactly matching the test opclass */
206 if (thisgroup->lefttype == opcintype &&
207 thisgroup->righttype == opcintype)
208 opclassgroup = thisgroup;
209
210 /*
211 * Identify all distinct data types handled in this opfamily. This
212 * implementation is O(N^2), but there aren't likely to be enough
213 * types in the family for it to matter.
214 */
215 familytypes = list_append_unique_oid(familytypes, thisgroup->lefttype);
216 familytypes = list_append_unique_oid(familytypes, thisgroup->righttype);
217
218 /*
219 * Complain if there seems to be an incomplete set of either operators
220 * or support functions for this datatype pair. The sortsupport,
221 * in_range, and equalimage functions are considered optional.
222 */
223 if (thisgroup->operatorset !=
224 ((1 << BTLessStrategyNumber) |
226 (1 << BTEqualStrategyNumber) |
229 {
231 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
232 errmsg("operator family \"%s\" of access method %s is missing operator(s) for types %s and %s",
233 opfamilyname, "btree",
234 format_type_be(thisgroup->lefttype),
235 format_type_be(thisgroup->righttype))));
236 result = false;
237 }
238 if ((thisgroup->functionset & (1 << BTORDER_PROC)) == 0)
239 {
241 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
242 errmsg("operator family \"%s\" of access method %s is missing support function for types %s and %s",
243 opfamilyname, "btree",
244 format_type_be(thisgroup->lefttype),
245 format_type_be(thisgroup->righttype))));
246 result = false;
247 }
248 }
249
250 /* Check that the originally-named opclass is supported */
251 /* (if group is there, we already checked it adequately above) */
252 if (!opclassgroup)
253 {
254 ereport(INFO,
255 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
256 errmsg("operator class \"%s\" of access method %s is missing operator(s)",
257 opclassname, "btree")));
258 result = false;
259 }
260
261 /*
262 * Complain if the opfamily doesn't have entries for all possible
263 * combinations of its supported datatypes. While missing cross-type
264 * operators are not fatal, they do limit the planner's ability to derive
265 * additional qual clauses from equivalence classes, so it seems
266 * reasonable to insist that all built-in btree opfamilies be complete.
267 */
268 if (usefulgroups != (list_length(familytypes) * list_length(familytypes)))
269 {
270 ereport(INFO,
271 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
272 errmsg("operator family \"%s\" of access method %s is missing cross-type operator(s)",
273 opfamilyname, "btree")));
274 result = false;
275 }
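/*
 * With three member types, as in integer_ops, completeness therefore means
 * 3 * 3 = 9 operator/function groups, one per (lefttype, righttype)
 * combination.
 */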
276
277 ReleaseCatCacheList(proclist);
278 ReleaseCatCacheList(oprlist);
279 ReleaseSysCache(classtup);
280
281 return result;
282}
bool check_amproc_signature(Oid funcid, Oid restype, bool exact, int minargs, int maxargs,...)
Definition: amvalidate.c:152
bool check_amop_signature(Oid opno, Oid restype, Oid lefttype, Oid righttype)
Definition: amvalidate.c:206
List * identify_opfamily_groups(CatCList *oprlist, CatCList *proclist)
Definition: amvalidate.c:43
bool check_amoptsproc_signature(Oid funcid)
Definition: amvalidate.c:192
#define NameStr(name)
Definition: c.h:765
void ReleaseCatCacheList(CatCList *list)
Definition: catcache.c:2100
#define INFO
Definition: elog.h:34
char * format_type_be(Oid type_oid)
Definition: format_type.c:343
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
static void * GETSTRUCT(const HeapTupleData *tuple)
Definition: htup_details.h:728
List * list_append_unique_oid(List *list, Oid datum)
Definition: list.c:1380
char * get_opfamily_name(Oid opfid, bool missing_ok)
Definition: lsyscache.c:1418
#define BTSKIPSUPPORT_PROC
Definition: nbtree.h:722
#define BTSORTSUPPORT_PROC
Definition: nbtree.h:718
#define BTINRANGE_PROC
Definition: nbtree.h:719
#define BTOPTIONS_PROC
Definition: nbtree.h:721
FormData_pg_amop * Form_pg_amop
Definition: pg_amop.h:88
FormData_pg_amproc * Form_pg_amproc
Definition: pg_amproc.h:68
static int list_length(const List *l)
Definition: pg_list.h:152
#define NIL
Definition: pg_list.h:68
FormData_pg_opclass * Form_pg_opclass
Definition: pg_opclass.h:83
char * format_procedure(Oid procedure_oid)
Definition: regproc.c:305
char * format_operator(Oid operator_oid)
Definition: regproc.c:801
Definition: pg_list.h:54
CatCTup * members[FLEXIBLE_ARRAY_MEMBER]
Definition: catcache.h:185
int n_members
Definition: catcache.h:183
HeapTupleData tuple
Definition: catcache.h:124
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:264
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:220
#define SearchSysCacheList1(cacheId, key1)
Definition: syscache.h:127

References BTEQUALIMAGE_PROC, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTINRANGE_PROC, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTMaxStrategyNumber, BTOPTIONS_PROC, BTORDER_PROC, BTSKIPSUPPORT_PROC, BTSORTSUPPORT_PROC, check_amop_signature(), check_amoptsproc_signature(), check_amproc_signature(), elog, ereport, errcode(), errmsg(), ERROR, format_operator(), format_procedure(), format_type_be(), OpFamilyOpFuncGroup::functionset, get_opfamily_name(), GETSTRUCT(), HeapTupleIsValid, i, identify_opfamily_groups(), INFO, OpFamilyOpFuncGroup::lefttype, lfirst, list_append_unique_oid(), list_length(), catclist::members, catclist::n_members, NameStr, NIL, ObjectIdGetDatum(), OidIsValid, OpFamilyOpFuncGroup::operatorset, ReleaseCatCacheList(), ReleaseSysCache(), OpFamilyOpFuncGroup::righttype, SearchSysCache1(), SearchSysCacheList1, and catctup::tuple.

Referenced by bthandler().
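As a rough sketch of that reference (not taken from this page), an index access method exposes its validator by filling the amvalidate field of IndexAmRoutine in its handler function; bthandler() registers btvalidate() in exactly this way. The handler name below is hypothetical and the remaining callbacks are elided.

#include "postgres.h"
#include "fmgr.h"
#include "access/amapi.h"
#include "access/nbtree.h"

PG_FUNCTION_INFO_V1(my_am_handler);	/* hypothetical extension handler */

Datum
my_am_handler(PG_FUNCTION_ARGS)
{
	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

	/* called when the SQL-level amvalidate(regclass) check is run */
	amroutine->amvalidate = btvalidate;
	/* ... other IndexAmRoutine callbacks omitted ... */

	PG_RETURN_POINTER(amroutine);
}

With this wiring, running amvalidate() on an operator class of the access method reaches btvalidate() through the returned routine.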

◆ StaticAssertDecl()

StaticAssertDecl ( BT_OFFSET_MASK >= INDEX_MAX_KEYS,
"BT_OFFSET_MASK can't fit INDEX_MAX_KEYS"
)