PostgreSQL Source Code git master
Loading...
Searching...
No Matches
nbtree.h File Reference
#include "access/amapi.h"
#include "access/itup.h"
#include "access/sdir.h"
#include "catalog/pg_am_d.h"
#include "catalog/pg_class.h"
#include "catalog/pg_index.h"
#include "lib/stringinfo.h"
#include "storage/bufmgr.h"
#include "storage/dsm.h"
#include "storage/shm_toc.h"
#include "utils/skipsupport.h"
Include dependency graph for nbtree.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures

struct  BTPageOpaqueData
 
struct  BTMetaPageData
 
struct  BTDeletedPageData
 
struct  BTPendingFSM
 
struct  BTVacState
 
struct  BTStackData
 
struct  BTScanInsertData
 
struct  BTInsertStateData
 
struct  BTDedupInterval
 
struct  BTDedupStateData
 
struct  BTVacuumPostingData
 
struct  BTScanPosItem
 
struct  BTScanPosData
 
struct  BTArrayKeyInfo
 
struct  BTScanOpaqueData
 
struct  BTOptions
 

Macros

#define BTPageGetOpaque(page)   ((BTPageOpaque) PageGetSpecialPointer(page))
 
#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */
 
#define BTP_ROOT   (1 << 1) /* root page (has no parent) */
 
#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */
 
#define BTP_META   (1 << 3) /* meta-page */
 
#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */
 
#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */
 
#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples (deprecated) */
 
#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */
 
#define BTP_HAS_FULLXID   (1 << 8) /* contains BTDeletedPageData */
 
#define MAX_BT_CYCLE_ID   0xFF7F
 
#define BTPageGetMeta(p)    ((BTMetaPageData *) PageGetContents(p))
 
#define BTREE_METAPAGE   0 /* first page is meta */
 
#define BTREE_MAGIC   0x053162 /* magic number in metapage */
 
#define BTREE_VERSION   4 /* current version number */
 
#define BTREE_MIN_VERSION   2 /* minimum supported version */
 
#define BTREE_NOVAC_VERSION   3 /* version with all meta fields set */
 
#define BTMaxItemSize
 
#define BTMaxItemSizeNoHeapTid
 
#define MaxTIDsPerBTreePage
 
#define BTREE_MIN_FILLFACTOR   10
 
#define BTREE_DEFAULT_FILLFACTOR   90
 
#define BTREE_NONLEAF_FILLFACTOR   70
 
#define BTREE_SINGLEVAL_FILLFACTOR   96
 
#define P_NONE   0
 
#define P_LEFTMOST(opaque)   ((opaque)->btpo_prev == P_NONE)
 
#define P_RIGHTMOST(opaque)   ((opaque)->btpo_next == P_NONE)
 
#define P_ISLEAF(opaque)   (((opaque)->btpo_flags & BTP_LEAF) != 0)
 
#define P_ISROOT(opaque)   (((opaque)->btpo_flags & BTP_ROOT) != 0)
 
#define P_ISDELETED(opaque)   (((opaque)->btpo_flags & BTP_DELETED) != 0)
 
#define P_ISMETA(opaque)   (((opaque)->btpo_flags & BTP_META) != 0)
 
#define P_ISHALFDEAD(opaque)   (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)
 
#define P_IGNORE(opaque)   (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
 
#define P_HAS_GARBAGE(opaque)   (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
 
#define P_INCOMPLETE_SPLIT(opaque)   (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
 
#define P_HAS_FULLXID(opaque)   (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
 
#define P_HIKEY   ((OffsetNumber) 1)
 
#define P_FIRSTKEY   ((OffsetNumber) 2)
 
#define P_FIRSTDATAKEY(opaque)   (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
 
#define INDEX_ALT_TID_MASK   INDEX_AM_RESERVED_BIT
 
#define BT_OFFSET_MASK   0x0FFF
 
#define BT_STATUS_OFFSET_MASK   0xF000
 
#define BT_PIVOT_HEAP_TID_ATTR   0x1000
 
#define BT_IS_POSTING   0x2000
 
#define BTreeTupleGetNAtts(itup, rel)
 
#define BTCommuteStrategyNumber(strat)   (BTMaxStrategyNumber + 1 - (strat))
 
#define BTORDER_PROC   1
 
#define BTSORTSUPPORT_PROC   2
 
#define BTINRANGE_PROC   3
 
#define BTEQUALIMAGE_PROC   4
 
#define BTOPTIONS_PROC   5
 
#define BTSKIPSUPPORT_PROC   6
 
#define BTNProcs   6
 
#define BT_READ   BUFFER_LOCK_SHARE
 
#define BT_WRITE   BUFFER_LOCK_EXCLUSIVE
 
#define BTScanPosIsPinned(scanpos)
 
#define BTScanPosUnpin(scanpos)
 
#define BTScanPosUnpinIfPinned(scanpos)
 
#define BTScanPosIsValid(scanpos)
 
#define BTScanPosInvalidate(scanpos)
 
#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */
 
#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */
 
#define SK_BT_SKIP   0x00040000 /* skip array on column without input = */
 
#define SK_BT_MINVAL   0x00080000 /* invalid sk_argument, use low_compare */
 
#define SK_BT_MAXVAL   0x00100000 /* invalid sk_argument, use high_compare */
 
#define SK_BT_NEXT   0x00200000 /* positions the scan > sk_argument */
 
#define SK_BT_PRIOR   0x00400000 /* positions the scan < sk_argument */
 
#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */
 
#define SK_BT_DESC   (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
 
#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
 
#define BTGetFillFactor(relation)
 
#define BTGetTargetPageFreeSpace(relation)    (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)
 
#define BTGetDeduplicateItems(relation)
 
#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4
 
#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5
 

Typedefs

typedef uint16 BTCycleId
 
typedef struct BTPageOpaqueData BTPageOpaqueData
 
typedef BTPageOpaqueData * BTPageOpaque
 
typedef struct BTMetaPageData BTMetaPageData
 
typedef struct BTDeletedPageData BTDeletedPageData
 
typedef struct BTPendingFSM BTPendingFSM
 
typedef struct BTVacState BTVacState
 
typedef struct BTStackData BTStackData
 
typedef BTStackData * BTStack
 
typedef struct BTScanInsertData BTScanInsertData
 
typedef BTScanInsertData * BTScanInsert
 
typedef struct BTInsertStateData BTInsertStateData
 
typedef BTInsertStateData * BTInsertState
 
typedef struct BTDedupInterval BTDedupInterval
 
typedef struct BTDedupStateData BTDedupStateData
 
typedef BTDedupStateData * BTDedupState
 
typedef struct BTVacuumPostingData BTVacuumPostingData
 
typedef BTVacuumPostingData * BTVacuumPosting
 
typedef struct BTScanPosItem BTScanPosItem
 
typedef struct BTScanPosData BTScanPosData
 
typedef BTScanPosData * BTScanPos
 
typedef struct BTArrayKeyInfo BTArrayKeyInfo
 
typedef struct BTScanOpaqueData BTScanOpaqueData
 
typedef BTScanOpaqueData * BTScanOpaque
 
typedef struct BTOptions BTOptions
 

Functions

static void BTPageSetDeleted (Page page, FullTransactionId safexid)
 
static FullTransactionId BTPageGetDeleteXid (Page page)
 
static bool BTPageIsRecyclable (Page page, Relation heaprel)
 
 StaticAssertDecl (BT_OFFSET_MASK >= INDEX_MAX_KEYS, "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS")
 
static bool BTreeTupleIsPivot (IndexTuple itup)
 
static bool BTreeTupleIsPosting (IndexTuple itup)
 
static void BTreeTupleSetPosting (IndexTuple itup, uint16 nhtids, int postingoffset)
 
static uint16 BTreeTupleGetNPosting (IndexTuple posting)
 
static uint32 BTreeTupleGetPostingOffset (IndexTuple posting)
 
static ItemPointer BTreeTupleGetPosting (IndexTuple posting)
 
static ItemPointer BTreeTupleGetPostingN (IndexTuple posting, int n)
 
static BlockNumber BTreeTupleGetDownLink (IndexTuple pivot)
 
static void BTreeTupleSetDownLink (IndexTuple pivot, BlockNumber blkno)
 
static void BTreeTupleSetNAtts (IndexTuple itup, uint16 nkeyatts, bool heaptid)
 
static BlockNumber BTreeTupleGetTopParent (IndexTuple leafhikey)
 
static void BTreeTupleSetTopParent (IndexTuple leafhikey, BlockNumber blkno)
 
static ItemPointer BTreeTupleGetHeapTID (IndexTuple itup)
 
static ItemPointer BTreeTupleGetMaxHeapTID (IndexTuple itup)
 
void btbuildempty (Relation index)
 
bool btinsert (Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo)
 
IndexScanDesc btbeginscan (Relation rel, int nkeys, int norderbys)
 
Size btestimateparallelscan (Relation rel, int nkeys, int norderbys)
 
void btinitparallelscan (void *target)
 
bool btgettuple (IndexScanDesc scan, ScanDirection dir)
 
int64 btgetbitmap (IndexScanDesc scan, TIDBitmap *tbm)
 
void btrescan (IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
 
void btparallelrescan (IndexScanDesc scan)
 
void btendscan (IndexScanDesc scan)
 
void btmarkpos (IndexScanDesc scan)
 
void btrestrpos (IndexScanDesc scan)
 
IndexBulkDeleteResult * btbulkdelete (IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
 
IndexBulkDeleteResult * btvacuumcleanup (IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
bool btcanreturn (Relation index, int attno)
 
int btgettreeheight (Relation rel)
 
CompareType bttranslatestrategy (StrategyNumber strategy, Oid opfamily)
 
StrategyNumber bttranslatecmptype (CompareType cmptype, Oid opfamily)
 
bool _bt_parallel_seize (IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
 
void _bt_parallel_release (IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
 
void _bt_parallel_done (IndexScanDesc scan)
 
void _bt_parallel_primscan_schedule (IndexScanDesc scan, BlockNumber curr_page)
 
void _bt_dedup_pass (Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, bool bottomupdedup)
 
bool _bt_bottomupdel_pass (Relation rel, Buffer buf, Relation heapRel, Size newitemsz)
 
void _bt_dedup_start_pending (BTDedupState state, IndexTuple base, OffsetNumber baseoff)
 
bool _bt_dedup_save_htid (BTDedupState state, IndexTuple itup)
 
Size _bt_dedup_finish_pending (Page newpage, BTDedupState state)
 
IndexTuple _bt_form_posting (IndexTuple base, const ItemPointerData *htids, int nhtids)
 
void _bt_update_posting (BTVacuumPosting vacposting)
 
IndexTuple _bt_swap_posting (IndexTuple newitem, IndexTuple oposting, int postingoff)
 
bool _bt_doinsert (Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
 
void _bt_finish_split (Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
 
Buffer _bt_getstackbuf (Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
 
OffsetNumber _bt_findsplitloc (Relation rel, Page origpage, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
 
void _bt_initmetapage (Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
 
bool _bt_vacuum_needs_cleanup (Relation rel)
 
void _bt_set_cleanup_info (Relation rel, BlockNumber num_delpages)
 
void _bt_upgrademetapage (Page page)
 
Buffer _bt_getroot (Relation rel, Relation heaprel, int access)
 
Buffer _bt_gettrueroot (Relation rel)
 
int _bt_getrootheight (Relation rel)
 
void _bt_metaversion (Relation rel, bool *heapkeyspace, bool *allequalimage)
 
void _bt_checkpage (Relation rel, Buffer buf)
 
Buffer _bt_getbuf (Relation rel, BlockNumber blkno, int access)
 
Buffer _bt_allocbuf (Relation rel, Relation heaprel)
 
Buffer _bt_relandgetbuf (Relation rel, Buffer obuf, BlockNumber blkno, int access)
 
void _bt_relbuf (Relation rel, Buffer buf)
 
void _bt_lockbuf (Relation rel, Buffer buf, int access)
 
void _bt_unlockbuf (Relation rel, Buffer buf)
 
bool _bt_conditionallockbuf (Relation rel, Buffer buf)
 
void _bt_upgradelockbufcleanup (Relation rel, Buffer buf)
 
void _bt_pageinit (Page page, Size size)
 
void _bt_delitems_vacuum (Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
 
void _bt_delitems_delete_check (Relation rel, Buffer buf, Relation heapRel, struct TM_IndexDeleteOp *delstate)
 
void _bt_pagedel (Relation rel, Buffer leafbuf, BTVacState *vstate)
 
void _bt_pendingfsm_init (Relation rel, BTVacState *vstate, bool cleanuponly)
 
void _bt_pendingfsm_finalize (Relation rel, BTVacState *vstate)
 
void _bt_preprocess_keys (IndexScanDesc scan)
 
bool _bt_readpage (IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool firstpage)
 
void _bt_start_array_keys (IndexScanDesc scan, ScanDirection dir)
 
int _bt_binsrch_array_skey (FmgrInfo *orderproc, bool cur_elem_trig, ScanDirection dir, Datum tupdatum, bool tupnull, BTArrayKeyInfo *array, ScanKey cur, int32 *set_elem_result)
 
BTStack _bt_search (Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access, bool returnstack)
 
OffsetNumber _bt_binsrch_insert (Relation rel, BTInsertState insertstate)
 
int32 _bt_compare (Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost)
 
BTScanInsert _bt_mkscankey (Relation rel, IndexTuple itup)
 
void _bt_killitems (IndexScanDesc scan)
 
BTCycleId _bt_vacuum_cycleid (Relation rel)
 
BTCycleId _bt_start_vacuum (Relation rel)
 
void _bt_end_vacuum (Relation rel)
 
void _bt_end_vacuum_callback (int code, Datum arg)
 
bytea * btoptions (Datum reloptions, bool validate)
 
bool btproperty (Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull)
 
char * btbuildphasename (int64 phasenum)
 
IndexTuple _bt_truncate (Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
 
int _bt_keep_natts_fast (Relation rel, IndexTuple lastleft, IndexTuple firstright)
 
bool _bt_check_natts (Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
 
void _bt_check_third_page (Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
 
bool _bt_allequalimage (Relation rel, bool debugmessage)
 
bool btvalidate (Oid opclassoid)
 
void btadjustmembers (Oid opfamilyoid, Oid opclassoid, List *operators, List *functions)
 
IndexBuildResult * btbuild (Relation heap, Relation index, struct IndexInfo *indexInfo)
 
void _bt_parallel_build_main (dsm_segment *seg, shm_toc *toc)
 

Macro Definition Documentation

◆ BT_IS_POSTING

#define BT_IS_POSTING   0x2000

Definition at line 467 of file nbtree.h.

◆ BT_OFFSET_MASK

#define BT_OFFSET_MASK   0x0FFF

Definition at line 463 of file nbtree.h.

◆ BT_PIVOT_HEAP_TID_ATTR

#define BT_PIVOT_HEAP_TID_ATTR   0x1000

Definition at line 466 of file nbtree.h.

◆ BT_READ

#define BT_READ   BUFFER_LOCK_SHARE

Definition at line 730 of file nbtree.h.

◆ BT_STATUS_OFFSET_MASK

#define BT_STATUS_OFFSET_MASK   0xF000

Definition at line 464 of file nbtree.h.

◆ BT_WRITE

#define BT_WRITE   BUFFER_LOCK_EXCLUSIVE

Definition at line 731 of file nbtree.h.

◆ BTCommuteStrategyNumber

#define BTCommuteStrategyNumber (   strat)    (BTMaxStrategyNumber + 1 - (strat))

Definition at line 686 of file nbtree.h.

◆ BTEQUALIMAGE_PROC

#define BTEQUALIMAGE_PROC   4

Definition at line 720 of file nbtree.h.

◆ BTGetDeduplicateItems

#define BTGetDeduplicateItems (   relation)
Value:
(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
relation->rd_rel->relam == BTREE_AM_OID), \
((relation)->rd_options ? \
((BTOptions *) (relation)->rd_options)->deduplicate_items : true))
#define AssertMacro(condition)
Definition c.h:944
static int fb(int x)

Definition at line 1135 of file nbtree.h.

1139 : true))

◆ BTGetFillFactor

#define BTGetFillFactor (   relation)
Value:
(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
relation->rd_rel->relam == BTREE_AM_OID), \
(relation)->rd_options ? \
((BTOptions *) (relation)->rd_options)->fillfactor : \
#define BTREE_DEFAULT_FILLFACTOR
Definition nbtree.h:201

Definition at line 1127 of file nbtree.h.

1131 : \

◆ BTGetTargetPageFreeSpace

#define BTGetTargetPageFreeSpace (   relation)     (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)

Definition at line 1133 of file nbtree.h.

◆ BTINRANGE_PROC

#define BTINRANGE_PROC   3

Definition at line 719 of file nbtree.h.

◆ BTMaxItemSize

#define BTMaxItemSize
Value:
MAXALIGN(sizeof(BTPageOpaqueData))) / 3) - \
#define SizeOfPageHeaderData
Definition bufpage.h:241
#define MAXALIGN_DOWN(LEN)
Definition c.h:908
#define MAXALIGN(LEN)
Definition c.h:896

Definition at line 165 of file nbtree.h.

◆ BTMaxItemSizeNoHeapTid

#define BTMaxItemSizeNoHeapTid
Value:

Definition at line 170 of file nbtree.h.

◆ BTNProcs

#define BTNProcs   6

Definition at line 723 of file nbtree.h.

◆ BTOPTIONS_PROC

#define BTOPTIONS_PROC   5

Definition at line 721 of file nbtree.h.

◆ BTORDER_PROC

#define BTORDER_PROC   1

Definition at line 717 of file nbtree.h.

◆ BTP_DELETED

#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */

Definition at line 79 of file nbtree.h.

◆ BTP_HALF_DEAD

#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */

Definition at line 81 of file nbtree.h.

◆ BTP_HAS_FULLXID

#define BTP_HAS_FULLXID   (1 << 8) /* contains BTDeletedPageData */

Definition at line 85 of file nbtree.h.

◆ BTP_HAS_GARBAGE

#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples (deprecated) */

Definition at line 83 of file nbtree.h.

◆ BTP_INCOMPLETE_SPLIT

#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */

Definition at line 84 of file nbtree.h.

◆ BTP_LEAF

#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */

Definition at line 77 of file nbtree.h.

◆ BTP_META

#define BTP_META   (1 << 3) /* meta-page */

Definition at line 80 of file nbtree.h.

◆ BTP_ROOT

#define BTP_ROOT   (1 << 1) /* root page (has no parent) */

Definition at line 78 of file nbtree.h.

◆ BTP_SPLIT_END

#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */

Definition at line 82 of file nbtree.h.

◆ BTPageGetMeta

#define BTPageGetMeta (   p)     ((BTMetaPageData *) PageGetContents(p))

Definition at line 122 of file nbtree.h.

◆ BTPageGetOpaque

#define BTPageGetOpaque (   page)    ((BTPageOpaque) PageGetSpecialPointer(page))

Definition at line 74 of file nbtree.h.

◆ BTREE_DEFAULT_FILLFACTOR

#define BTREE_DEFAULT_FILLFACTOR   90

Definition at line 201 of file nbtree.h.

◆ BTREE_MAGIC

#define BTREE_MAGIC   0x053162 /* magic number in metapage */

Definition at line 150 of file nbtree.h.

◆ BTREE_METAPAGE

#define BTREE_METAPAGE   0 /* first page is meta */

Definition at line 149 of file nbtree.h.

◆ BTREE_MIN_FILLFACTOR

#define BTREE_MIN_FILLFACTOR   10

Definition at line 200 of file nbtree.h.

◆ BTREE_MIN_VERSION

#define BTREE_MIN_VERSION   2 /* minimum supported version */

Definition at line 152 of file nbtree.h.

◆ BTREE_NONLEAF_FILLFACTOR

#define BTREE_NONLEAF_FILLFACTOR   70

Definition at line 202 of file nbtree.h.

◆ BTREE_NOVAC_VERSION

#define BTREE_NOVAC_VERSION   3 /* version with all meta fields set */

Definition at line 153 of file nbtree.h.

◆ BTREE_SINGLEVAL_FILLFACTOR

#define BTREE_SINGLEVAL_FILLFACTOR   96

Definition at line 203 of file nbtree.h.

◆ BTREE_VERSION

#define BTREE_VERSION   4 /* current version number */

Definition at line 151 of file nbtree.h.

◆ BTreeTupleGetNAtts

#define BTreeTupleGetNAtts (   itup,
  rel 
)
Value:
( \
(BTreeTupleIsPivot(itup)) ? \
( \
) \
: \
)
static OffsetNumber ItemPointerGetOffsetNumberNoCheck(const ItemPointerData *pointer)
Definition itemptr.h:114
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition nbtree.h:481
#define BT_OFFSET_MASK
Definition nbtree.h:463
#define IndexRelationGetNumberOfAttributes(relation)
Definition rel.h:528

Definition at line 578 of file nbtree.h.

584 : \
586 )

◆ BTScanPosInvalidate

#define BTScanPosInvalidate (   scanpos)
Value:
do { \
(scanpos).currPage = InvalidBlockNumber; \
} while (0)
#define InvalidBlockNumber
Definition block.h:33
#define InvalidBuffer
Definition buf.h:25
static char buf[DEFAULT_XLOG_SEG_SIZE]

Definition at line 1027 of file nbtree.h.

1028 { \
1030 (scanpos).currPage = InvalidBlockNumber; \
1031 } while (0)

◆ BTScanPosIsPinned

#define BTScanPosIsPinned (   scanpos)
Value:
( \
)
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition block.h:71
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:419

Definition at line 1004 of file nbtree.h.

◆ BTScanPosIsValid

#define BTScanPosIsValid (   scanpos)
Value:

Definition at line 1021 of file nbtree.h.

◆ BTScanPosUnpin

#define BTScanPosUnpin (   scanpos)
Value:
do { \
} while (0)

Definition at line 1010 of file nbtree.h.

1011 { \
1014 } while (0)

◆ BTScanPosUnpinIfPinned

#define BTScanPosUnpinIfPinned (   scanpos)
Value:
do { \
} while (0)
#define BTScanPosIsPinned(scanpos)
Definition nbtree.h:1004

Definition at line 1015 of file nbtree.h.

1016 { \
1019 } while (0)

◆ BTSKIPSUPPORT_PROC

#define BTSKIPSUPPORT_PROC   6

Definition at line 722 of file nbtree.h.

◆ BTSORTSUPPORT_PROC

#define BTSORTSUPPORT_PROC   2

Definition at line 718 of file nbtree.h.

◆ INDEX_ALT_TID_MASK

#define INDEX_ALT_TID_MASK   INDEX_AM_RESERVED_BIT

Definition at line 460 of file nbtree.h.

◆ MAX_BT_CYCLE_ID

#define MAX_BT_CYCLE_ID   0xFF7F

Definition at line 94 of file nbtree.h.

◆ MaxTIDsPerBTreePage

#define MaxTIDsPerBTreePage
Value:

Definition at line 186 of file nbtree.h.

◆ P_FIRSTDATAKEY

#define P_FIRSTDATAKEY (   opaque)    (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)

Definition at line 370 of file nbtree.h.

◆ P_FIRSTKEY

#define P_FIRSTKEY   ((OffsetNumber) 2)

Definition at line 369 of file nbtree.h.

◆ P_HAS_FULLXID

#define P_HAS_FULLXID (   opaque)    (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)

Definition at line 229 of file nbtree.h.

◆ P_HAS_GARBAGE

#define P_HAS_GARBAGE (   opaque)    (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)

Definition at line 227 of file nbtree.h.

◆ P_HIKEY

#define P_HIKEY   ((OffsetNumber) 1)

Definition at line 368 of file nbtree.h.

◆ P_IGNORE

#define P_IGNORE (   opaque)    (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)

Definition at line 226 of file nbtree.h.

◆ P_INCOMPLETE_SPLIT

#define P_INCOMPLETE_SPLIT (   opaque)    (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)

Definition at line 228 of file nbtree.h.

◆ P_ISDELETED

#define P_ISDELETED (   opaque)    (((opaque)->btpo_flags & BTP_DELETED) != 0)

Definition at line 223 of file nbtree.h.

◆ P_ISHALFDEAD

#define P_ISHALFDEAD (   opaque)    (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)

Definition at line 225 of file nbtree.h.

◆ P_ISLEAF

#define P_ISLEAF (   opaque)    (((opaque)->btpo_flags & BTP_LEAF) != 0)

Definition at line 221 of file nbtree.h.

◆ P_ISMETA

#define P_ISMETA (   opaque)    (((opaque)->btpo_flags & BTP_META) != 0)

Definition at line 224 of file nbtree.h.

◆ P_ISROOT

#define P_ISROOT (   opaque)    (((opaque)->btpo_flags & BTP_ROOT) != 0)

Definition at line 222 of file nbtree.h.

◆ P_LEFTMOST

#define P_LEFTMOST (   opaque)    ((opaque)->btpo_prev == P_NONE)

Definition at line 219 of file nbtree.h.

◆ P_NONE

#define P_NONE   0

Definition at line 213 of file nbtree.h.

◆ P_RIGHTMOST

#define P_RIGHTMOST (   opaque)    ((opaque)->btpo_next == P_NONE)

Definition at line 220 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN

#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2

Definition at line 1146 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_LEAF_LOAD

#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5

Definition at line 1149 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_1

#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3

Definition at line 1147 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_2

#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4

Definition at line 1148 of file nbtree.h.

◆ SK_BT_DESC

Definition at line 1116 of file nbtree.h.

◆ SK_BT_INDOPTION_SHIFT

#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */

Definition at line 1115 of file nbtree.h.

◆ SK_BT_MAXVAL

#define SK_BT_MAXVAL   0x00100000 /* invalid sk_argument, use high_compare */

Definition at line 1110 of file nbtree.h.

◆ SK_BT_MINVAL

#define SK_BT_MINVAL   0x00080000 /* invalid sk_argument, use low_compare */

Definition at line 1109 of file nbtree.h.

◆ SK_BT_NEXT

#define SK_BT_NEXT   0x00200000 /* positions the scan > sk_argument */

Definition at line 1111 of file nbtree.h.

◆ SK_BT_NULLS_FIRST

#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)

Definition at line 1117 of file nbtree.h.

◆ SK_BT_PRIOR

#define SK_BT_PRIOR   0x00400000 /* positions the scan < sk_argument */

Definition at line 1112 of file nbtree.h.

◆ SK_BT_REQBKWD

#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */

Definition at line 1105 of file nbtree.h.

◆ SK_BT_REQFWD

#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */

Definition at line 1104 of file nbtree.h.

◆ SK_BT_SKIP

#define SK_BT_SKIP   0x00040000 /* skip array on column without input = */

Definition at line 1106 of file nbtree.h.

Typedef Documentation

◆ BTArrayKeyInfo

◆ BTCycleId

Definition at line 30 of file nbtree.h.

◆ BTDedupInterval

◆ BTDedupState

Definition at line 904 of file nbtree.h.

◆ BTDedupStateData

◆ BTDeletedPageData

◆ BTInsertState

Definition at line 846 of file nbtree.h.

◆ BTInsertStateData

◆ BTMetaPageData

◆ BTOptions

◆ BTPageOpaque

Definition at line 72 of file nbtree.h.

◆ BTPageOpaqueData

◆ BTPendingFSM

◆ BTScanInsert

Definition at line 807 of file nbtree.h.

◆ BTScanInsertData

◆ BTScanOpaque

Definition at line 1097 of file nbtree.h.

◆ BTScanOpaqueData

◆ BTScanPos

Definition at line 1002 of file nbtree.h.

◆ BTScanPosData

◆ BTScanPosItem

◆ BTStack

Definition at line 750 of file nbtree.h.

◆ BTStackData

◆ BTVacState

◆ BTVacuumPosting

Definition at line 925 of file nbtree.h.

◆ BTVacuumPostingData

Function Documentation

◆ _bt_allequalimage()

bool _bt_allequalimage ( Relation  rel,
bool  debugmessage 
)
extern

Definition at line 1175 of file nbtutils.c.

1176{
1177 bool allequalimage = true;
1178
1179 /* INCLUDE indexes can never support deduplication */
1182 return false;
1183
1184 for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++)
1185 {
1186 Oid opfamily = rel->rd_opfamily[i];
1187 Oid opcintype = rel->rd_opcintype[i];
1188 Oid collation = rel->rd_indcollation[i];
1190
1191 equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype,
1193
1194 /*
1195 * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to
1196 * be unsafe. Otherwise, actually call proc and see what it says.
1197 */
1198 if (!OidIsValid(equalimageproc) ||
1200 ObjectIdGetDatum(opcintype))))
1201 {
1202 allequalimage = false;
1203 break;
1204 }
1205 }
1206
1207 if (debugmessage)
1208 {
1209 if (allequalimage)
1210 elog(DEBUG1, "index \"%s\" can safely use deduplication",
1212 else
1213 elog(DEBUG1, "index \"%s\" cannot use deduplication",
1215 }
1216
1217 return allequalimage;
1218}
#define OidIsValid(objectId)
Definition c.h:858
#define DEBUG1
Definition elog.h:31
#define elog(elevel,...)
Definition elog.h:228
Datum OidFunctionCall1Coll(Oid functionId, Oid collation, Datum arg1)
Definition fmgr.c:1413
int i
Definition isn.c:77
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition lsyscache.c:915
#define BTEQUALIMAGE_PROC
Definition nbtree.h:720
static bool DatumGetBool(Datum X)
Definition postgres.h:100
static Datum ObjectIdGetDatum(Oid X)
Definition postgres.h:252
unsigned int Oid
#define RelationGetRelationName(relation)
Definition rel.h:550
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition rel.h:535
Oid * rd_opcintype
Definition rel.h:208
Oid * rd_opfamily
Definition rel.h:207
Oid * rd_indcollation
Definition rel.h:217

References BTEQUALIMAGE_PROC, DatumGetBool(), DEBUG1, elog, fb(), get_opfamily_proc(), i, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, ObjectIdGetDatum(), OidFunctionCall1Coll(), OidIsValid, RelationData::rd_indcollation, RelationData::rd_opcintype, RelationData::rd_opfamily, and RelationGetRelationName.

Referenced by _bt_leafbuild(), bt_index_check_callback(), and btbuildempty().

◆ _bt_allocbuf()

Buffer _bt_allocbuf ( Relation  rel,
Relation  heaprel 
)
extern

Definition at line 874 of file nbtpage.c.

875{
876 Buffer buf;
877 BlockNumber blkno;
878 Page page;
879
880 Assert(heaprel != NULL);
881
882 /*
883 * First see if the FSM knows of any free pages.
884 *
885 * We can't trust the FSM's report unreservedly; we have to check that the
886 * page is still free. (For example, an already-free page could have been
887 * re-used between the time the last VACUUM scanned it and the time the
888 * VACUUM made its FSM updates.)
889 *
890 * In fact, it's worse than that: we can't even assume that it's safe to
891 * take a lock on the reported page. If somebody else has a lock on it,
892 * or even worse our own caller does, we could deadlock. (The own-caller
893 * scenario is actually not improbable. Consider an index on a serial or
894 * timestamp column. Nearly all splits will be at the rightmost page, so
895 * it's entirely likely that _bt_split will call us while holding a lock
896 * on the page most recently acquired from FSM. A VACUUM running
897 * concurrently with the previous split could well have placed that page
898 * back in FSM.)
899 *
900 * To get around that, we ask for only a conditional lock on the reported
901 * page. If we fail, then someone else is using the page, and we may
902 * reasonably assume it's not free. (If we happen to be wrong, the worst
903 * consequence is the page will be lost to use till the next VACUUM, which
904 * is no big problem.)
905 */
906 for (;;)
907 {
908 blkno = GetFreeIndexPage(rel);
909 if (blkno == InvalidBlockNumber)
910 break;
911 buf = ReadBuffer(rel, blkno);
912 if (_bt_conditionallockbuf(rel, buf))
913 {
914 page = BufferGetPage(buf);
915
916 /*
917 * It's possible to find an all-zeroes page in an index. For
918 * example, a backend might successfully extend the relation one
919 * page and then crash before it is able to make a WAL entry for
920 * adding the page. If we find a zeroed page then reclaim it
921 * immediately.
922 */
923 if (PageIsNew(page))
924 {
925 /* Okay to use page. Initialize and return it. */
927 return buf;
928 }
929
930 if (BTPageIsRecyclable(page, heaprel))
931 {
932 /*
933 * If we are generating WAL for Hot Standby then create a WAL
934 * record that will allow us to conflict with queries running
935 * on standby, in case they have snapshots older than safexid
936 * value
937 */
939 {
941
942 /*
943 * Note that we don't register the buffer with the record,
944 * because this operation doesn't modify the page (that
945 * already happened, back when VACUUM deleted the page).
946 * This record only exists to provide a conflict point for
947 * Hot Standby. See record REDO routine comments.
948 */
950 xlrec_reuse.block = blkno;
951 xlrec_reuse.snapshotConflictHorizon = BTPageGetDeleteXid(page);
952 xlrec_reuse.isCatalogRel =
954
957
959 }
960
961 /* Okay to use page. Re-initialize and return it. */
963 return buf;
964 }
965 elog(DEBUG2, "FSM returned nonrecyclable page");
966 _bt_relbuf(rel, buf);
967 }
968 else
969 {
970 elog(DEBUG2, "FSM returned nonlockable page");
971 /* couldn't get lock, so just drop pin */
973 }
974 }
975
976 /*
977 * Extend the relation by one page. Need to use RBM_ZERO_AND_LOCK or we
978 * risk a race condition against btvacuumscan --- see comments therein.
979 * This forces us to repeat the valgrind request that _bt_lockbuf()
980 * otherwise would make, as we can't use _bt_lockbuf() without introducing
981 * a race.
982 */
984 if (!RelationUsesLocalBuffers(rel))
986
987 /* Initialize the new page before returning it */
988 page = BufferGetPage(buf);
989 Assert(PageIsNew(page));
991
992 return buf;
993}
uint32 BlockNumber
Definition block.h:31
int Buffer
Definition buf.h:23
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition bufmgr.c:979
void ReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5586
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition bufmgr.c:879
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:468
static Size BufferGetPageSize(Buffer buffer)
Definition bufmgr.h:457
@ EB_LOCK_FIRST
Definition bufmgr.h:87
#define BMR_REL(p_rel)
Definition bufmgr.h:114
static bool PageIsNew(const PageData *page)
Definition bufpage.h:258
PageData * Page
Definition bufpage.h:81
#define Assert(condition)
Definition c.h:943
#define DEBUG2
Definition elog.h:30
BlockNumber GetFreeIndexPage(Relation rel)
Definition indexfsm.c:38
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition memdebug.h:26
void _bt_relbuf(Relation rel, Buffer buf)
Definition nbtpage.c:1044
void _bt_pageinit(Page page, Size size)
Definition nbtpage.c:1157
bool _bt_conditionallockbuf(Relation rel, Buffer buf)
Definition nbtpage.c:1121
static FullTransactionId BTPageGetDeleteXid(Page page)
Definition nbtree.h:261
static bool BTPageIsRecyclable(Page page, Relation heaprel)
Definition nbtree.h:292
#define XLOG_BTREE_REUSE_PAGE
Definition nbtxlog.h:40
#define SizeOfBtreeReusePage
Definition nbtxlog.h:192
#define RelationIsAccessibleInLogicalDecoding(relation)
Definition rel.h:695
#define RelationNeedsWAL(relation)
Definition rel.h:639
#define RelationUsesLocalBuffers(relation)
Definition rel.h:648
@ MAIN_FORKNUM
Definition relpath.h:58
RelFileLocator rd_locator
Definition rel.h:57
RelFileLocator locator
Definition nbtxlog.h:185
#define XLogStandbyInfoActive()
Definition xlog.h:126
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition xloginsert.c:482
void XLogRegisterData(const void *data, uint32 len)
Definition xloginsert.c:372
void XLogBeginInsert(void)
Definition xloginsert.c:153

References _bt_conditionallockbuf(), _bt_pageinit(), _bt_relbuf(), Assert, BMR_REL, BTPageGetDeleteXid(), BTPageIsRecyclable(), buf, BufferGetPage(), BufferGetPageSize(), DEBUG2, EB_LOCK_FIRST, elog, ExtendBufferedRel(), fb(), GetFreeIndexPage(), InvalidBlockNumber, xl_btree_reuse_page::locator, MAIN_FORKNUM, PageIsNew(), RelationData::rd_locator, ReadBuffer(), RelationIsAccessibleInLogicalDecoding, RelationNeedsWAL, RelationUsesLocalBuffers, ReleaseBuffer(), SizeOfBtreeReusePage, VALGRIND_MAKE_MEM_DEFINED, XLOG_BTREE_REUSE_PAGE, XLogBeginInsert(), XLogInsert(), XLogRegisterData(), and XLogStandbyInfoActive.

Referenced by _bt_getroot(), _bt_newlevel(), and _bt_split().

◆ _bt_binsrch_array_skey()

int _bt_binsrch_array_skey ( FmgrInfo orderproc,
bool  cur_elem_trig,
ScanDirection  dir,
Datum  tupdatum,
bool  tupnull,
BTArrayKeyInfo array,
ScanKey  cur,
int32 set_elem_result 
)
extern

Definition at line 3415 of file nbtreadpage.c.

3420{
3421 int low_elem = 0,
3422 mid_elem = -1,
3423 high_elem = array->num_elems - 1,
3424 result = 0;
3426
3427 Assert(cur->sk_flags & SK_SEARCHARRAY);
3428 Assert(!(cur->sk_flags & SK_BT_SKIP));
3429 Assert(!(cur->sk_flags & SK_ISNULL)); /* SAOP arrays never have NULLs */
3430 Assert(cur->sk_strategy == BTEqualStrategyNumber);
3431
3432 if (cur_elem_trig)
3433 {
3435 Assert(cur->sk_flags & SK_BT_REQFWD);
3436
3437 /*
3438 * When the scan key that triggered array advancement is a required
3439 * array scan key, it is now certain that the current array element
3440 * (plus all prior elements relative to the current scan direction)
3441 * cannot possibly be at or ahead of the corresponding tuple value.
3442 * (_bt_checkkeys must have called _bt_tuple_before_array_skeys, which
3443 * makes sure this is true as a condition of advancing the arrays.)
3444 *
3445 * This makes it safe to exclude array elements up to and including
3446 * the former-current array element from our search.
3447 *
3448 * Separately, when array advancement was triggered by a required scan
3449 * key, the array element immediately after the former-current element
3450 * is often either an exact tupdatum match, or a "close by" near-match
3451 * (a near-match tupdatum is one whose key space falls _between_ the
3452 * former-current and new-current array elements). We'll detect both
3453 * cases via an optimistic comparison of the new search lower bound
3454 * (or new search upper bound in the case of backwards scans).
3455 */
3456 if (ScanDirectionIsForward(dir))
3457 {
3458 low_elem = array->cur_elem + 1; /* old cur_elem exhausted */
3459
3460 /* Compare prospective new cur_elem (also the new lower bound) */
3461 if (high_elem >= low_elem)
3462 {
3463 arrdatum = array->elem_values[low_elem];
3465 arrdatum, cur);
3466
3467 if (result <= 0)
3468 {
3469 /* Optimistic comparison optimization worked out */
3471 return low_elem;
3472 }
3473 mid_elem = low_elem;
3474 low_elem++; /* this cur_elem exhausted, too */
3475 }
3476
3477 if (high_elem < low_elem)
3478 {
3479 /* Caller needs to perform "beyond end" array advancement */
3480 *set_elem_result = 1;
3481 return high_elem;
3482 }
3483 }
3484 else
3485 {
3486 high_elem = array->cur_elem - 1; /* old cur_elem exhausted */
3487
3488 /* Compare prospective new cur_elem (also the new upper bound) */
3489 if (high_elem >= low_elem)
3490 {
3491 arrdatum = array->elem_values[high_elem];
3493 arrdatum, cur);
3494
3495 if (result >= 0)
3496 {
3497 /* Optimistic comparison optimization worked out */
3499 return high_elem;
3500 }
3501 mid_elem = high_elem;
3502 high_elem--; /* this cur_elem exhausted, too */
3503 }
3504
3505 if (high_elem < low_elem)
3506 {
3507 /* Caller needs to perform "beyond end" array advancement */
3508 *set_elem_result = -1;
3509 return low_elem;
3510 }
3511 }
3512 }
3513
3514 while (high_elem > low_elem)
3515 {
3516 mid_elem = low_elem + ((high_elem - low_elem) / 2);
3517 arrdatum = array->elem_values[mid_elem];
3518
3520 arrdatum, cur);
3521
3522 if (result == 0)
3523 {
3524 /*
3525 * It's safe to quit as soon as we see an equal array element.
3526 * This often saves an extra comparison or two...
3527 */
3528 low_elem = mid_elem;
3529 break;
3530 }
3531
3532 if (result > 0)
3533 low_elem = mid_elem + 1;
3534 else
3535 high_elem = mid_elem;
3536 }
3537
3538 /*
3539 * ...but our caller also cares about how its searched-for tuple datum
3540 * compares to the low_elem datum. Must always set *set_elem_result with
3541 * the result of that comparison specifically.
3542 */
3543 if (low_elem != mid_elem)
3545 array->elem_values[low_elem], cur);
3546
3548
3549 return low_elem;
3550}
uint32 result
struct cursor * cur
Definition ecpg.c:29
static int32 _bt_compare_array_skey(FmgrInfo *orderproc, Datum tupdatum, bool tupnull, Datum arrdatum, ScanKey cur)
#define SK_BT_SKIP
Definition nbtree.h:1106
#define SK_BT_REQFWD
Definition nbtree.h:1104
uint64_t Datum
Definition postgres.h:70
#define ScanDirectionIsForward(direction)
Definition sdir.h:64
#define ScanDirectionIsNoMovement(direction)
Definition sdir.h:57
#define SK_SEARCHARRAY
Definition skey.h:120
#define SK_ISNULL
Definition skey.h:115
#define BTEqualStrategyNumber
Definition stratnum.h:31
Datum * elem_values
Definition nbtree.h:1041

References _bt_compare_array_skey(), Assert, BTEqualStrategyNumber, cur, BTArrayKeyInfo::cur_elem, BTArrayKeyInfo::elem_values, fb(), BTArrayKeyInfo::num_elems, result, ScanDirectionIsForward, ScanDirectionIsNoMovement, SK_BT_REQFWD, SK_BT_SKIP, SK_ISNULL, and SK_SEARCHARRAY.

Referenced by _bt_advance_array_keys(), _bt_saoparray_shrink(), and _bt_set_startikey().

◆ _bt_binsrch_insert()

OffsetNumber _bt_binsrch_insert ( Relation  rel,
BTInsertState  insertstate 
)
extern

Definition at line 475 of file nbtsearch.c.

476{
477 BTScanInsert key = insertstate->itup_key;
478 Page page;
479 BTPageOpaque opaque;
480 OffsetNumber low,
481 high,
482 stricthigh;
484 cmpval;
485
486 page = BufferGetPage(insertstate->buf);
487 opaque = BTPageGetOpaque(page);
488
489 Assert(P_ISLEAF(opaque));
490 Assert(!key->nextkey);
491 Assert(insertstate->postingoff == 0);
492
493 if (!insertstate->bounds_valid)
494 {
495 /* Start new binary search */
496 low = P_FIRSTDATAKEY(opaque);
497 high = PageGetMaxOffsetNumber(page);
498 }
499 else
500 {
501 /* Restore result of previous binary search against same page */
502 low = insertstate->low;
503 high = insertstate->stricthigh;
504 }
505
506 /* If there are no keys on the page, return the first available slot */
507 if (unlikely(high < low))
508 {
509 /* Caller can't reuse bounds */
511 insertstate->stricthigh = InvalidOffsetNumber;
512 insertstate->bounds_valid = false;
513 return low;
514 }
515
516 /*
517 * Binary search to find the first key on the page >= scan key. (nextkey
518 * is always false when inserting).
519 *
520 * The loop invariant is: all slots before 'low' are < scan key, all slots
521 * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
522 * maintained to save additional search effort for caller.
523 *
524 * We can fall out when high == low.
525 */
526 if (!insertstate->bounds_valid)
527 high++; /* establish the loop invariant for high */
528 stricthigh = high; /* high initially strictly higher */
529
530 cmpval = 1; /* !nextkey comparison value */
531
532 while (high > low)
533 {
534 OffsetNumber mid = low + ((high - low) / 2);
535
536 /* We have low <= mid < high, so mid points at a real slot */
537
538 result = _bt_compare(rel, key, page, mid);
539
540 if (result >= cmpval)
541 low = mid + 1;
542 else
543 {
544 high = mid;
545 if (result != 0)
546 stricthigh = high;
547 }
548
549 /*
550 * If tuple at offset located by binary search is a posting list whose
551 * TID range overlaps with caller's scantid, perform posting list
552 * binary search to set postingoff for caller. Caller must split the
553 * posting list when postingoff is set. This should happen
554 * infrequently.
555 */
556 if (unlikely(result == 0 && key->scantid != NULL))
557 {
558 /*
559 * postingoff should never be set more than once per leaf page
560 * binary search. That would mean that there are duplicate table
561 * TIDs in the index, which is never okay. Check for that here.
562 */
563 if (insertstate->postingoff != 0)
566 errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
569 low, stricthigh,
572
573 insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
574 }
575 }
576
577 /*
578 * On a leaf page, a binary search always returns the first key >= scan
579 * key (at least in !nextkey case), which could be the last slot + 1. This
580 * is also the lower bound of cached search.
581 *
582 * stricthigh may also be the last slot + 1, which prevents caller from
583 * using bounds directly, but is still useful to us if we're called a
584 * second time with cached bounds (cached low will be < stricthigh when
585 * that happens).
586 */
587 insertstate->low = low;
588 insertstate->stricthigh = stricthigh;
589 insertstate->bounds_valid = true;
590
591 return low;
592}
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4446
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition bufpage.h:396
int32_t int32
Definition c.h:620
#define unlikely(x)
Definition c.h:438
int errcode(int sqlerrcode)
Definition elog.c:874
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
#define ERROR
Definition elog.h:40
#define ereport(elevel,...)
Definition elog.h:152
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition itemptr.h:103
#define P_ISLEAF(opaque)
Definition nbtree.h:221
#define BTPageGetOpaque(page)
Definition nbtree.h:74
#define P_FIRSTDATAKEY(opaque)
Definition nbtree.h:370
static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
Definition nbtsearch.c:603
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition nbtsearch.c:689
#define InvalidOffsetNumber
Definition off.h:26
uint16 OffsetNumber
Definition off.h:24

References _bt_binsrch_posting(), _bt_compare(), Assert, BTPageGetOpaque, BufferGetBlockNumber(), BufferGetPage(), ereport, errcode(), errmsg_internal(), ERROR, fb(), InvalidOffsetNumber, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber(), RelationGetRelationName, result, and unlikely.

Referenced by _bt_check_unique(), _bt_findinsertloc(), and bt_rootdescend().

◆ _bt_bottomupdel_pass()

bool _bt_bottomupdel_pass ( Relation  rel,
Buffer  buf,
Relation  heapRel,
Size  newitemsz 
)
extern

Definition at line 309 of file nbtdedup.c.

311{
312 OffsetNumber offnum,
313 minoff,
314 maxoff;
315 Page page = BufferGetPage(buf);
316 BTPageOpaque opaque = BTPageGetOpaque(page);
319 bool neverdedup;
321
322 /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
323 newitemsz += sizeof(ItemIdData);
324
325 /* Initialize deduplication state */
327 state->deduplicate = true;
328 state->nmaxitems = 0;
329 state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
330 state->base = NULL;
331 state->baseoff = InvalidOffsetNumber;
332 state->basetupsize = 0;
333 state->htids = palloc(state->maxpostingsize);
334 state->nhtids = 0;
335 state->nitems = 0;
336 state->phystupsize = 0;
337 state->nintervals = 0;
338
339 /*
340 * Initialize tableam state that describes bottom-up index deletion
341 * operation.
342 *
343 * We'll go on to ask the tableam to search for TIDs whose index tuples we
344 * can safely delete. The tableam will search until our leaf page space
345 * target is satisfied, or until the cost of continuing with the tableam
346 * operation seems too high. It focuses its efforts on TIDs associated
347 * with duplicate index tuples that we mark "promising".
348 *
349 * This space target is a little arbitrary. The tableam must be able to
350 * keep the costs and benefits in balance. We provide the tableam with
351 * exhaustive information about what might work, without directly
352 * concerning ourselves with avoiding work during the tableam call. Our
353 * role in costing the bottom-up deletion process is strictly advisory.
354 */
355 delstate.irel = rel;
357 delstate.bottomup = true;
358 delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
359 delstate.ndeltids = 0;
362
363 minoff = P_FIRSTDATAKEY(opaque);
364 maxoff = PageGetMaxOffsetNumber(page);
365 for (offnum = minoff;
366 offnum <= maxoff;
367 offnum = OffsetNumberNext(offnum))
368 {
369 ItemId itemid = PageGetItemId(page, offnum);
370 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
371
372 Assert(!ItemIdIsDead(itemid));
373
374 if (offnum == minoff)
375 {
376 /* itup starts first pending interval */
377 _bt_dedup_start_pending(state, itup, offnum);
378 }
379 else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
381 {
382 /* Tuple is equal; just added its TIDs to pending interval */
383 }
384 else
385 {
386 /* Finalize interval -- move its TIDs to delete state */
388
389 /* itup starts new pending interval */
390 _bt_dedup_start_pending(state, itup, offnum);
391 }
392 }
393 /* Finalize final interval -- move its TIDs to delete state */
395
396 /*
397 * We don't give up now in the event of having few (or even zero)
398 * promising tuples for the tableam because it's not up to us as the index
399 * AM to manage costs (note that the tableam might have heuristics of its
400 * own that work out what to do). We should at least avoid having our
401 * caller do a useless deduplication pass after we return in the event of
402 * zero promising tuples, though.
403 */
404 neverdedup = false;
405 if (state->nintervals == 0)
406 neverdedup = true;
407
408 pfree(state->htids);
409 pfree(state);
410
411 /* Ask tableam which TIDs are deletable, then physically delete them */
412 _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
413
414 pfree(delstate.deltids);
415 pfree(delstate.status);
416
417 /* Report "success" to caller unconditionally to avoid deduplication */
418 if (neverdedup)
419 return true;
420
421 /* Don't dedup when we won't end up back here any time soon anyway */
422 return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
423}
Size PageGetExactFreeSpace(const PageData *page)
Definition bufpage.c:967
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition bufpage.h:268
static void * PageGetItem(PageData *page, const ItemIdData *itemId)
Definition bufpage.h:378
#define Max(x, y)
Definition c.h:1085
#define palloc_object(type)
Definition fe_memutils.h:74
#define palloc_array(type, count)
Definition fe_memutils.h:76
#define ItemIdIsDead(itemId)
Definition itemid.h:113
IndexTupleData * IndexTuple
Definition itup.h:53
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
Definition nbtdedup.c:486
void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, OffsetNumber baseoff)
Definition nbtdedup.c:435
static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state, TM_IndexDeleteOp *delstate)
Definition nbtdedup.c:648
void _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
Definition nbtpage.c:1543
#define MaxTIDsPerBTreePage
Definition nbtree.h:186
int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
Definition nbtutils.c:911
#define OffsetNumberNext(offsetNumber)
Definition off.h:52

References _bt_bottomupdel_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_delitems_delete_check(), _bt_keep_natts_fast(), Assert, BTPageGetOpaque, buf, BufferGetBlockNumber(), BufferGetPage(), fb(), IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, ItemIdIsDead, Max, MaxTIDsPerBTreePage, OffsetNumberNext, P_FIRSTDATAKEY, PageGetExactFreeSpace(), PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), palloc(), palloc_array, palloc_object, and pfree().

Referenced by _bt_delete_or_dedup_one_page().

◆ _bt_check_natts()

bool _bt_check_natts ( Relation  rel,
bool  heapkeyspace,
Page  page,
OffsetNumber  offnum 
)
extern

Definition at line 958 of file nbtutils.c.

959{
962 BTPageOpaque opaque = BTPageGetOpaque(page);
963 IndexTuple itup;
964 int tupnatts;
965
966 /*
967 * We cannot reliably test a deleted or half-dead page, since they have
968 * dummy high keys
969 */
970 if (P_IGNORE(opaque))
971 return true;
972
973 Assert(offnum >= FirstOffsetNumber &&
974 offnum <= PageGetMaxOffsetNumber(page));
975
976 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
977 tupnatts = BTreeTupleGetNAtts(itup, rel);
978
979 /* !heapkeyspace indexes do not support deduplication */
980 if (!heapkeyspace && BTreeTupleIsPosting(itup))
981 return false;
982
983 /* Posting list tuples should never have "pivot heap TID" bit set */
984 if (BTreeTupleIsPosting(itup) &&
987 return false;
988
989 /* INCLUDE indexes do not support deduplication */
990 if (natts != nkeyatts && BTreeTupleIsPosting(itup))
991 return false;
992
993 if (P_ISLEAF(opaque))
994 {
995 if (offnum >= P_FIRSTDATAKEY(opaque))
996 {
997 /*
998 * Non-pivot tuple should never be explicitly marked as a pivot
999 * tuple
1000 */
1001 if (BTreeTupleIsPivot(itup))
1002 return false;
1003
1004 /*
1005 * Leaf tuples that are not the page high key (non-pivot tuples)
1006 * should never be truncated. (Note that tupnatts must have been
1007 * inferred, even with a posting list tuple, because only pivot
1008 * tuples store tupnatts directly.)
1009 */
1010 return tupnatts == natts;
1011 }
1012 else
1013 {
1014 /*
1015 * Rightmost page doesn't contain a page high key, so tuple was
1016 * checked above as ordinary leaf tuple
1017 */
1018 Assert(!P_RIGHTMOST(opaque));
1019
1020 /*
1021 * !heapkeyspace high key tuple contains only key attributes. Note
1022 * that tupnatts will only have been explicitly represented in
1023 * !heapkeyspace indexes that happen to have non-key attributes.
1024 */
1025 if (!heapkeyspace)
1026 return tupnatts == nkeyatts;
1027
1028 /* Use generic heapkeyspace pivot tuple handling */
1029 }
1030 }
1031 else /* !P_ISLEAF(opaque) */
1032 {
1033 if (offnum == P_FIRSTDATAKEY(opaque))
1034 {
1035 /*
1036 * The first tuple on any internal page (possibly the first after
1037 * its high key) is its negative infinity tuple. Negative
1038 * infinity tuples are always truncated to zero attributes. They
1039 * are a particular kind of pivot tuple.
1040 */
1041 if (heapkeyspace)
1042 return tupnatts == 0;
1043
1044 /*
1045 * The number of attributes won't be explicitly represented if the
1046 * negative infinity tuple was generated during a page split that
1047 * occurred with a version of Postgres before v11. There must be
1048 * a problem when there is an explicit representation that is
1049 * non-zero, or when there is no explicit representation and the
1050 * tuple is evidently not a pre-pg_upgrade tuple.
1051 *
1052 * Prior to v11, downlinks always had P_HIKEY as their offset.
1053 * Accept that as an alternative indication of a valid
1054 * !heapkeyspace negative infinity tuple.
1055 */
1056 return tupnatts == 0 ||
1058 }
1059 else
1060 {
1061 /*
1062 * !heapkeyspace downlink tuple with separator key contains only
1063 * key attributes. Note that tupnatts will only have been
1064 * explicitly represented in !heapkeyspace indexes that happen to
1065 * have non-key attributes.
1066 */
1067 if (!heapkeyspace)
1068 return tupnatts == nkeyatts;
1069
1070 /* Use generic heapkeyspace pivot tuple handling */
1071 }
1072 }
1073
1074 /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
1075 Assert(heapkeyspace);
1076
1077 /*
1078 * Explicit representation of the number of attributes is mandatory with
1079 * heapkeyspace index pivot tuples, regardless of whether or not there are
1080 * non-key attributes.
1081 */
1082 if (!BTreeTupleIsPivot(itup))
1083 return false;
1084
1085 /* Pivot tuple should not use posting list representation (redundant) */
1086 if (BTreeTupleIsPosting(itup))
1087 return false;
1088
1089 /*
1090 * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
1091 * when any other key attribute is truncated
1092 */
1093 if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
1094 return false;
1095
1096 /*
1097 * Pivot tuple must have at least one untruncated key attribute (minus
1098 * infinity pivot tuples are the only exception). Pivot tuples can never
1099 * represent that there is a value present for a key attribute that
1100 * exceeds pg_index.indnkeyatts for the index.
1101 */
1102 return tupnatts > 0 && tupnatts <= nkeyatts;
1103}
int16_t int16
Definition c.h:619
#define BT_PIVOT_HEAP_TID_ATTR
Definition nbtree.h:466
#define P_HIKEY
Definition nbtree.h:368
#define P_RIGHTMOST(opaque)
Definition nbtree.h:220
#define P_IGNORE(opaque)
Definition nbtree.h:226
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition nbtree.h:493
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition nbtree.h:639
#define BTreeTupleGetNAtts(itup, rel)
Definition nbtree.h:578
#define FirstOffsetNumber
Definition off.h:27
ItemPointerData t_tid
Definition itup.h:37

References Assert, BT_PIVOT_HEAP_TID_ATTR, BTPageGetOpaque, BTreeTupleGetHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPivot(), BTreeTupleIsPosting(), fb(), FirstOffsetNumber, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, ItemPointerGetOffsetNumber(), ItemPointerGetOffsetNumberNoCheck(), P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_ISLEAF, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and IndexTupleData::t_tid.

Referenced by _bt_compare(), and bt_target_page_check().

◆ _bt_check_third_page()

void _bt_check_third_page ( Relation  rel,
Relation  heap,
bool  needheaptidspace,
Page  page,
IndexTuple  newtup 
)
extern

Definition at line 1118 of file nbtutils.c.

1120{
1121 Size itemsz;
1122 BTPageOpaque opaque;
1123
1124 itemsz = MAXALIGN(IndexTupleSize(newtup));
1125
1126 /* Double check item size against limit */
1127 if (itemsz <= BTMaxItemSize)
1128 return;
1129
1130 /*
1131 * Tuple is probably too large to fit on page, but it's possible that the
1132 * index uses version 2 or version 3, or that page is an internal page, in
1133 * which case a slightly higher limit applies.
1134 */
1135 if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid)
1136 return;
1137
1138 /*
1139 * Internal page insertions cannot fail here, because that would mean that
1140 * an earlier leaf level insertion that should have failed didn't
1141 */
1142 opaque = BTPageGetOpaque(page);
1143 if (!P_ISLEAF(opaque))
1144 elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
1145 itemsz, RelationGetRelationName(rel));
1146
1147 ereport(ERROR,
1149 errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
1150 itemsz,
1154 errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
1158 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
1159 "Consider a function index of an MD5 hash of the value, "
1160 "or use full text indexing."),
1162}
size_t Size
Definition c.h:689
int errhint(const char *fmt,...) pg_attribute_printf(1
int errdetail(const char *fmt,...) pg_attribute_printf(1
static Size IndexTupleSize(const IndexTupleData *itup)
Definition itup.h:71
#define BTREE_VERSION
Definition nbtree.h:151
#define BTREE_NOVAC_VERSION
Definition nbtree.h:153
#define BTMaxItemSizeNoHeapTid
Definition nbtree.h:170
#define BTMaxItemSize
Definition nbtree.h:165
static char * errmsg
int errtableconstraint(Relation rel, const char *conname)
Definition relcache.c:6117

References BTMaxItemSize, BTMaxItemSizeNoHeapTid, BTPageGetOpaque, BTREE_NOVAC_VERSION, BTREE_VERSION, BTreeTupleGetHeapTID(), elog, ereport, errcode(), errdetail(), errhint(), errmsg, ERROR, errtableconstraint(), fb(), IndexTupleSize(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), MAXALIGN, P_ISLEAF, and RelationGetRelationName.

Referenced by _bt_buildadd(), and _bt_findinsertloc().

◆ _bt_checkpage()

void _bt_checkpage ( Relation  rel,
Buffer  buf 
)
extern

Definition at line 802 of file nbtpage.c.

803{
804 Page page = BufferGetPage(buf);
805
806 /*
807 * ReadBuffer verifies that every newly-read page passes
808 * PageHeaderIsValid, which means it either contains a reasonably sane
809 * page header or is all-zero. We have to defend against the all-zero
810 * case, however.
811 */
812 if (PageIsNew(page))
815 errmsg("index \"%s\" contains unexpected zero page at block %u",
818 errhint("Please REINDEX it.")));
819
820 /*
821 * Additionally check that the special area looks sane.
822 */
823 if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
826 errmsg("index \"%s\" contains corrupted page at block %u",
829 errhint("Please REINDEX it.")));
830}
static uint16 PageGetSpecialSize(const PageData *page)
Definition bufpage.h:341

References buf, BufferGetBlockNumber(), BufferGetPage(), ereport, errcode(), errhint(), errmsg, ERROR, fb(), MAXALIGN, PageGetSpecialSize(), PageIsNew(), and RelationGetRelationName.

Referenced by _bt_getbuf(), _bt_relandgetbuf(), _bt_search_insert(), bt_recheck_sibling_links(), btvacuumpage(), and palloc_btree_page().

◆ _bt_compare()

int32 _bt_compare ( Relation  rel,
BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)
extern

Definition at line 689 of file nbtsearch.c.

693{
695 BTPageOpaque opaque = BTPageGetOpaque(page);
696 IndexTuple itup;
697 ItemPointer heapTid;
699 int ncmpkey;
700 int ntupatts;
702
703 Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
705 Assert(key->heapkeyspace || key->scantid == NULL);
706
707 /*
708 * Force result ">" if target item is first data item on an internal page
709 * --- see NOTE above.
710 */
711 if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
712 return 1;
713
714 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
715 ntupatts = BTreeTupleGetNAtts(itup, rel);
716
717 /*
718 * The scan key is set up with the attribute number associated with each
719 * term in the key. It is important that, if the index is multi-key, the
720 * scan contain the first k key attributes, and that they be in order. If
721 * you think about how multi-key ordering works, you'll understand why
722 * this is.
723 *
724 * We don't test for violation of this condition here, however. The
725 * initial setup for the index scan had better have gotten it right (see
726 * _bt_first).
727 */
728
729 ncmpkey = Min(ntupatts, key->keysz);
730 Assert(key->heapkeyspace || ncmpkey == key->keysz);
731 Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
732 scankey = key->scankeys;
733 for (int i = 1; i <= ncmpkey; i++)
734 {
735 Datum datum;
736 bool isNull;
737
738 datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
739
740 if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
741 {
742 if (isNull)
743 result = 0; /* NULL "=" NULL */
744 else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
745 result = -1; /* NULL "<" NOT_NULL */
746 else
747 result = 1; /* NULL ">" NOT_NULL */
748 }
749 else if (isNull) /* key is NOT_NULL and item is NULL */
750 {
751 if (scankey->sk_flags & SK_BT_NULLS_FIRST)
752 result = 1; /* NOT_NULL ">" NULL */
753 else
754 result = -1; /* NOT_NULL "<" NULL */
755 }
756 else
757 {
758 /*
759 * The sk_func needs to be passed the index value as left arg and
760 * the sk_argument as right arg (they might be of different
761 * types). Since it is convenient for callers to think of
762 * _bt_compare as comparing the scankey to the index item, we have
763 * to flip the sign of the comparison result. (Unless it's a DESC
764 * column, in which case we *don't* flip the sign.)
765 */
767 scankey->sk_collation,
768 datum,
769 scankey->sk_argument));
770
771 if (!(scankey->sk_flags & SK_BT_DESC))
773 }
774
775 /* if the keys are unequal, return the difference */
776 if (result != 0)
777 return result;
778
779 scankey++;
780 }
781
782 /*
783 * All non-truncated attributes (other than heap TID) were found to be
784 * equal. Treat truncated attributes as minus infinity when scankey has a
785 * key attribute value that would otherwise be compared directly.
786 *
787 * Note: it doesn't matter if ntupatts includes non-key attributes;
788 * scankey won't, so explicitly excluding non-key attributes isn't
789 * necessary.
790 */
791 if (key->keysz > ntupatts)
792 return 1;
793
794 /*
795 * Use the heap TID attribute and scantid to try to break the tie. The
796 * rules are the same as any other key attribute -- only the
797 * representation differs.
798 */
799 heapTid = BTreeTupleGetHeapTID(itup);
800 if (key->scantid == NULL)
801 {
802 /*
803 * Forward scans have a scankey that is considered greater than a
804 * truncated pivot tuple if and when the scankey has equal values for
805 * attributes up to and including the least significant untruncated
806 * attribute in tuple. Even attributes that were omitted from the
807 * scan key are considered greater than -inf truncated attributes.
808 * (See _bt_binsrch for an explanation of our backward scan behavior.)
809 *
810 * For example, if an index has the minimum two attributes (single
811 * user key attribute, plus heap TID attribute), and a page's high key
812 * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
813 * will not descend to the page to the left. The search will descend
814 * right instead. The truncated attribute in pivot tuple means that
815 * all non-pivot tuples on the page to the left are strictly < 'foo',
816 * so it isn't necessary to descend left. In other words, search
817 * doesn't have to descend left because it isn't interested in a match
818 * that has a heap TID value of -inf.
819 *
820 * Note: the heap TID part of the test ensures that scankey is being
821 * compared to a pivot tuple with one or more truncated -inf key
822 * attributes. The heap TID attribute is the last key attribute in
823 * every index, of course, but other than that it isn't special.
824 */
825 if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
826 key->heapkeyspace)
827 return 1;
828
829 /* All provided scankey arguments found to be equal */
830 return 0;
831 }
832
833 /*
834 * Treat truncated heap TID as minus infinity, since scankey has a key
835 * attribute value (scantid) that would otherwise be compared directly
836 */
838 if (heapTid == NULL)
839 return 1;
840
841 /*
842 * Scankey must be treated as equal to a posting list tuple if its scantid
843 * value falls within the range of the posting list. In all other cases
844 * there can only be a single heap TID value, which is compared directly
845 * with scantid.
846 */
848 result = ItemPointerCompare(key->scantid, heapTid);
849 if (result <= 0 || !BTreeTupleIsPosting(itup))
850 return result;
851 else
852 {
853 result = ItemPointerCompare(key->scantid,
855 if (result > 0)
856 return 1;
857 }
858
859 return 0;
860}
#define Min(x, y)
Definition c.h:1091
#define INVERT_COMPARE_RESULT(var)
Definition c.h:1193
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition fmgr.c:1151
int32 ItemPointerCompare(const ItemPointerData *arg1, const ItemPointerData *arg2)
Definition itemptr.c:51
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition itup.h:131
#define SK_BT_NULLS_FIRST
Definition nbtree.h:1117
#define SK_BT_DESC
Definition nbtree.h:1116
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition nbtree.h:665
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition nbtutils.c:958
static int32 DatumGetInt32(Datum X)
Definition postgres.h:202
#define RelationGetDescr(relation)
Definition rel.h:542

References _bt_check_natts(), Assert, BTPageGetOpaque, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPosting(), DatumGetInt32(), fb(), FunctionCall2Coll(), i, index_getattr(), IndexRelationGetNumberOfKeyAttributes, INVERT_COMPARE_RESULT, ItemPointerCompare(), Min, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem(), PageGetItemId(), RelationGetDescr, result, SK_BT_DESC, SK_BT_NULLS_FIRST, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_binsrch_insert(), _bt_check_unique(), _bt_findinsertloc(), _bt_moveright(), _bt_search_insert(), bt_rootdescend(), bt_target_page_check(), invariant_g_offset(), invariant_l_nontarget_offset(), invariant_l_offset(), and invariant_leq_offset().

◆ _bt_conditionallockbuf()

bool _bt_conditionallockbuf ( Relation  rel,
Buffer  buf 
)
extern

Definition at line 1121 of file nbtpage.c.

1122{
1123 /* ConditionalLockBuffer() asserts that pin is held by this backend */
1125 return false;
1126
1127 if (!RelationUsesLocalBuffers(rel))
1129
1130 return true;
1131}
bool ConditionalLockBuffer(Buffer buffer)
Definition bufmgr.c:6617

References buf, BufferGetPage(), ConditionalLockBuffer(), fb(), RelationUsesLocalBuffers, and VALGRIND_MAKE_MEM_DEFINED.

Referenced by _bt_allocbuf(), and _bt_search_insert().

◆ _bt_dedup_finish_pending()

Size _bt_dedup_finish_pending ( Page  newpage,
BTDedupState  state 
)
extern

Definition at line 557 of file nbtdedup.c.

558{
562
563 Assert(state->nitems > 0);
564 Assert(state->nitems <= state->nhtids);
565 Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
566
568 if (state->nitems == 1)
569 {
570 /* Use original, unchanged base tuple */
574 if (PageAddItem(newpage, state->base, tuplesz, tupoff, false, false) == InvalidOffsetNumber)
575 elog(ERROR, "deduplication failed to add tuple to page");
576
577 spacesaving = 0;
578 }
579 else
580 {
581 IndexTuple final;
582
583 /* Form a tuple with a posting list */
584 final = _bt_form_posting(state->base, state->htids, state->nhtids);
585 tuplesz = IndexTupleSize(final);
586 Assert(tuplesz <= state->maxpostingsize);
587
588 /* Save final number of items for posting list */
589 state->intervals[state->nintervals].nitems = state->nitems;
590
593 if (PageAddItem(newpage, final, tuplesz, tupoff, false, false) == InvalidOffsetNumber)
594 elog(ERROR, "deduplication failed to add tuple to page");
595
596 pfree(final);
597 spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
598 /* Increment nintervals, since we wrote a new posting list tuple */
599 state->nintervals++;
601 }
602
603 /* Reset state for next pending posting list */
604 state->nhtids = 0;
605 state->nitems = 0;
606 state->phystupsize = 0;
607
608 return spacesaving;
609}
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition bufpage.h:504
IndexTuple _bt_form_posting(IndexTuple base, const ItemPointerData *htids, int nhtids)
Definition nbtdedup.c:864

References _bt_form_posting(), Assert, BTMaxItemSize, elog, ERROR, fb(), IndexTupleSize(), InvalidOffsetNumber, MAXALIGN, OffsetNumberNext, PageAddItem, PageGetMaxOffsetNumber(), and pfree().

Referenced by _bt_dedup_pass(), and btree_xlog_dedup().

◆ _bt_dedup_pass()

void _bt_dedup_pass ( Relation  rel,
Buffer  buf,
IndexTuple  newitem,
Size  newitemsz,
bool  bottomupdedup 
)
extern

Definition at line 59 of file nbtdedup.c.

61{
62 OffsetNumber offnum,
63 minoff,
64 maxoff;
65 Page page = BufferGetPage(buf);
66 BTPageOpaque opaque = BTPageGetOpaque(page);
70 bool singlevalstrat = false;
73
74 /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
75 newitemsz += sizeof(ItemIdData);
76
77 /*
78 * Initialize deduplication state.
79 *
80 * It would be possible for maxpostingsize (limit on posting list tuple
81 * size) to be set to one third of the page. However, it seems like a
82 * good idea to limit the size of posting lists to one sixth of a page.
83 * That ought to leave us with a good split point when pages full of
84 * duplicates can be split several times.
85 */
87 state->deduplicate = true;
88 state->nmaxitems = 0;
89 state->maxpostingsize = Min(BTMaxItemSize / 2, INDEX_SIZE_MASK);
90 /* Metadata about base tuple of current pending posting list */
91 state->base = NULL;
92 state->baseoff = InvalidOffsetNumber;
93 state->basetupsize = 0;
94 /* Metadata about current pending posting list TIDs */
95 state->htids = palloc(state->maxpostingsize);
96 state->nhtids = 0;
97 state->nitems = 0;
98 /* Size of all physical tuples to be replaced by pending posting list */
99 state->phystupsize = 0;
100 /* nintervals should be initialized to zero */
101 state->nintervals = 0;
102
103 minoff = P_FIRSTDATAKEY(opaque);
104 maxoff = PageGetMaxOffsetNumber(page);
105
106 /*
107 * Consider applying "single value" strategy, though only if the page
108 * seems likely to be split in the near future
109 */
110 if (!bottomupdedup)
111 singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
112
113 /*
114 * Deduplicate items from page, and write them to newpage.
115 *
116 * Copy the original page's LSN into newpage copy. This will become the
117 * updated version of the page. We need this because XLogInsert will
118 * examine the LSN and possibly dump it in a page image.
119 */
122
123 /* Copy high key, if any */
124 if (!P_RIGHTMOST(opaque))
125 {
129
131 elog(ERROR, "deduplication failed to add highkey");
132 }
133
134 for (offnum = minoff;
135 offnum <= maxoff;
136 offnum = OffsetNumberNext(offnum))
137 {
138 ItemId itemid = PageGetItemId(page, offnum);
139 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
140
141 Assert(!ItemIdIsDead(itemid));
142
143 if (offnum == minoff)
144 {
145 /*
146 * No previous/base tuple for the data item -- use the data item
147 * as base tuple of pending posting list
148 */
149 _bt_dedup_start_pending(state, itup, offnum);
150 }
151 else if (state->deduplicate &&
152 _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
154 {
155 /*
156 * Tuple is equal to base tuple of pending posting list. Heap
157 * TID(s) for itup have been saved in state.
158 */
159 }
160 else
161 {
162 /*
163 * Tuple is not equal to pending posting list tuple, or
164 * _bt_dedup_save_htid() opted to not merge current item into
165 * pending posting list for some other reason (e.g., adding more
166 * TIDs would have caused posting list to exceed current
167 * maxpostingsize).
168 *
169 * If state contains pending posting list with more than one item,
170 * form new posting tuple and add it to our temp page (newpage).
171 * Else add pending interval's base tuple to the temp page as-is.
172 */
174
175 if (singlevalstrat)
176 {
177 /*
178 * Single value strategy's extra steps.
179 *
180 * Lower maxpostingsize for sixth and final large posting list
181 * tuple at the point where 5 maxpostingsize-capped tuples
182 * have either been formed or observed.
183 *
184 * When a sixth maxpostingsize-capped item is formed/observed,
185 * stop merging together tuples altogether. The few tuples
186 * that remain at the end of the page won't be merged together
187 * at all (at least not until after a future page split takes
188 * place, when this page's newly allocated right sibling page
189 * gets its first deduplication pass).
190 */
191 if (state->nmaxitems == 5)
192 _bt_singleval_fillfactor(page, state, newitemsz);
193 else if (state->nmaxitems == 6)
194 {
195 state->deduplicate = false;
196 singlevalstrat = false; /* won't be back here */
197 }
198 }
199
200 /* itup starts new pending posting list */
201 _bt_dedup_start_pending(state, itup, offnum);
202 }
203 }
204
205 /* Handle the last item */
207
208 /*
209 * If no items suitable for deduplication were found, newpage must be
210 * exactly the same as the original page, so just return from function.
211 *
 212 * We could determine whether or not to proceed on the basis of the space
 213 * savings being sufficient to avoid an immediate page split instead. We
214 * don't do that because there is some small value in nbtsplitloc.c always
215 * operating against a page that is fully deduplicated (apart from
216 * newitem). Besides, most of the cost has already been paid.
217 */
218 if (state->nintervals == 0)
219 {
220 /* cannot leak memory here */
221 pfree(newpage);
222 pfree(state->htids);
223 pfree(state);
224 return;
225 }
226
227 /*
228 * By here, it's clear that deduplication will definitely go ahead.
229 *
230 * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace
231 * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
232 * But keep things tidy.
233 */
234 if (P_HAS_GARBAGE(opaque))
235 {
237
238 nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
239 }
240
242
245
246 /* XLOG stuff */
247 if (RelationNeedsWAL(rel))
248 {
250
251 xlrec_dedup.nintervals = state->nintervals;
252
256
257 /*
258 * The intervals array is not in the buffer, but pretend that it is.
259 * When XLogInsert stores the whole buffer, the array need not be
260 * stored too.
261 */
262 XLogRegisterBufData(0, state->intervals,
263 state->nintervals * sizeof(BTDedupInterval));
264
266 }
267 else
268 recptr = XLogGetFakeLSN(rel);
269
270 PageSetLSN(page, recptr);
271
273
274 /* Local space accounting should agree with page accounting */
275 Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
276
277 /* cannot leak memory here */
278 pfree(state->htids);
279 pfree(state);
280}
void MarkBufferDirty(Buffer buffer)
Definition bufmgr.c:3147
void PageRestoreTempPage(Page tempPage, Page oldPage)
Definition bufpage.c:433
Page PageGetTempPageCopySpecial(const PageData *page)
Definition bufpage.c:411
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition bufpage.h:416
static XLogRecPtr PageGetLSN(const PageData *page)
Definition bufpage.h:410
#define PG_USED_FOR_ASSERTS_ONLY
Definition c.h:249
#define ItemIdGetLength(itemId)
Definition itemid.h:59
#define INDEX_SIZE_MASK
Definition itup.h:65
#define START_CRIT_SECTION()
Definition miscadmin.h:152
#define END_CRIT_SECTION()
Definition miscadmin.h:154
static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state, OffsetNumber minoff, IndexTuple newitem)
Definition nbtdedup.c:782
Size _bt_dedup_finish_pending(Page newpage, BTDedupState state)
Definition nbtdedup.c:557
static void _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
Definition nbtdedup.c:822
#define P_HAS_GARBAGE(opaque)
Definition nbtree.h:227
#define XLOG_BTREE_DEDUP
Definition nbtxlog.h:33
#define SizeOfBtreeDedup
Definition nbtxlog.h:174
uint16 nintervals
Definition nbtxlog.h:169
uint64 XLogRecPtr
Definition xlogdefs.h:21
void XLogRegisterBufData(uint8 block_id, const void *data, uint32 len)
Definition xloginsert.c:413
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition xloginsert.c:246
XLogRecPtr XLogGetFakeLSN(Relation rel)
Definition xloginsert.c:562
#define REGBUF_STANDARD
Definition xloginsert.h:35

References _bt_dedup_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_do_singleval(), _bt_keep_natts_fast(), _bt_singleval_fillfactor(), Assert, BTMaxItemSize, BTPageGetOpaque, buf, BufferGetPage(), elog, END_CRIT_SECTION, ERROR, fb(), INDEX_SIZE_MASK, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, ItemIdGetLength, ItemIdIsDead, MarkBufferDirty(), Min, xl_btree_dedup::nintervals, OffsetNumberNext, P_FIRSTDATAKEY, P_HAS_GARBAGE, P_HIKEY, P_RIGHTMOST, PageAddItem, PageGetExactFreeSpace(), PageGetItem(), PageGetItemId(), PageGetLSN(), PageGetMaxOffsetNumber(), PageGetTempPageCopySpecial(), PageRestoreTempPage(), PageSetLSN(), palloc(), palloc_object, pfree(), PG_USED_FOR_ASSERTS_ONLY, REGBUF_STANDARD, RelationNeedsWAL, SizeOfBtreeDedup, START_CRIT_SECTION, XLOG_BTREE_DEDUP, XLogBeginInsert(), XLogGetFakeLSN(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_delete_or_dedup_one_page().

◆ _bt_dedup_save_htid()

bool _bt_dedup_save_htid ( BTDedupState  state,
IndexTuple  itup 
)
extern

Definition at line 486 of file nbtdedup.c.

487{
488 int nhtids;
489 ItemPointer htids;
491
493
494 if (!BTreeTupleIsPosting(itup))
495 {
496 nhtids = 1;
497 htids = &itup->t_tid;
498 }
499 else
500 {
501 nhtids = BTreeTupleGetNPosting(itup);
502 htids = BTreeTupleGetPosting(itup);
503 }
504
505 /*
506 * Don't append (have caller finish pending posting list as-is) if
507 * appending heap TID(s) from itup would put us over maxpostingsize limit.
508 *
509 * This calculation needs to match the code used within _bt_form_posting()
510 * for new posting list tuples.
511 */
512 mergedtupsz = MAXALIGN(state->basetupsize +
513 (state->nhtids + nhtids) * sizeof(ItemPointerData));
514
515 if (mergedtupsz > state->maxpostingsize)
516 {
517 /*
518 * Count this as an oversized item for single value strategy, though
519 * only when there are 50 TIDs in the final posting list tuple. This
520 * limit (which is fairly arbitrary) avoids confusion about how many
521 * 1/6 of a page tuples have been encountered/created by the current
522 * deduplication pass.
523 *
524 * Note: We deliberately don't consider which deduplication pass
525 * merged together tuples to create this item (could be a previous
526 * deduplication pass, or current pass). See _bt_do_singleval()
527 * comments.
528 */
529 if (state->nhtids > 50)
530 state->nmaxitems++;
531
532 return false;
533 }
534
535 /*
536 * Save heap TIDs to pending posting list tuple -- itup can be merged into
537 * pending posting list
538 */
539 state->nitems++;
540 memcpy(state->htids + state->nhtids, htids,
541 sizeof(ItemPointerData) * nhtids);
542 state->nhtids += nhtids;
543 state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
544
545 return true;
546}
memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets))
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition nbtree.h:519
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition nbtree.h:538

References Assert, BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), fb(), IndexTupleSize(), MAXALIGN, memcpy(), and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_load(), and btree_xlog_dedup().

◆ _bt_dedup_start_pending()

void _bt_dedup_start_pending ( BTDedupState  state,
IndexTuple  base,
OffsetNumber  baseoff 
)
extern

Definition at line 435 of file nbtdedup.c.

437{
438 Assert(state->nhtids == 0);
439 Assert(state->nitems == 0);
441
442 /*
443 * Copy heap TID(s) from new base tuple for new candidate posting list
444 * into working state's array
445 */
446 if (!BTreeTupleIsPosting(base))
447 {
448 memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
449 state->nhtids = 1;
450 state->basetupsize = IndexTupleSize(base);
451 }
452 else
453 {
454 int nposting;
455
457 memcpy(state->htids, BTreeTupleGetPosting(base),
458 sizeof(ItemPointerData) * nposting);
459 state->nhtids = nposting;
460 /* basetupsize should not include existing posting list */
461 state->basetupsize = BTreeTupleGetPostingOffset(base);
462 }
463
464 /*
465 * Save new base tuple itself -- it'll be needed if we actually create a
466 * new posting list from new pending posting list.
467 *
468 * Must maintain physical size of all existing tuples (including line
469 * pointer overhead) so that we can calculate space savings on page.
470 */
471 state->nitems = 1;
472 state->base = base;
473 state->baseoff = baseoff;
474 state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
475 /* Also save baseoff in pending state for interval */
476 state->intervals[state->nintervals].baseoff = state->baseoff;
477}
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition nbtree.h:530

References Assert, BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), fb(), IndexTupleSize(), MAXALIGN, memcpy(), and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_load(), and btree_xlog_dedup().

◆ _bt_delitems_delete_check()

void _bt_delitems_delete_check ( Relation  rel,
Buffer  buf,
Relation  heapRel,
struct TM_IndexDeleteOp delstate 
)
extern

Definition at line 1543 of file nbtpage.c.

1545{
1546 Page page = BufferGetPage(buf);
1547 TransactionId snapshotConflictHorizon;
1548 bool isCatalogRel;
1550 int ndeletable = 0,
1551 nupdatable = 0;
1554
1555 /* Use tableam interface to determine which tuples to delete first */
1556 snapshotConflictHorizon = table_index_delete_tuples(heapRel, delstate);
1557 isCatalogRel = RelationIsAccessibleInLogicalDecoding(heapRel);
1558
1559 /* Should not WAL-log snapshotConflictHorizon unless it's required */
1560 if (!XLogStandbyInfoActive())
1561 snapshotConflictHorizon = InvalidTransactionId;
1562
1563 /*
1564 * Construct a leaf-page-wise description of what _bt_delitems_delete()
1565 * needs to do to physically delete index tuples from the page.
1566 *
1567 * Must sort deltids array to restore leaf-page-wise order (original order
1568 * before call to tableam). This is the order that the loop expects.
1569 *
1570 * Note that deltids array might be a lot smaller now. It might even have
1571 * no entries at all (with bottom-up deletion caller), in which case there
1572 * is nothing left to do.
1573 */
1574 qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
1576 if (delstate->ndeltids == 0)
1577 {
1578 Assert(delstate->bottomup);
1579 return;
1580 }
1581
1582 /* We definitely have to delete at least one index tuple (or one TID) */
1583 for (int i = 0; i < delstate->ndeltids; i++)
1584 {
1585 TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
1586 OffsetNumber idxoffnum = dstatus->idxoffnum;
1587 ItemId itemid = PageGetItemId(page, idxoffnum);
1588 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
1589 int nestedi,
1590 nitem;
1592
1593 Assert(OffsetNumberIsValid(idxoffnum));
1594
1595 if (idxoffnum == postingidxoffnum)
1596 {
1597 /*
1598 * This deltid entry is a TID from a posting list tuple that has
1599 * already been completely processed
1600 */
1603 &delstate->deltids[i].tid) < 0);
1605 &delstate->deltids[i].tid) >= 0);
1606 continue;
1607 }
1608
1609 if (!BTreeTupleIsPosting(itup))
1610 {
1611 /* Plain non-pivot tuple */
1612 Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
1613 if (dstatus->knowndeletable)
1614 deletable[ndeletable++] = idxoffnum;
1615 continue;
1616 }
1617
1618 /*
1619 * itup is a posting list tuple whose lowest deltids entry (which may
1620 * or may not be for the first TID from itup) is considered here now.
1621 * We should process all of the deltids entries for the posting list
1622 * together now, though (not just the lowest). Remember to skip over
1623 * later itup-related entries during later iterations of outermost
1624 * loop.
1625 */
1626 postingidxoffnum = idxoffnum; /* Remember work in outermost loop */
1627 nestedi = i; /* Initialize for first itup deltids entry */
1628 vacposting = NULL; /* Describes final action for itup */
1629 nitem = BTreeTupleGetNPosting(itup);
1630 for (int p = 0; p < nitem; p++)
1631 {
1633 int ptidcmp = -1;
1634
1635 /*
1636 * This nested loop reuses work across ptid TIDs taken from itup.
1637 * We take advantage of the fact that both itup's TIDs and deltids
1638 * entries (within a single itup/posting list grouping) must both
1639 * be in ascending TID order.
1640 */
1641 for (; nestedi < delstate->ndeltids; nestedi++)
1642 {
1644 TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
1645
1646 /* Stop once we get past all itup related deltids entries */
1647 Assert(tdstatus->idxoffnum >= idxoffnum);
1648 if (tdstatus->idxoffnum != idxoffnum)
1649 break;
1650
1651 /* Skip past non-deletable itup related entries up front */
1652 if (!tdstatus->knowndeletable)
1653 continue;
1654
1655 /* Entry is first partial ptid match (or an exact match)? */
1657 if (ptidcmp >= 0)
1658 {
1659 /* Greater than or equal (partial or exact) match... */
1660 break;
1661 }
1662 }
1663
1664 /* ...exact ptid match to a deletable deltids entry? */
1665 if (ptidcmp != 0)
1666 continue;
1667
1668 /* Exact match for deletable deltids entry -- ptid gets deleted */
1669 if (vacposting == NULL)
1670 {
1672 nitem * sizeof(uint16));
1673 vacposting->itup = itup;
1674 vacposting->updatedoffset = idxoffnum;
1675 vacposting->ndeletedtids = 0;
1676 }
1677 vacposting->deletetids[vacposting->ndeletedtids++] = p;
1678 }
1679
1680 /* Final decision on itup, a posting list tuple */
1681
1682 if (vacposting == NULL)
1683 {
1684 /* No TIDs to delete from itup -- do nothing */
1685 }
1686 else if (vacposting->ndeletedtids == nitem)
1687 {
1688 /* Straight delete of itup (to delete all TIDs) */
1689 deletable[ndeletable++] = idxoffnum;
1690 /* Turns out we won't need granular information */
1692 }
1693 else
1694 {
1695 /* Delete some (but not all) TIDs from itup */
1696 Assert(vacposting->ndeletedtids > 0 &&
1697 vacposting->ndeletedtids < nitem);
1698 updatable[nupdatable++] = vacposting;
1699 }
1700 }
1701
1702 /* Physically delete tuples (or TIDs) using deletable (or updatable) */
1703 _bt_delitems_delete(rel, buf, snapshotConflictHorizon, isCatalogRel,
1704 deletable, ndeletable, updatable, nupdatable);
1705
1706 /* be tidy */
1707 for (int i = 0; i < nupdatable; i++)
1708 pfree(updatable[i]);
1709}
uint16_t uint16
Definition c.h:623
uint32 TransactionId
Definition c.h:736
bool ItemPointerEquals(const ItemPointerData *pointer1, const ItemPointerData *pointer2)
Definition itemptr.c:35
#define MaxIndexTuplesPerPage
Definition itup.h:181
static void _bt_delitems_delete(Relation rel, Buffer buf, TransactionId snapshotConflictHorizon, bool isCatalogRel, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
Definition nbtpage.c:1313
static int _bt_delitems_cmp(const void *a, const void *b)
Definition nbtpage.c:1494
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition nbtree.h:545
#define OffsetNumberIsValid(offsetNumber)
Definition off.h:39
#define qsort(a, b, c, d)
Definition port.h:495
OffsetNumber idxoffnum
Definition tableam.h:240
static TransactionId table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
Definition tableam.h:1412
#define InvalidTransactionId
Definition transam.h:31

References _bt_delitems_cmp(), _bt_delitems_delete(), Assert, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), buf, BufferGetPage(), fb(), i, TM_IndexStatus::idxoffnum, InvalidOffsetNumber, InvalidTransactionId, ItemPointerCompare(), ItemPointerEquals(), MaxIndexTuplesPerPage, OffsetNumberIsValid, PageGetItem(), PageGetItemId(), palloc(), pfree(), qsort, RelationIsAccessibleInLogicalDecoding, IndexTupleData::t_tid, table_index_delete_tuples(), and XLogStandbyInfoActive.

Referenced by _bt_bottomupdel_pass(), and _bt_simpledel_pass().

◆ _bt_delitems_vacuum()

void _bt_delitems_vacuum ( Relation  rel,
Buffer  buf,
OffsetNumber deletable,
int  ndeletable,
BTVacuumPosting updatable,
int  nupdatable 
)
extern

Definition at line 1182 of file nbtpage.c.

1185{
1186 Page page = BufferGetPage(buf);
1187 BTPageOpaque opaque;
1188 bool needswal = RelationNeedsWAL(rel);
1189 char *updatedbuf = NULL;
1190 Size updatedbuflen = 0;
1193
1194 /* Shouldn't be called unless there's something to do */
1195 Assert(ndeletable > 0 || nupdatable > 0);
1196
1197 /* Generate new version of posting lists without deleted TIDs */
1198 if (nupdatable > 0)
1201 needswal);
1202
1203 /* No ereport(ERROR) until changes are logged */
1205
1206 /*
1207 * Handle posting tuple updates.
1208 *
1209 * Deliberately do this before handling simple deletes. If we did it the
1210 * other way around (i.e. WAL record order -- simple deletes before
1211 * updates) then we'd have to make compensating changes to the 'updatable'
1212 * array of offset numbers.
1213 *
1214 * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
1215 * happens to already be set. It's important that we not interfere with
1216 * any future simple index tuple deletion operations.
1217 */
1218 for (int i = 0; i < nupdatable; i++)
1219 {
1220 OffsetNumber updatedoffset = updatedoffsets[i];
1221 IndexTuple itup;
1222 Size itemsz;
1223
1224 itup = updatable[i]->itup;
1225 itemsz = MAXALIGN(IndexTupleSize(itup));
1226 if (!PageIndexTupleOverwrite(page, updatedoffset, itup, itemsz))
1227 elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1229 }
1230
1231 /* Now handle simple deletes of entire tuples */
1232 if (ndeletable > 0)
1234
1235 /*
1236 * We can clear the vacuum cycle ID since this page has certainly been
1237 * processed by the current vacuum scan.
1238 */
1239 opaque = BTPageGetOpaque(page);
1240 opaque->btpo_cycleid = 0;
1241
1242 /*
1243 * Clear the BTP_HAS_GARBAGE page flag.
1244 *
1245 * This flag indicates the presence of LP_DEAD items on the page (though
1246 * not reliably). Note that we only rely on it with pg_upgrade'd
1247 * !heapkeyspace indexes. That's why clearing it here won't usually
1248 * interfere with simple index tuple deletion.
1249 */
1250 opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1251
1253
1254 /* XLOG stuff */
1255 if (needswal)
1256 {
1258
1260 xlrec_vacuum.nupdated = nupdatable;
1261
1265
1266 if (ndeletable > 0)
1268 ndeletable * sizeof(OffsetNumber));
1269
1270 if (nupdatable > 0)
1271 {
1273 nupdatable * sizeof(OffsetNumber));
1275 }
1276
1278 }
1279 else
1280 recptr = XLogGetFakeLSN(rel);
1281
1282 PageSetLSN(page, recptr);
1283
1285
1286 /* can't leak memory here */
1287 if (updatedbuf != NULL)
1289 /* free tuples allocated within _bt_delitems_update() */
1290 for (int i = 0; i < nupdatable; i++)
1291 pfree(updatable[i]->itup);
1292}
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition bufpage.c:1170
bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, const void *newtup, Size newsize)
Definition bufpage.c:1414
#define PANIC
Definition elog.h:44
static char * _bt_delitems_update(BTVacuumPosting *updatable, int nupdatable, OffsetNumber *updatedoffsets, Size *updatedbuflen, bool needswal)
Definition nbtpage.c:1435
#define SizeOfBtreeVacuum
Definition nbtxlog.h:234
#define XLOG_BTREE_VACUUM
Definition nbtxlog.h:39
uint16 btpo_flags
Definition nbtree.h:68
BTCycleId btpo_cycleid
Definition nbtree.h:69
IndexTuple itup
Definition nbtree.h:917
uint16 ndeleted
Definition nbtxlog.h:222

References _bt_delitems_update(), Assert, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, buf, BufferGetBlockNumber(), BufferGetPage(), elog, END_CRIT_SECTION, fb(), i, IndexTupleSize(), BTVacuumPostingData::itup, MarkBufferDirty(), MAXALIGN, MaxIndexTuplesPerPage, xl_btree_vacuum::ndeleted, PageIndexMultiDelete(), PageIndexTupleOverwrite(), PageSetLSN(), PANIC, pfree(), REGBUF_STANDARD, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeVacuum, START_CRIT_SECTION, XLOG_BTREE_VACUUM, XLogBeginInsert(), XLogGetFakeLSN(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by btvacuumpage().

◆ _bt_doinsert()

bool _bt_doinsert ( Relation  rel,
IndexTuple  itup,
IndexUniqueCheck  checkUnique,
bool  indexUnchanged,
Relation  heapRel 
)
extern

Definition at line 105 of file nbtinsert.c.

108{
109 bool is_unique = false;
111 BTScanInsert itup_key;
112 BTStack stack;
114
115 /* we need an insertion scan key to do our search, so build one */
116 itup_key = _bt_mkscankey(rel, itup);
117
118 if (checkingunique)
119 {
120 if (!itup_key->anynullkeys)
121 {
122 /* No (heapkeyspace) scantid until uniqueness established */
123 itup_key->scantid = NULL;
124 }
125 else
126 {
127 /*
128 * Scan key for new tuple contains NULL key values. Bypass
129 * checkingunique steps. They are unnecessary because core code
130 * considers NULL unequal to every value, including NULL.
131 *
132 * This optimization avoids O(N^2) behavior within the
133 * _bt_findinsertloc() heapkeyspace path when a unique index has a
134 * large number of "duplicates" with NULL key values.
135 */
136 checkingunique = false;
137 /* Tuple is unique in the sense that core code cares about */
139 is_unique = true;
140 }
141 }
142
143 /*
144 * Fill in the BTInsertState working area, to track the current page and
145 * position within the page to insert on.
146 *
147 * Note that itemsz is passed down to lower level code that deals with
148 * inserting the item. It must be MAXALIGN()'d. This ensures that space
149 * accounting code consistently considers the alignment overhead that we
150 * expect PageAddItem() will add later. (Actually, index_form_tuple() is
151 * already conservative about alignment, but we don't rely on that from
152 * this distance. Besides, preserving the "true" tuple size in index
153 * tuple headers for the benefit of nbtsplitloc.c might happen someday.
154 * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
155 */
156 insertstate.itup = itup;
157 insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
158 insertstate.itup_key = itup_key;
159 insertstate.bounds_valid = false;
161 insertstate.postingoff = 0;
162
163search:
164
165 /*
166 * Find and lock the leaf page that the tuple should be added to by
167 * searching from the root page. insertstate.buf will hold a buffer that
168 * is locked in exclusive mode afterwards.
169 */
170 stack = _bt_search_insert(rel, heapRel, &insertstate);
171
172 /*
173 * checkingunique inserts are not allowed to go ahead when two tuples with
174 * equal key attribute values would be visible to new MVCC snapshots once
175 * the xact commits. Check for conflicts in the locked page/buffer (if
176 * needed) here.
177 *
178 * It might be necessary to check a page to the right in _bt_check_unique,
179 * though that should be very rare. In practice the first page the value
180 * could be on (with scantid omitted) is almost always also the only page
181 * that a matching tuple might be found on. This is due to the behavior
182 * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
183 * only be allowed to cross a page boundary when there is no candidate
184 * leaf page split point that avoids it. Also, _bt_check_unique can use
185 * the leaf page high key to determine that there will be no duplicates on
186 * the right sibling without actually visiting it (it uses the high key in
187 * cases where the new item happens to belong at the far right of the leaf
188 * page).
189 *
190 * NOTE: obviously, _bt_check_unique can only detect keys that are already
191 * in the index; so it cannot defend against concurrent insertions of the
192 * same key. We protect against that by means of holding a write lock on
193 * the first page the value could be on, with omitted/-inf value for the
194 * implicit heap TID tiebreaker attribute. Any other would-be inserter of
195 * the same key must acquire a write lock on the same page, so only one
196 * would-be inserter can be making the check at one time. Furthermore,
197 * once we are past the check we hold write locks continuously until we
198 * have performed our insertion, so no later inserter can fail to see our
199 * insertion. (This requires some care in _bt_findinsertloc.)
200 *
201 * If we must wait for another xact, we release the lock while waiting,
202 * and then must perform a new search.
203 *
204 * For a partial uniqueness check, we don't wait for the other xact. Just
205 * let the tuple in and return false for possibly non-unique, or true for
206 * definitely unique.
207 */
208 if (checkingunique)
209 {
211 uint32 speculativeToken;
212
214 &is_unique, &speculativeToken);
215
217 {
218 /* Have to wait for the other guy ... */
219 _bt_relbuf(rel, insertstate.buf);
221
222 /*
223 * If it's a speculative insertion, wait for it to finish (ie. to
224 * go ahead with the insertion, or kill the tuple). Otherwise
225 * wait for the transaction to finish as usual.
226 */
227 if (speculativeToken)
228 SpeculativeInsertionWait(xwait, speculativeToken);
229 else
231
232 /* start over... */
233 if (stack)
234 _bt_freestack(stack);
235 goto search;
236 }
237
238 /* Uniqueness is established -- restore heap tid as scantid */
239 if (itup_key->heapkeyspace)
240 itup_key->scantid = &itup->t_tid;
241 }
242
244 {
245 OffsetNumber newitemoff;
246
247 /*
248 * The only conflict predicate locking cares about for indexes is when
249 * an index tuple insert conflicts with an existing lock. We don't
250 * know the actual page we're going to insert on for sure just yet in
251 * checkingunique and !heapkeyspace cases, but it's okay to use the
252 * first page the value could be on (with scantid omitted) instead.
253 */
255
256 /*
257 * Do the insertion. Note that insertstate contains cached binary
258 * search bounds established within _bt_check_unique when insertion is
259 * checkingunique.
260 */
261 newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
262 indexUnchanged, stack, heapRel);
263 _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
264 stack, itup, insertstate.itemsz, newitemoff,
265 insertstate.postingoff, false);
266 }
267 else
268 {
269 /* just release the buffer */
270 _bt_relbuf(rel, insertstate.buf);
271 }
272
273 /* be tidy */
274 if (stack)
275 _bt_freestack(stack);
276 pfree(itup_key);
277
278 return is_unique;
279}
uint32_t uint32
Definition c.h:624
@ UNIQUE_CHECK_NO
Definition genam.h:125
@ UNIQUE_CHECK_EXISTING
Definition genam.h:128
void SpeculativeInsertionWait(TransactionId xid, uint32 token)
Definition lmgr.c:828
void XactLockTableWait(TransactionId xid, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper)
Definition lmgr.c:663
@ XLTW_InsertIndex
Definition lmgr.h:31
static void _bt_freestack(BTStack stack)
Definition nbtinsert.c:2461
static BTStack _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
Definition nbtinsert.c:320
static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, bool indexUnchanged, BTStack stack, Relation heapRel)
Definition nbtinsert.c:829
static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, Size itemsz, OffsetNumber newitemoff, int postingoff, bool split_only_page)
Definition nbtinsert.c:1119
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition nbtinsert.c:411
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition nbtutils.c:61
void CheckForSerializableConflictIn(Relation relation, const ItemPointerData *tid, BlockNumber blkno)
Definition predicate.c:4266
ItemPointer scantid
Definition nbtree.h:802
bool heapkeyspace
Definition nbtree.h:797
#define TransactionIdIsValid(xid)
Definition transam.h:41

References _bt_check_unique(), _bt_findinsertloc(), _bt_freestack(), _bt_insertonpg(), _bt_mkscankey(), _bt_relbuf(), _bt_search_insert(), BTScanInsertData::anynullkeys, Assert, BufferGetBlockNumber(), CheckForSerializableConflictIn(), fb(), BTScanInsertData::heapkeyspace, IndexTupleSize(), InvalidBuffer, MAXALIGN, pfree(), BTScanInsertData::scantid, SpeculativeInsertionWait(), IndexTupleData::t_tid, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_NO, unlikely, XactLockTableWait(), and XLTW_InsertIndex.

Referenced by btinsert().

◆ _bt_end_vacuum()

void _bt_end_vacuum ( Relation  rel)
extern

Definition at line 530 of file nbtutils.c.

531{
532 int i;
533
535
536 /* Find the array entry */
537 for (i = 0; i < btvacinfo->num_vacuums; i++)
538 {
540
541 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
542 vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
543 {
544 /* Remove it by shifting down the last entry */
547 break;
548 }
549 }
550
552}
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1150
void LWLockRelease(LWLock *lock)
Definition lwlock.c:1767
@ LW_EXCLUSIVE
Definition lwlock.h:104
static BTVacInfo * btvacinfo
Definition nbtutils.c:419
LockRelId relid
Definition nbtutils.c:407
int num_vacuums
Definition nbtutils.c:414
BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER]
Definition nbtutils.c:416
LockRelId lockRelId
Definition rel.h:46
Oid relId
Definition rel.h:40
Oid dbId
Definition rel.h:41
LockInfoData rd_lockInfo
Definition rel.h:114

References btvacinfo, LockRelId::dbId, fb(), i, LockInfoData::lockRelId, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTVacInfo::num_vacuums, RelationData::rd_lockInfo, BTOneVacInfo::relid, LockRelId::relId, and BTVacInfo::vacuums.

Referenced by _bt_end_vacuum_callback(), and btbulkdelete().

◆ _bt_end_vacuum_callback()

void _bt_end_vacuum_callback ( int  code,
Datum  arg 
)
extern

Definition at line 558 of file nbtutils.c.

559{
561}
Datum arg
Definition elog.c:1322
void _bt_end_vacuum(Relation rel)
Definition nbtutils.c:530
static Pointer DatumGetPointer(Datum X)
Definition postgres.h:332

References _bt_end_vacuum(), arg, and DatumGetPointer().

Referenced by btbulkdelete().

◆ _bt_findsplitloc()

OffsetNumber _bt_findsplitloc ( Relation  rel,
Page  origpage,
OffsetNumber  newitemoff,
Size  newitemsz,
IndexTuple  newitem,
bool newitemonleft 
)
extern

Definition at line 130 of file nbtsplitloc.c.

136{
137 BTPageOpaque opaque;
138 int leftspace,
139 rightspace,
140 olddataitemstotal,
145 FindSplitStrat strategy;
146 ItemId itemid;
147 OffsetNumber offnum,
148 maxoff,
149 firstrightoff;
150 double fillfactormult;
151 bool usemult;
153 rightpage;
154
155 opaque = BTPageGetOpaque(origpage);
156 maxoff = PageGetMaxOffsetNumber(origpage);
157
158 /* Total free space available on a btree page, after fixed overhead */
159 leftspace = rightspace =
161 MAXALIGN(sizeof(BTPageOpaqueData));
162
163 /* The right page will have the same high key as the old page */
164 if (!P_RIGHTMOST(opaque))
165 {
166 itemid = PageGetItemId(origpage, P_HIKEY);
167 rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
168 sizeof(ItemIdData));
169 }
170
171 /* Count up total space in data items before actually scanning 'em */
172 olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(origpage);
174
175 /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
176 newitemsz += sizeof(ItemIdData);
177 state.rel = rel;
178 state.origpage = origpage;
179 state.newitem = newitem;
180 state.newitemsz = newitemsz;
181 state.is_leaf = P_ISLEAF(opaque);
182 state.is_rightmost = P_RIGHTMOST(opaque);
183 state.leftspace = leftspace;
184 state.rightspace = rightspace;
185 state.olddataitemstotal = olddataitemstotal;
186 state.minfirstrightsz = SIZE_MAX;
187 state.newitemoff = newitemoff;
188
189 /* newitem cannot be a posting list item */
190 Assert(!BTreeTupleIsPosting(newitem));
191
192 /*
193 * nsplits should never exceed maxoff because there will be at most as
194 * many candidate split points as there are points _between_ tuples, once
195 * you imagine that the new item is already on the original page (the
196 * final number of splits may be slightly lower because not all points
197 * between tuples will be legal).
198 */
199 state.maxsplits = maxoff;
200 state.splits = palloc_array(SplitPoint, state.maxsplits);
201 state.nsplits = 0;
202
203 /*
204 * Scan through the data items and calculate space usage for a split at
205 * each possible position
206 */
208
209 for (offnum = P_FIRSTDATAKEY(opaque);
210 offnum <= maxoff;
211 offnum = OffsetNumberNext(offnum))
212 {
213 Size itemsz;
214
215 itemid = PageGetItemId(origpage, offnum);
216 itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
217
218 /*
219 * When item offset number is not newitemoff, neither side of the
220 * split can be newitem. Record a split after the previous data item
221 * from original page, but before the current data item from original
222 * page. (_bt_recsplitloc() will reject the split when there are no
223 * previous items, which we rely on.)
224 */
225 if (offnum < newitemoff)
226 _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
227 else if (offnum > newitemoff)
228 _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
229 else
230 {
231 /*
232 * Record a split after all "offnum < newitemoff" original page
233 * data items, but before newitem
234 */
235 _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
236
237 /*
238 * Record a split after newitem, but before data item from
239 * original page at offset newitemoff/current offset
240 */
241 _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
242 }
243
244 olddataitemstoleft += itemsz;
245 }
246
247 /*
248 * Record a split after all original page data items, but before newitem.
249 * (Though only when it's possible that newitem will end up alone on new
250 * right page.)
251 */
252 Assert(olddataitemstoleft == olddataitemstotal);
253 if (newitemoff > maxoff)
254 _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0);
255
256 /*
257 * I believe it is not possible to fail to find a feasible split, but just
258 * in case ...
259 */
260 if (state.nsplits == 0)
261 elog(ERROR, "could not find a feasible split point for index \"%s\"",
263
264 /*
265 * Start search for a split point among list of legal split points. Give
266 * primary consideration to equalizing available free space in each half
267 * of the split initially (start with default strategy), while applying
268 * rightmost and split-after-new-item optimizations where appropriate.
269 * Either of the two other fallback strategies may be required for cases
270 * with a large number of duplicates around the original/space-optimal
271 * split point.
272 *
273 * Default strategy gives some weight to suffix truncation in deciding a
274 * split point on leaf pages. It attempts to select a split point where a
275 * distinguishing attribute appears earlier in the new high key for the
276 * left side of the split, in order to maximize the number of trailing
277 * attributes that can be truncated away. Only candidate split points
278 * that imply an acceptable balance of free space on each side are
279 * considered. See _bt_defaultinterval().
280 */
281 if (!state.is_leaf)
282 {
283 /* fillfactormult only used on rightmost page */
284 usemult = state.is_rightmost;
286 }
287 else if (state.is_rightmost)
288 {
289 /* Rightmost leaf page -- fillfactormult always used */
290 usemult = true;
292 }
293 else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
294 {
295 /*
296 * New item inserted at rightmost point among a localized grouping on
297 * a leaf page -- apply "split after new item" optimization, either by
298 * applying leaf fillfactor multiplier, or by choosing the exact split
299 * point that leaves newitem as lastleft. (usemult is set for us.)
300 */
301 if (usemult)
302 {
303 /* fillfactormult should be set based on leaf fillfactor */
305 }
306 else
307 {
308 /* find precise split point after newitemoff */
309 for (int i = 0; i < state.nsplits; i++)
310 {
311 SplitPoint *split = state.splits + i;
312
313 if (split->newitemonleft &&
314 newitemoff == split->firstrightoff)
315 {
316 pfree(state.splits);
317 *newitemonleft = true;
318 return newitemoff;
319 }
320 }
321
322 /*
323 * Cannot legally split after newitemoff; proceed with split
324 * without using fillfactor multiplier. This is defensive, and
325 * should never be needed in practice.
326 */
327 fillfactormult = 0.50;
328 }
329 }
330 else
331 {
332 /* Other leaf page. 50:50 page split. */
333 usemult = false;
334 /* fillfactormult not used, but be tidy */
335 fillfactormult = 0.50;
336 }
337
338 /*
339 * Save leftmost and rightmost splits for page before original ordinal
340 * sort order is lost by delta/fillfactormult sort
341 */
342 leftpage = state.splits[0];
343 rightpage = state.splits[state.nsplits - 1];
344
345 /* Give split points a fillfactormult-wise delta, and sort on deltas */
347
348 /* Determine split interval for default strategy */
349 state.interval = _bt_defaultinterval(&state);
350
351 /*
352 * Determine if default strategy/split interval will produce a
353 * sufficiently distinguishing split, or if we should change strategies.
354 * Alternative strategies change the range of split points that are
355 * considered acceptable (split interval), and possibly change
356 * fillfactormult, in order to deal with pages with a large number of
357 * duplicates gracefully.
358 *
359 * Pass low and high splits for the entire page (actually, they're for an
360 * imaginary version of the page that includes newitem). These are used
361 * when the initial split interval encloses split points that are full of
362 * duplicates, and we need to consider if it's even possible to avoid
363 * appending a heap TID.
364 */
366
367 if (strategy == SPLIT_DEFAULT)
368 {
369 /*
370 * Default strategy worked out (always works out with internal page).
371 * Original split interval still stands.
372 */
373 }
374
375 /*
376 * Many duplicates strategy is used when a heap TID would otherwise be
377 * appended, but the page isn't completely full of logical duplicates.
378 *
379 * The split interval is widened to include all legal candidate split
 380 * points. There might be as few as two distinct values in the whole-page
381 * split interval, though it's also possible that most of the values on
382 * the page are unique. The final split point will either be to the
383 * immediate left or to the immediate right of the group of duplicate
384 * tuples that enclose the first/delta-optimal split point (perfect
385 * penalty was set so that the lowest delta split point that avoids
386 * appending a heap TID will be chosen). Maximizing the number of
387 * attributes that can be truncated away is not a goal of the many
388 * duplicates strategy.
389 *
390 * Single value strategy is used when it is impossible to avoid appending
391 * a heap TID. It arranges to leave the left page very full. This
392 * maximizes space utilization in cases where tuples with the same
393 * attribute values span many pages. Newly inserted duplicates will tend
394 * to have higher heap TID values, so we'll end up splitting to the right
395 * consistently. (Single value strategy is harmless though not
396 * particularly useful with !heapkeyspace indexes.)
397 */
398 else if (strategy == SPLIT_MANY_DUPLICATES)
399 {
400 Assert(state.is_leaf);
401 /* Shouldn't try to truncate away extra user attributes */
404 /* No need to resort splits -- no change in fillfactormult/deltas */
405 state.interval = state.nsplits;
406 }
407 else if (strategy == SPLIT_SINGLE_VALUE)
408 {
409 Assert(state.is_leaf);
410 /* Split near the end of the page */
411 usemult = true;
413 /* Resort split points with new delta */
415 /* Appending a heap TID is unavoidable, so interval of 1 is fine */
416 state.interval = 1;
417 }
418
419 /*
420 * Search among acceptable split points (using final split interval) for
421 * the entry that has the lowest penalty, and is therefore expected to
422 * maximize fan-out. Sets *newitemonleft for us.
423 */
424 firstrightoff = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
425 strategy);
426 pfree(state.splits);
427
428 return firstrightoff;
429}
static Size PageGetPageSize(const PageData *page)
Definition bufpage.h:301
#define BTREE_SINGLEVAL_FILLFACTOR
Definition nbtree.h:203
#define BTGetFillFactor(relation)
Definition nbtree.h:1127
#define BTREE_NONLEAF_FILLFACTOR
Definition nbtree.h:202
static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult, bool usemult)
static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage, SplitPoint *rightpage, FindSplitStrat *strategy)
static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, int leaffillfactor, bool *usemult)
static void _bt_recsplitloc(FindSplitData *state, OffsetNumber firstrightoff, bool newitemonleft, int olddataitemstoleft, Size firstrightofforigpagetuplesz)
FindSplitStrat
Definition nbtsplitloc.c:22
@ SPLIT_DEFAULT
Definition nbtsplitloc.c:24
@ SPLIT_MANY_DUPLICATES
Definition nbtsplitloc.c:25
@ SPLIT_SINGLE_VALUE
Definition nbtsplitloc.c:26
static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft, FindSplitStrat strategy)
static int _bt_defaultinterval(FindSplitData *state)

References _bt_afternewitemoff(), _bt_bestsplitloc(), _bt_defaultinterval(), _bt_deltasortsplits(), _bt_recsplitloc(), _bt_strategy(), Assert, BTGetFillFactor, BTPageGetOpaque, BTREE_NONLEAF_FILLFACTOR, BTREE_SINGLEVAL_FILLFACTOR, BTreeTupleIsPosting(), elog, ERROR, fb(), i, IndexRelationGetNumberOfKeyAttributes, ItemIdGetLength, MAXALIGN, OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_RIGHTMOST, PageGetExactFreeSpace(), PageGetItemId(), PageGetMaxOffsetNumber(), PageGetPageSize(), palloc_array, pfree(), RelationGetRelationName, SizeOfPageHeaderData, SPLIT_DEFAULT, SPLIT_MANY_DUPLICATES, and SPLIT_SINGLE_VALUE.

Referenced by _bt_split().

◆ _bt_finish_split()

void _bt_finish_split ( Relation  rel,
Relation  heaprel,
Buffer  lbuf,
BTStack  stack 
)
extern

Definition at line 2272 of file nbtinsert.c.

2273{
2276 Buffer rbuf;
2277 Page rpage;
2279 bool wasroot;
2280 bool wasonly;
2281
2283 Assert(heaprel != NULL);
2284
2285 /* Lock right sibling, the one missing the downlink */
2286 rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2289
2290 /* Could this be a root split? */
2291 if (!stack)
2292 {
2294 Page metapg;
2296
2297 /* acquire lock on the metapage */
2301
2302 wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
2303
2304 _bt_relbuf(rel, metabuf);
2305 }
2306 else
2307 wasroot = false;
2308
2309 /* Was this the only page on the level before split? */
2311
2312 INJECTION_POINT("nbtree-finish-incomplete-split", NULL);
2313 elog(DEBUG1, "finishing incomplete split of %u/%u",
2315
2316 _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
2317}
#define INJECTION_POINT(name, arg)
static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf, Buffer rbuf, BTStack stack, bool isroot, bool isonly)
Definition nbtinsert.c:2130
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition nbtpage.c:850
#define BTPageGetMeta(p)
Definition nbtree.h:122
#define P_LEFTMOST(opaque)
Definition nbtree.h:219
#define P_INCOMPLETE_SPLIT(opaque)
Definition nbtree.h:228
#define BTREE_METAPAGE
Definition nbtree.h:149
#define BT_WRITE
Definition nbtree.h:731

References _bt_getbuf(), _bt_insert_parent(), _bt_relbuf(), Assert, BT_WRITE, BTPageGetMeta, BTPageGetOpaque, BTREE_METAPAGE, BufferGetBlockNumber(), BufferGetPage(), DEBUG1, elog, fb(), INJECTION_POINT, P_INCOMPLETE_SPLIT, P_LEFTMOST, and P_RIGHTMOST.

Referenced by _bt_getstackbuf(), _bt_moveright(), and _bt_stepright().

◆ _bt_first()

bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)
extern

Definition at line 883 of file nbtsearch.c.

884{
885 Relation rel = scan->indexRelation;
887 OffsetNumber offnum;
888 BTScanInsertData inskey;
891 int keysz = 0;
895
896 Assert(!BTScanPosIsValid(so->currPos));
897
898 /*
899 * Examine the scan keys and eliminate any redundant keys; also mark the
900 * keys that must be matched to continue the scan.
901 */
903
904 /*
905 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
906 * never be satisfied (eg, x == 1 AND x > 2).
907 */
908 if (!so->qual_ok)
909 {
910 Assert(!so->needPrimScan);
911 _bt_parallel_done(scan);
912 return false;
913 }
914
915 /*
916 * If this is a parallel scan, we must seize the scan. _bt_readfirstpage
917 * will likely release the parallel scan later on.
918 */
919 if (scan->parallel_scan != NULL &&
920 !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
921 return false;
922
923 /*
924 * Initialize the scan's arrays (if any) for the current scan direction
925 * (except when they were already set to later values as part of
926 * scheduling the primitive index scan that is now underway)
927 */
928 if (so->numArrayKeys && !so->needPrimScan)
929 _bt_start_array_keys(scan, dir);
930
931 if (blkno != InvalidBlockNumber)
932 {
933 /*
 934 * We anticipated calling _bt_search, but another worker beat us to it.
935 * _bt_readnextpage releases the scan for us (not _bt_readfirstpage).
936 */
937 Assert(scan->parallel_scan != NULL);
938 Assert(!so->needPrimScan);
939 Assert(blkno != P_NONE);
940
941 if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
942 return false;
943
944 _bt_returnitem(scan, so);
945 return true;
946 }
947
948 /*
949 * Count an indexscan for stats, now that we know that we'll call
950 * _bt_search/_bt_endpoint below
951 */
953 if (scan->instrument)
954 scan->instrument->nsearches++;
955
956 /*----------
957 * Examine the scan keys to discover where we need to start the scan.
958 * The selected scan keys (at most one per index column) are remembered by
959 * storing their addresses into the local startKeys[] array. The final
960 * startKeys[] entry's strategy is set in strat_total. (Actually, there
961 * are a couple of cases where we force a less/more restrictive strategy.)
962 *
963 * We must use the key that was marked required (in the direction opposite
964 * our own scan's) during preprocessing. Each index attribute can only
965 * have one such required key. In general, the keys that we use to find
966 * an initial position when scanning forwards are the same keys that end
967 * the scan on the leaf level when scanning backwards (and vice-versa).
968 *
969 * When the scan keys include cross-type operators, _bt_preprocess_keys
970 * may not be able to eliminate redundant keys; in such cases it will
971 * arbitrarily pick a usable key for each attribute (and scan direction),
972 * ensuring that there is no more than one key required in each direction.
973 * We stop considering further keys once we reach the first nonrequired
974 * key (which must come after all required keys), so this can't affect us.
975 *
976 * The required keys that we use as starting boundaries have to be =, >,
977 * or >= keys for a forward scan or =, <, <= keys for a backwards scan.
978 * We can use keys for multiple attributes so long as the prior attributes
979 * had only =, >= (resp. =, <=) keys. These rules are very similar to the
980 * rules that preprocessing used to determine which keys to mark required.
981 * We cannot always use every required key as a positioning key, though.
982 * Skip arrays necessitate independently applying our own rules here.
983 * Skip arrays are always generally considered = array keys, but we'll
984 * nevertheless treat them as inequalities at certain points of the scan.
985 * When that happens, it _might_ have implications for the number of
986 * required keys that we can safely use for initial positioning purposes.
987 *
988 * For example, a forward scan with a skip array on its leading attribute
989 * (with no low_compare/high_compare) will have at least two required scan
990 * keys, but we won't use any of them as boundary keys during the scan's
991 * initial call here. Our positioning key during the first call here can
992 * be thought of as representing "> -infinity". Similarly, if such a skip
993 * array's low_compare is "a > 'foo'", then we position using "a > 'foo'"
994 * during the scan's initial call here; a lower-order key such as "b = 42"
995 * can't be used until the "a" array advances beyond MINVAL/low_compare.
996 *
997 * On the other hand, if such a skip array's low_compare was "a >= 'foo'",
998 * then we _can_ use "a >= 'foo' AND b = 42" during the initial call here.
999 * A subsequent call here might have us use "a = 'fop' AND b = 42". Note
1000 * that we treat = and >= as equivalent when scanning forwards (just as we
1001 * treat = and <= as equivalent when scanning backwards). We effectively
1002 * do the same thing (though with a distinct "a" element/value) each time.
1003 *
1004 * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP
1005 * array keys whose array is "null_elem=true") imply a NOT NULL qualifier.
1006 * If the index stores nulls at the end of the index we'll be starting
1007 * from, and we have no boundary key for the column (which means the key
1008 * we deduced NOT NULL from is an inequality key that constrains the other
1009 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
1010 * use as a boundary key. If we didn't do this, we might find ourselves
1011 * traversing a lot of null entries at the start of the scan.
1012 *
1013 * In this loop, row-comparison keys are treated the same as keys on their
1014 * first (leftmost) columns. We'll add all lower-order columns of the row
1015 * comparison that were marked required during preprocessing below.
1016 *
1017 * _bt_advance_array_keys needs to know exactly how we'll reposition the
1018 * scan (should it opt to schedule another primitive index scan). It is
1019 * critical that primscans only be scheduled when they'll definitely make
1020 * some useful progress. _bt_advance_array_keys does this by calling
1021 * _bt_checkkeys routines that report whether a tuple is past the end of
1022 * matches for the scan's keys (given the scan's current array elements).
1023 * If the page's final tuple is "after the end of matches" for a scan that
1024 * uses the *opposite* scan direction, then it must follow that it's also
1025 * "before the start of matches" for the actual current scan direction.
1026 * It is therefore essential that all of our initial positioning rules are
1027 * symmetric with _bt_checkkeys's corresponding continuescan=false rule.
1028 * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
1029 * need to be kept in sync.
1030 *----------
1031 */
1032 if (so->numberOfKeys > 0)
1033 {
1035 ScanKey bkey;
1037 ScanKey cur;
1038
1039 /*
1040 * bkey will be set to the key that preprocessing left behind as the
1041 * boundary key for this attribute, in this scan direction (if any)
1042 */
1043 cur = so->keyData;
1044 curattr = 1;
1045 bkey = NULL;
1046 /* Also remember any scankey that implies a NOT NULL constraint */
1047 impliesNN = NULL;
1048
1049 /*
1050 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
1051 * pass to handle after-last-key processing. Actual exit from the
1052 * loop is at one of the "break" statements below.
1053 */
1054 for (int i = 0;; cur++, i++)
1055 {
1056 if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1057 {
1058 /* Done looking for the curattr boundary key */
1059 Assert(bkey == NULL ||
1060 (bkey->sk_attno == curattr &&
1061 (bkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
1062 Assert(impliesNN == NULL ||
1063 (impliesNN->sk_attno == curattr &&
1064 (impliesNN->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
1065
1066 /*
1067 * If this is a scan key for a skip array whose current
1068 * element is MINVAL, choose low_compare (when scanning
1069 * backwards it'll be MAXVAL, and we'll choose high_compare).
1070 *
1071 * Note: if the array's low_compare key makes 'bkey' NULL,
1072 * then we behave as if the array's first element is -inf,
1073 * except when !array->null_elem implies a usable NOT NULL
1074 * constraint.
1075 */
1076 if (bkey != NULL &&
1077 (bkey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))
1078 {
1079 int ikey = bkey - so->keyData;
1081 BTArrayKeyInfo *array = NULL;
1082
1083 for (int arridx = 0; arridx < so->numArrayKeys; arridx++)
1084 {
1085 array = &so->arrayKeys[arridx];
1086 if (array->scan_key == ikey)
1087 break;
1088 }
1089
1090 if (ScanDirectionIsForward(dir))
1091 {
1092 Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL));
1093 bkey = array->low_compare;
1094 }
1095 else
1096 {
1097 Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL));
1098 bkey = array->high_compare;
1099 }
1100
1101 Assert(bkey == NULL ||
1102 bkey->sk_attno == skipequalitykey->sk_attno);
1103
1104 if (!array->null_elem)
1106 else
1107 Assert(bkey == NULL && impliesNN == NULL);
1108 }
1109
1110 /*
1111 * If we didn't find a usable boundary key, see if we can
1112 * deduce a NOT NULL key
1113 */
1114 if (bkey == NULL && impliesNN != NULL &&
1115 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1118 {
1119 /* Final startKeys[] entry will be deduced NOT NULL key */
1120 bkey = &notnullkey;
1123 (impliesNN->sk_flags &
1125 curattr,
1128 InvalidOid,
1129 InvalidOid,
1130 InvalidOid,
1131 (Datum) 0);
1132 }
1133
1134 /*
1135 * If preprocessing didn't leave a usable boundary key, quit;
1136 * else save the boundary key pointer in startKeys[]
1137 */
1138 if (bkey == NULL)
1139 break;
1140 startKeys[keysz++] = bkey;
1141
1142 /*
1143 * We can only consider adding more boundary keys when the one
1144 * that we just chose to add uses either the = or >= strategy
1145 * (during backwards scans we can only do so when the key that
1146 * we just added to startKeys[] uses the = or <= strategy)
1147 */
1148 strat_total = bkey->sk_strategy;
1151 break;
1152
1153 /*
1154 * If the key that we just added to startKeys[] is a skip
1155 * array = key whose current element is marked NEXT or PRIOR,
1156 * make strat_total > or < (and stop adding boundary keys).
1157 * This can only happen with opclasses that lack skip support.
1158 */
1159 if (bkey->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))
1160 {
1161 Assert(bkey->sk_flags & SK_BT_SKIP);
1163
1164 if (ScanDirectionIsForward(dir))
1165 {
1166 Assert(!(bkey->sk_flags & SK_BT_PRIOR));
1168 }
1169 else
1170 {
1171 Assert(!(bkey->sk_flags & SK_BT_NEXT));
1173 }
1174
1175 /*
1176 * We're done. We'll never find an exact = match for a
1177 * NEXT or PRIOR sentinel sk_argument value. There's no
1178 * sense in trying to add more keys to startKeys[].
1179 */
1180 break;
1181 }
1182
1183 /*
1184 * Done if that was the last scan key output by preprocessing.
1185 * Also done if we've now examined all keys marked required.
1186 */
1187 if (i >= so->numberOfKeys ||
1188 !(cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
1189 break;
1190
1191 /*
1192 * Reset for next attr.
1193 */
1194 Assert(cur->sk_attno == curattr + 1);
1195 curattr = cur->sk_attno;
1196 bkey = NULL;
1197 impliesNN = NULL;
1198 }
1199
1200 /*
1201 * If we've located the starting boundary key for curattr, we have
1202 * no interest in curattr's other required key
1203 */
1204 if (bkey != NULL)
1205 continue;
1206
1207 /*
1208 * Is this key the starting boundary key for curattr?
1209 *
1210 * If not, does it imply a NOT NULL constraint? (Because
1211 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1212 * *any* inequality key works for that; we need not test.)
1213 */
1214 switch (cur->sk_strategy)
1215 {
1218 if (ScanDirectionIsBackward(dir))
1219 bkey = cur;
1220 else if (impliesNN == NULL)
1221 impliesNN = cur;
1222 break;
1224 bkey = cur;
1225 break;
1228 if (ScanDirectionIsForward(dir))
1229 bkey = cur;
1230 else if (impliesNN == NULL)
1231 impliesNN = cur;
1232 break;
1233 }
1234 }
1235 }
1236
1237 /*
1238 * If we found no usable boundary keys, we have to start from one end of
1239 * the tree. Walk down that edge to the first or last key, and scan from
1240 * there.
1241 *
1242 * Note: calls _bt_readfirstpage for us, which releases the parallel scan.
1243 */
1244 if (keysz == 0)
1245 return _bt_endpoint(scan, dir);
1246
1247 /*
1248 * We want to start the scan somewhere within the index. Set up an
1249 * insertion scankey we can use to search for the boundary point we
1250 * identified above. The insertion scankey is built using the keys
1251 * identified by startKeys[]. (Remaining insertion scankey fields are
1252 * initialized after initial-positioning scan keys are finalized.)
1253 */
1254 Assert(keysz <= INDEX_MAX_KEYS);
1255 for (int i = 0; i < keysz; i++)
1256 {
1258
1259 Assert(bkey->sk_attno == i + 1);
1260
1261 if (bkey->sk_flags & SK_ROW_HEADER)
1262 {
1263 /*
1264 * Row comparison header: look to the first row member instead
1265 */
1266 ScanKey subkey = (ScanKey) DatumGetPointer(bkey->sk_argument);
1267 bool loosen_strat = false,
1268 tighten_strat = false;
1269
1270 /*
1271 * Cannot be a NULL in the first row member: _bt_preprocess_keys
1272 * would've marked the qual as unsatisfiable, preventing us from
1273 * ever getting this far
1274 */
1275 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1276 Assert(subkey->sk_attno == bkey->sk_attno);
1277 Assert(!(subkey->sk_flags & SK_ISNULL));
1278
1279 /*
1280 * This is either a > or >= key (during backwards scans it is
1281 * either < or <=) that was marked required during preprocessing.
1282 * Later so->keyData[] keys can't have been marked required, so
1283 * our row compare header key must be the final startKeys[] entry.
1284 */
1285 Assert(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD));
1286 Assert(subkey->sk_strategy == bkey->sk_strategy);
1287 Assert(subkey->sk_strategy == strat_total);
1288 Assert(i == keysz - 1);
1289
1290 /*
1291 * The member scankeys are already in insertion format (ie, they
1292 * have sk_func = 3-way-comparison function)
1293 */
1294 memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1295
1296 /*
1297 * Now look to later row compare members.
1298 *
1299 * If there's an "index attribute gap" between two row compare
1300 * members, the second member won't have been marked required, and
1301 * so can't be used as a starting boundary key here. The part of
1302 * the row comparison that we do still use has to be treated as a
1303 * ">=" or "<=" condition. For example, a qual "(a, c) > (1, 42)"
1304 * with an omitted intervening index attribute "b" will use an
1305 * insertion scan key "a >= 1". Even the first "a = 1" tuple on
1306 * the leaf level might satisfy the row compare qual.
1307 *
1308 * We're able to use a _more_ restrictive strategy when we reach a
1309 * NULL row compare member, since they're always unsatisfiable.
1310 * For example, a qual "(a, b, c) >= (1, NULL, 77)" will use an
1311 * insertion scan key "a > 1". All tuples where "a = 1" cannot
1312 * possibly satisfy the row compare qual, so this is safe.
1313 */
1314 Assert(!(subkey->sk_flags & SK_ROW_END));
1315 for (;;)
1316 {
1317 subkey++;
1318 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1319
1320 if (subkey->sk_flags & SK_ISNULL)
1321 {
1322 /*
1323 * NULL member key, can only use earlier keys.
1324 *
1325 * We deliberately avoid checking if this key is marked
1326 * required. All earlier keys are required, and this key
1327 * is unsatisfiable either way, so we can't miss anything.
1328 */
1329 tighten_strat = true;
1330 break;
1331 }
1332
1333 if (!(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
1334 {
1335 /* nonrequired member key, can only use earlier keys */
1336 loosen_strat = true;
1337 break;
1338 }
1339
1340 Assert(subkey->sk_attno == keysz + 1);
1341 Assert(subkey->sk_strategy == bkey->sk_strategy);
1342 Assert(keysz < INDEX_MAX_KEYS);
1343
1344 memcpy(inskey.scankeys + keysz, subkey, sizeof(ScanKeyData));
1345 keysz++;
1346
1347 if (subkey->sk_flags & SK_ROW_END)
1348 break;
1349 }
1351 if (loosen_strat)
1352 {
1353 /* Use less restrictive strategy (and fewer member keys) */
1354 switch (strat_total)
1355 {
1358 break;
1361 break;
1362 }
1363 }
1364 if (tighten_strat)
1365 {
1366 /* Use more restrictive strategy (and fewer member keys) */
1367 switch (strat_total)
1368 {
1371 break;
1374 break;
1375 }
1376 }
1377
1378 /* Done (row compare header key is always last startKeys[] key) */
1379 break;
1380 }
1381
1382 /*
1383 * Ordinary comparison key/search-style key.
1384 *
1385 * Transform the search-style scan key to an insertion scan key by
1386 * replacing the sk_func with the appropriate btree 3-way-comparison
1387 * function.
1388 *
1389 * If scankey operator is not a cross-type comparison, we can use the
1390 * cached comparison function; otherwise gotta look it up in the
1391 * catalogs. (That can't lead to infinite recursion, since no
1392 * indexscan initiated by syscache lookup will use cross-data-type
1393 * operators.)
1394 *
1395 * We support the convention that sk_subtype == InvalidOid means the
1396 * opclass input type; this hack simplifies life for ScanKeyInit().
1397 */
1398 if (bkey->sk_subtype == rel->rd_opcintype[i] ||
1399 bkey->sk_subtype == InvalidOid)
1400 {
1402
1403 procinfo = index_getprocinfo(rel, bkey->sk_attno, BTORDER_PROC);
1405 bkey->sk_flags,
1406 bkey->sk_attno,
1408 bkey->sk_subtype,
1409 bkey->sk_collation,
1410 procinfo,
1411 bkey->sk_argument);
1412 }
1413 else
1414 {
1415 RegProcedure cmp_proc;
1416
1417 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1418 rel->rd_opcintype[i],
1419 bkey->sk_subtype, BTORDER_PROC);
1420 if (!RegProcedureIsValid(cmp_proc))
1421 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1422 BTORDER_PROC, rel->rd_opcintype[i], bkey->sk_subtype,
1423 bkey->sk_attno, RelationGetRelationName(rel));
1425 bkey->sk_flags,
1426 bkey->sk_attno,
1428 bkey->sk_subtype,
1429 bkey->sk_collation,
1430 cmp_proc,
1431 bkey->sk_argument);
1432 }
1433 }
1434
1435 /*----------
1436 * Examine the selected initial-positioning strategy to determine exactly
1437 * where we need to start the scan, and set flag variables to control the
1438 * initial descent by _bt_search (and our _bt_binsrch call for the leaf
1439 * page _bt_search returns).
1440 *----------
1441 */
1442 _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1443 inskey.anynullkeys = false; /* unused */
1444 inskey.scantid = NULL;
1445 inskey.keysz = keysz;
1446 switch (strat_total)
1447 {
1449
1450 inskey.nextkey = false;
1451 inskey.backward = true;
1452 break;
1453
1455
1456 inskey.nextkey = true;
1457 inskey.backward = true;
1458 break;
1459
1461
1462 /*
1463 * If a backward scan was specified, need to start with last equal
1464 * item not first one.
1465 */
1466 if (ScanDirectionIsBackward(dir))
1467 {
1468 /*
1469 * This is the same as the <= strategy
1470 */
1471 inskey.nextkey = true;
1472 inskey.backward = true;
1473 }
1474 else
1475 {
1476 /*
1477 * This is the same as the >= strategy
1478 */
1479 inskey.nextkey = false;
1480 inskey.backward = false;
1481 }
1482 break;
1483
1485
1486 /*
1487 * Find first item >= scankey
1488 */
1489 inskey.nextkey = false;
1490 inskey.backward = false;
1491 break;
1492
1494
1495 /*
1496 * Find first item > scankey
1497 */
1498 inskey.nextkey = true;
1499 inskey.backward = false;
1500 break;
1501
1502 default:
1503 /* can't get here, but keep compiler quiet */
1504 elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1505 return false;
1506 }
1507
1508 /*
1509 * Use the manufactured insertion scan key to descend the tree and
1510 * position ourselves on the target leaf page.
1511 */
1512 Assert(ScanDirectionIsBackward(dir) == inskey.backward);
1513 _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ, false);
1514
1515 if (!BufferIsValid(so->currPos.buf))
1516 {
1517 Assert(!so->needPrimScan);
1518
1519 /*
1520 * We only get here if the index is completely empty. Lock relation
1521 * because nothing finer to lock exists. Without a buffer lock, it's
1522 * possible for another transaction to insert data between
1523 * _bt_search() and PredicateLockRelation(). We have to try again
1524 * after taking the relation-level predicate lock, to close a narrow
1525 * window where we wouldn't scan concurrently inserted tuples, but the
1526 * writer wouldn't see our predicate lock.
1527 */
1529 {
1531 _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ, false);
1532 }
1533
1534 if (!BufferIsValid(so->currPos.buf))
1535 {
1536 _bt_parallel_done(scan);
1537 return false;
1538 }
1539 }
1540
1541 /* position to the precise item on the page */
1542 offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);
1543
1544 /*
1545 * Now load data from the first page of the scan (usually the page
1546 * currently in so->currPos.buf).
1547 *
1548 * If inskey.nextkey = false and inskey.backward = false, offnum is
1549 * positioned at the first non-pivot tuple >= inskey.scankeys.
1550 *
1551 * If inskey.nextkey = false and inskey.backward = true, offnum is
1552 * positioned at the last non-pivot tuple < inskey.scankeys.
1553 *
1554 * If inskey.nextkey = true and inskey.backward = false, offnum is
1555 * positioned at the first non-pivot tuple > inskey.scankeys.
1556 *
1557 * If inskey.nextkey = true and inskey.backward = true, offnum is
1558 * positioned at the last non-pivot tuple <= inskey.scankeys.
1559 *
1560 * It's possible that _bt_binsrch returned an offnum that is out of bounds
1561 * for the page. For example, when inskey is both < the leaf page's high
1562 * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
1563 */
1564 if (!_bt_readfirstpage(scan, offnum, dir))
1565 return false;
1566
1567 _bt_returnitem(scan, so);
1568 return true;
1569}
int16 AttrNumber
Definition attnum.h:21
#define RegProcedureIsValid(p)
Definition c.h:862
regproc RegProcedure
Definition c.h:734
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition indexam.c:885
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition nbtpage.c:744
void _bt_preprocess_keys(IndexScanDesc scan)
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
Definition nbtree.c:870
void _bt_parallel_done(IndexScanDesc scan)
Definition nbtree.c:1035
#define BTORDER_PROC
Definition nbtree.h:717
#define SK_BT_PRIOR
Definition nbtree.h:1112
#define SK_BT_NEXT
Definition nbtree.h:1111
#define BTScanPosIsValid(scanpos)
Definition nbtree.h:1021
#define P_NONE
Definition nbtree.h:213
#define SK_BT_REQBKWD
Definition nbtree.h:1105
#define SK_BT_MAXVAL
Definition nbtree.h:1110
#define BT_READ
Definition nbtree.h:730
#define SK_BT_MINVAL
Definition nbtree.h:1109
BTScanOpaqueData * BTScanOpaque
Definition nbtree.h:1097
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
Definition nbtsearch.c:1840
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition nbtsearch.c:344
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition nbtsearch.c:2178
static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
Definition nbtsearch.c:1747
static void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
Definition nbtsearch.c:1622
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access, bool returnstack)
Definition nbtsearch.c:100
#define INDEX_MAX_KEYS
#define pgstat_count_index_scan(rel)
Definition pgstat.h:732
#define InvalidOid
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition predicate.c:2506
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition scankey.c:32
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition scankey.c:101
#define ScanDirectionIsBackward(direction)
Definition sdir.h:50
#define SK_ROW_HEADER
Definition skey.h:117
#define SK_ROW_MEMBER
Definition skey.h:118
#define SK_SEARCHNOTNULL
Definition skey.h:122
#define SK_ROW_END
Definition skey.h:119
ScanKeyData * ScanKey
Definition skey.h:75
uint16 StrategyNumber
Definition stratnum.h:22
#define BTGreaterStrategyNumber
Definition stratnum.h:33
#define InvalidStrategy
Definition stratnum.h:24
#define BTLessStrategyNumber
Definition stratnum.h:29
#define BTLessEqualStrategyNumber
Definition stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition stratnum.h:32
ScanKey high_compare
Definition nbtree.h:1050
ScanKey low_compare
Definition nbtree.h:1049
bool allequalimage
Definition nbtree.h:798
ScanKeyData scankeys[INDEX_MAX_KEYS]
Definition nbtree.h:804
struct ParallelIndexScanDescData * parallel_scan
Definition relscan.h:204
struct IndexScanInstrumentation * instrument
Definition relscan.h:172
Relation indexRelation
Definition relscan.h:150
struct SnapshotData * xs_snapshot
Definition relscan.h:151
int sk_flags
Definition skey.h:66
#define IsolationIsSerializable()
Definition xact.h:53

References _bt_binsrch(), _bt_endpoint(), _bt_metaversion(), _bt_parallel_done(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readfirstpage(), _bt_readnextpage(), _bt_returnitem(), _bt_search(), _bt_start_array_keys(), BTScanInsertData::allequalimage, BTScanInsertData::anynullkeys, Assert, BTScanInsertData::backward, BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosIsValid, BufferIsValid(), cur, DatumGetPointer(), elog, ERROR, fb(), get_opfamily_proc(), BTScanInsertData::heapkeyspace, BTArrayKeyInfo::high_compare, i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, IndexScanDescData::instrument, InvalidBlockNumber, InvalidOid, InvalidStrategy, IsolationIsSerializable, BTScanInsertData::keysz, BTArrayKeyInfo::low_compare, memcpy(), BTScanInsertData::nextkey, IndexScanInstrumentation::nsearches, BTArrayKeyInfo::null_elem, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockRelation(), RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, BTArrayKeyInfo::scan_key, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), BTScanInsertData::scankeys, BTScanInsertData::scantid, SK_BT_DESC, SK_BT_MAXVAL, SK_BT_MINVAL, SK_BT_NEXT, SK_BT_NULLS_FIRST, SK_BT_PRIOR, SK_BT_REQBKWD, SK_BT_REQFWD, SK_BT_SKIP, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, and IndexScanDescData::xs_snapshot.

Referenced by btgetbitmap(), and btgettuple().

◆ _bt_form_posting()

IndexTuple _bt_form_posting ( IndexTuple  base,
const ItemPointerData htids,
int  nhtids 
)
extern

Definition at line 864 of file nbtdedup.c.

865{
866 uint32 keysize,
867 newsize;
868 IndexTuple itup;
869
870 if (BTreeTupleIsPosting(base))
871 keysize = BTreeTupleGetPostingOffset(base);
872 else
873 keysize = IndexTupleSize(base);
874
876 Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
877 Assert(keysize == MAXALIGN(keysize));
878
879 /* Determine final size of new tuple */
880 if (nhtids > 1)
881 newsize = MAXALIGN(keysize +
882 nhtids * sizeof(ItemPointerData));
883 else
884 newsize = keysize;
885
888
889 /* Allocate memory using palloc0() (matches index_form_tuple()) */
890 itup = palloc0(newsize);
891 memcpy(itup, base, keysize);
892 itup->t_info &= ~INDEX_SIZE_MASK;
893 itup->t_info |= newsize;
894 if (nhtids > 1)
895 {
896 /* Form posting list tuple */
897 BTreeTupleSetPosting(itup, nhtids, keysize);
898 memcpy(BTreeTupleGetPosting(itup), htids,
899 sizeof(ItemPointerData) * nhtids);
901 }
902 else
903 {
904 /* Form standard non-pivot tuple */
906 ItemPointerCopy(htids, &itup->t_tid);
908 }
909
910 return itup;
911}
#define PG_UINT16_MAX
Definition c.h:671
static void ItemPointerCopy(const ItemPointerData *fromPointer, ItemPointerData *toPointer)
Definition itemptr.h:172
static bool ItemPointerIsValid(const ItemPointerData *pointer)
Definition itemptr.h:83
void * palloc0(Size size)
Definition mcxt.c:1417
static void BTreeTupleSetPosting(IndexTuple itup, uint16 nhtids, int postingoffset)
Definition nbtree.h:505
unsigned short t_info
Definition itup.h:49

References Assert, BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTreeTupleSetPosting(), fb(), INDEX_SIZE_MASK, IndexTupleSize(), ItemPointerCopy(), ItemPointerIsValid(), MAXALIGN, memcpy(), palloc0(), PG_UINT16_MAX, IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_dedup_finish_pending(), _bt_sort_dedup_finish_pending(), and bt_posting_plain_tuple().

◆ _bt_get_endpoint()

Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost 
)
extern

Definition at line 2092 of file nbtsearch.c.

2093{
2094 Buffer buf;
2095 Page page;
2096 BTPageOpaque opaque;
2097 OffsetNumber offnum;
2098 BlockNumber blkno;
2099 IndexTuple itup;
2100
2101 /*
2102 * If we are looking for a leaf page, okay to descend from fast root;
2103 * otherwise better descend from true root. (There is no point in being
2104 * smarter about intermediate levels.)
2105 */
2106 if (level == 0)
2107 buf = _bt_getroot(rel, NULL, BT_READ);
2108 else
2109 buf = _bt_gettrueroot(rel);
2110
2111 if (!BufferIsValid(buf))
2112 return InvalidBuffer;
2113
2114 page = BufferGetPage(buf);
2115 opaque = BTPageGetOpaque(page);
2116
2117 for (;;)
2118 {
2119 /*
2120 * If we landed on a deleted page, step right to find a live page
2121 * (there must be one). Also, if we want the rightmost page, step
2122 * right if needed to get to it (this could happen if the page split
2123 * since we obtained a pointer to it).
2124 */
2125 while (P_IGNORE(opaque) ||
2126 (rightmost && !P_RIGHTMOST(opaque)))
2127 {
2128 blkno = opaque->btpo_next;
2129 if (blkno == P_NONE)
2130 elog(ERROR, "fell off the end of index \"%s\"",
2132 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2133 page = BufferGetPage(buf);
2134 opaque = BTPageGetOpaque(page);
2135 }
2136
2137 /* Done? */
2138 if (opaque->btpo_level == level)
2139 break;
2140 if (opaque->btpo_level < level)
2141 ereport(ERROR,
2143 errmsg_internal("btree level %u not found in index \"%s\"",
2144 level, RelationGetRelationName(rel))));
2145
2146 /* Descend to leftmost or rightmost child page */
2147 if (rightmost)
2148 offnum = PageGetMaxOffsetNumber(page);
2149 else
2150 offnum = P_FIRSTDATAKEY(opaque);
2151
2153 elog(PANIC, "offnum out of range");
2154
2155 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2156 blkno = BTreeTupleGetDownLink(itup);
2157
2158 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2159 page = BufferGetPage(buf);
2160 opaque = BTPageGetOpaque(page);
2161 }
2162
2163 return buf;
2164}
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition nbtpage.c:1008
Buffer _bt_gettrueroot(Relation rel)
Definition nbtpage.c:585
Buffer _bt_getroot(Relation rel, Relation heaprel, int access)
Definition nbtpage.c:347
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition nbtree.h:557
BlockNumber btpo_next
Definition nbtree.h:66
uint32 btpo_level
Definition nbtree.h:67

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), buf, BufferGetPage(), BufferIsValid(), elog, ereport, errcode(), errmsg_internal(), ERROR, fb(), InvalidBuffer, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), PANIC, and RelationGetRelationName.

Referenced by _bt_endpoint(), and _bt_insert_parent().

◆ _bt_getbuf()

Buffer _bt_getbuf ( Relation  rel,
BlockNumber  blkno,
int  access 
)
extern

Definition at line 850 of file nbtpage.c.

851{
852 Buffer buf;
853
855
856 /* Read an existing block of the relation */
857 buf = ReadBuffer(rel, blkno);
858 _bt_lockbuf(rel, buf, access);
859 _bt_checkpage(rel, buf);
860
861 return buf;
862}
void _bt_checkpage(Relation rel, Buffer buf)
Definition nbtpage.c:802
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition nbtpage.c:1067
short access

References _bt_checkpage(), _bt_lockbuf(), Assert, BlockNumberIsValid(), buf, and ReadBuffer().

Referenced by _bt_finish_split(), _bt_getroot(), _bt_getrootheight(), _bt_getstackbuf(), _bt_gettrueroot(), _bt_insertonpg(), _bt_killitems(), _bt_leftsib_splitflag(), _bt_lock_and_validate_left(), _bt_metaversion(), _bt_moveright(), _bt_newlevel(), _bt_pagedel(), _bt_readnextpage(), _bt_rightsib_halfdeadflag(), _bt_set_cleanup_info(), _bt_split(), _bt_unlink_halfdead_page(), and _bt_vacuum_needs_cleanup().

◆ _bt_getroot()

Buffer _bt_getroot ( Relation  rel,
Relation  heaprel,
int  access 
)
extern

Definition at line 347 of file nbtpage.c.

348{
354 uint32 rootlevel;
357
358 Assert(access == BT_READ || heaprel != NULL);
359
360 /*
361 * Try to use previously-cached metapage data to find the root. This
362 * normally saves one buffer access per index search, which is a very
363 * helpful savings in bufmgr traffic and hence contention.
364 */
365 if (rel->rd_amcache != NULL)
366 {
368 /* We shouldn't have cached it if any of these fail */
369 Assert(metad->btm_magic == BTREE_MAGIC);
370 Assert(metad->btm_version >= BTREE_MIN_VERSION);
371 Assert(metad->btm_version <= BTREE_VERSION);
372 Assert(!metad->btm_allequalimage ||
373 metad->btm_version > BTREE_NOVAC_VERSION);
374 Assert(metad->btm_root != P_NONE);
375
376 rootblkno = metad->btm_fastroot;
378 rootlevel = metad->btm_fastlevel;
379
383
384 /*
385 * Since the cache might be stale, we check the page more carefully
386 * here than normal. We *must* check that it's not deleted. If it's
387 * not alone on its level, then we reject too --- this may be overly
388 * paranoid but better safe than sorry. Note we don't check P_ISROOT,
389 * because that's not set in a "fast root".
390 */
391 if (!P_IGNORE(rootopaque) &&
392 rootopaque->btpo_level == rootlevel &&
395 {
396 /* OK, accept cached page as the root */
397 return rootbuf;
398 }
399 _bt_relbuf(rel, rootbuf);
400 /* Cache is stale, throw it away */
401 if (rel->rd_amcache)
402 pfree(rel->rd_amcache);
403 rel->rd_amcache = NULL;
404 }
405
407 metad = _bt_getmeta(rel, metabuf);
408
409 /* if no root page initialized yet, do it */
410 if (metad->btm_root == P_NONE)
411 {
412 Page metapg;
413
414 /* If access = BT_READ, caller doesn't want us to create root yet */
415 if (access == BT_READ)
416 {
417 _bt_relbuf(rel, metabuf);
418 return InvalidBuffer;
419 }
420
421 /* trade in our read lock for a write lock */
424
425 /*
426 * Race condition: if someone else initialized the metadata between
427 * the time we released the read lock and acquired the write lock, we
428 * must avoid doing it again.
429 */
430 if (metad->btm_root != P_NONE)
431 {
432 /*
433 * Metadata initialized by someone else. In order to guarantee no
434 * deadlocks, we have to release the metadata page and start all
435 * over again. (Is that really true? But it's hardly worth trying
436 * to optimize this case.)
437 */
438 _bt_relbuf(rel, metabuf);
439 return _bt_getroot(rel, heaprel, access);
440 }
441
442 /*
443 * Get, initialize, write, and leave a lock of the appropriate type on
444 * the new root page. Since this is the first page in the tree, it's
445 * a leaf as well as the root.
446 */
447 rootbuf = _bt_allocbuf(rel, heaprel);
451 rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
452 rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
453 rootopaque->btpo_level = 0;
454 rootopaque->btpo_cycleid = 0;
455 /* Get raw page pointer for metapage */
457
458 /* NO ELOG(ERROR) till meta is updated */
460
461 /* upgrade metapage if needed */
462 if (metad->btm_version < BTREE_NOVAC_VERSION)
464
465 metad->btm_root = rootblkno;
466 metad->btm_level = 0;
467 metad->btm_fastroot = rootblkno;
468 metad->btm_fastlevel = 0;
469 metad->btm_last_cleanup_num_delpages = 0;
470 metad->btm_last_cleanup_num_heap_tuples = -1.0;
471
474
475 /* XLOG stuff */
476 if (RelationNeedsWAL(rel))
477 {
480
484
485 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
486 md.version = metad->btm_version;
487 md.root = rootblkno;
488 md.level = 0;
489 md.fastroot = rootblkno;
490 md.fastlevel = 0;
492 md.allequalimage = metad->btm_allequalimage;
493
494 XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));
495
496 xlrec.rootblk = rootblkno;
497 xlrec.level = 0;
498
500
502 }
503 else
504 recptr = XLogGetFakeLSN(rel);
505
508
510
511 /*
512 * swap root write lock for read lock. There is no danger of anyone
513 * else accessing the new root page while it's unlocked, since no one
514 * else knows where it is yet.
515 */
518
519 /* okay, metadata is correct, release lock on it without caching */
520 _bt_relbuf(rel, metabuf);
521 }
522 else
523 {
524 rootblkno = metad->btm_fastroot;
526 rootlevel = metad->btm_fastlevel;
527
528 /*
529 * Cache the metapage data for next time
530 */
532 sizeof(BTMetaPageData));
533 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
534
535 /*
536 * We are done with the metapage; arrange to release it via first
537 * _bt_relandgetbuf call
538 */
540
541 for (;;)
542 {
546
547 if (!P_IGNORE(rootopaque))
548 break;
549
550 /* it's dead, Jim. step right one page */
552 elog(ERROR, "no live root page found in index \"%s\"",
554 rootblkno = rootopaque->btpo_next;
555 }
556
557 if (rootopaque->btpo_level != rootlevel)
558 elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
560 rootopaque->btpo_level, rootlevel);
561 }
562
563 /*
564 * By here, we have a pin and read lock on the root page, and no lock set
565 * on the metadata page. Return the root page's buffer.
566 */
567 return rootbuf;
568}
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition mcxt.c:1232
void _bt_upgrademetapage(Page page)
Definition nbtpage.c:108
Buffer _bt_allocbuf(Relation rel, Relation heaprel)
Definition nbtpage.c:874
static BTMetaPageData * _bt_getmeta(Relation rel, Buffer metabuf)
Definition nbtpage.c:143
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition nbtpage.c:1098
#define BTREE_MIN_VERSION
Definition nbtree.h:152
#define BTP_LEAF
Definition nbtree.h:77
#define BTREE_MAGIC
Definition nbtree.h:150
#define BTP_ROOT
Definition nbtree.h:78
#define SizeOfBtreeNewroot
Definition nbtxlog.h:347
#define XLOG_BTREE_NEWROOT
Definition nbtxlog.h:37
void * rd_amcache
Definition rel.h:229
MemoryContext rd_indexcxt
Definition rel.h:204
BlockNumber fastroot
Definition nbtxlog.h:51
uint32 fastlevel
Definition nbtxlog.h:52
BlockNumber root
Definition nbtxlog.h:49
uint32 last_cleanup_num_delpages
Definition nbtxlog.h:53
#define REGBUF_WILL_INIT
Definition xloginsert.h:34

References _bt_allocbuf(), _bt_getbuf(), _bt_getmeta(), _bt_getroot(), _bt_lockbuf(), _bt_relandgetbuf(), _bt_relbuf(), _bt_unlockbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert, BT_READ, BT_WRITE, BTP_LEAF, BTP_ROOT, BTPageGetOpaque, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, BufferGetBlockNumber(), BufferGetPage(), elog, END_CRIT_SECTION, ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, fb(), InvalidBuffer, xl_btree_metadata::last_cleanup_num_delpages, xl_btree_metadata::level, MarkBufferDirty(), memcpy(), MemoryContextAlloc(), P_IGNORE, P_LEFTMOST, P_NONE, P_RIGHTMOST, PageSetLSN(), pfree(), RelationData::rd_amcache, RelationData::rd_indexcxt, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, xl_btree_metadata::root, SizeOfBtreeNewroot, START_CRIT_SECTION, xl_btree_metadata::version, XLOG_BTREE_NEWROOT, XLogBeginInsert(), XLogGetFakeLSN(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_get_endpoint(), _bt_getroot(), and _bt_search().

◆ _bt_getrootheight()

int _bt_getrootheight ( Relation  rel)
extern

Definition at line 680 of file nbtpage.c.

681{
683
684 if (rel->rd_amcache == NULL)
685 {
687
689 metad = _bt_getmeta(rel, metabuf);
690
691 /*
692 * If there's no root page yet, _bt_getroot() doesn't expect a cache
693 * to be made, so just stop here and report the index height is zero.
694 * (XXX perhaps _bt_getroot() should be changed to allow this case.)
695 */
696 if (metad->btm_root == P_NONE)
697 {
698 _bt_relbuf(rel, metabuf);
699 return 0;
700 }
701
702 /*
703 * Cache the metapage data for next time
704 */
706 sizeof(BTMetaPageData));
707 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
708 _bt_relbuf(rel, metabuf);
709 }
710
711 /* Get cached page */
713 /* We shouldn't have cached it if any of these fail */
714 Assert(metad->btm_magic == BTREE_MAGIC);
715 Assert(metad->btm_version >= BTREE_MIN_VERSION);
716 Assert(metad->btm_version <= BTREE_VERSION);
717 Assert(!metad->btm_allequalimage ||
718 metad->btm_version > BTREE_NOVAC_VERSION);
719 Assert(metad->btm_fastroot != P_NONE);
720
721 return metad->btm_fastlevel;
722}

References _bt_getbuf(), _bt_getmeta(), _bt_relbuf(), Assert, BT_READ, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, fb(), memcpy(), MemoryContextAlloc(), P_NONE, RelationData::rd_amcache, and RelationData::rd_indexcxt.

Referenced by _bt_insertonpg(), and btgettreeheight().

◆ _bt_getstackbuf()

Buffer _bt_getstackbuf ( Relation  rel,
Relation  heaprel,
BTStack  stack,
BlockNumber  child 
)
extern

Definition at line 2351 of file nbtinsert.c.

2352{
2353 BlockNumber blkno;
2355
2356 blkno = stack->bts_blkno;
2357 start = stack->bts_offset;
2358
2359 for (;;)
2360 {
2361 Buffer buf;
2362 Page page;
2363 BTPageOpaque opaque;
2364
2365 buf = _bt_getbuf(rel, blkno, BT_WRITE);
2366 page = BufferGetPage(buf);
2367 opaque = BTPageGetOpaque(page);
2368
2369 Assert(heaprel != NULL);
2370 if (P_INCOMPLETE_SPLIT(opaque))
2371 {
2372 _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
2373 continue;
2374 }
2375
2376 if (!P_IGNORE(opaque))
2377 {
2378 OffsetNumber offnum,
2379 minoff,
2380 maxoff;
2381 ItemId itemid;
2382 IndexTuple item;
2383
2384 minoff = P_FIRSTDATAKEY(opaque);
2385 maxoff = PageGetMaxOffsetNumber(page);
2386
2387 /*
2388 * start = InvalidOffsetNumber means "search the whole page". We
2389 * need this test anyway due to possibility that page has a high
2390 * key now when it didn't before.
2391 */
2392 if (start < minoff)
2393 start = minoff;
2394
2395 /*
2396 * Need this check too, to guard against possibility that page
2397 * split since we visited it originally.
2398 */
2399 if (start > maxoff)
2400 start = OffsetNumberNext(maxoff);
2401
2402 /*
2403 * These loops will check every item on the page --- but in an
2404 * order that's attuned to the probability of where it actually
2405 * is. Scan to the right first, then to the left.
2406 */
2407 for (offnum = start;
2408 offnum <= maxoff;
2409 offnum = OffsetNumberNext(offnum))
2410 {
2411 itemid = PageGetItemId(page, offnum);
2412 item = (IndexTuple) PageGetItem(page, itemid);
2413
2414 if (BTreeTupleGetDownLink(item) == child)
2415 {
2416 /* Return accurate pointer to where link is now */
2417 stack->bts_blkno = blkno;
2418 stack->bts_offset = offnum;
2419 return buf;
2420 }
2421 }
2422
2423 for (offnum = OffsetNumberPrev(start);
2424 offnum >= minoff;
2425 offnum = OffsetNumberPrev(offnum))
2426 {
2427 itemid = PageGetItemId(page, offnum);
2428 item = (IndexTuple) PageGetItem(page, itemid);
2429
2430 if (BTreeTupleGetDownLink(item) == child)
2431 {
2432 /* Return accurate pointer to where link is now */
2433 stack->bts_blkno = blkno;
2434 stack->bts_offset = offnum;
2435 return buf;
2436 }
2437 }
2438 }
2439
2440 /*
2441 * The item we're looking for moved right at least one page.
2442 *
2443 * Lehman and Yao couple/chain locks when moving right here, which we
2444 * can avoid. See nbtree/README.
2445 */
2446 if (P_RIGHTMOST(opaque))
2447 {
2448 _bt_relbuf(rel, buf);
2449 return InvalidBuffer;
2450 }
2451 blkno = opaque->btpo_next;
2453 _bt_relbuf(rel, buf);
2454 }
2455}
return str start
void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
Definition nbtinsert.c:2272
#define OffsetNumberPrev(offsetNumber)
Definition off.h:54
BlockNumber bts_blkno
Definition nbtree.h:745
struct BTStackData * bts_parent
Definition nbtree.h:747
OffsetNumber bts_offset
Definition nbtree.h:746

References _bt_finish_split(), _bt_getbuf(), _bt_relbuf(), Assert, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, buf, BufferGetPage(), fb(), InvalidBuffer, InvalidOffsetNumber, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and start.

Referenced by _bt_insert_parent(), and _bt_lock_subtree_parent().

◆ _bt_gettrueroot()

Buffer _bt_gettrueroot ( Relation  rel)
extern

Definition at line 585 of file nbtpage.c.

586{
588 Page metapg;
594 uint32 rootlevel;
596
597 /*
598 * We don't try to use cached metapage data here, since (a) this path is
599 * not performance-critical, and (b) if we are here it suggests our cache
600 * is out-of-date anyway. In light of point (b), it's probably safest to
601 * actively flush any cached metapage info.
602 */
603 if (rel->rd_amcache)
604 pfree(rel->rd_amcache);
605 rel->rd_amcache = NULL;
606
611
612 if (!P_ISMETA(metaopaque) ||
613 metad->btm_magic != BTREE_MAGIC)
616 errmsg("index \"%s\" is not a btree",
618
619 if (metad->btm_version < BTREE_MIN_VERSION ||
620 metad->btm_version > BTREE_VERSION)
623 errmsg("version mismatch in index \"%s\": file version %d, "
624 "current version %d, minimal supported version %d",
626 metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
627
628 /* if no root page initialized yet, fail */
629 if (metad->btm_root == P_NONE)
630 {
631 _bt_relbuf(rel, metabuf);
632 return InvalidBuffer;
633 }
634
635 rootblkno = metad->btm_root;
636 rootlevel = metad->btm_level;
637
638 /*
639 * We are done with the metapage; arrange to release it via first
640 * _bt_relandgetbuf call
641 */
643
644 for (;;)
645 {
649
650 if (!P_IGNORE(rootopaque))
651 break;
652
653 /* it's dead, Jim. step right one page */
655 elog(ERROR, "no live root page found in index \"%s\"",
657 rootblkno = rootopaque->btpo_next;
658 }
659
660 if (rootopaque->btpo_level != rootlevel)
661 elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
663 rootopaque->btpo_level, rootlevel);
664
665 return rootbuf;
666}
#define P_ISMETA(opaque)
Definition nbtree.h:224

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTPageGetMeta, BTPageGetOpaque, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_VERSION, BufferGetPage(), elog, ereport, errcode(), errmsg, ERROR, fb(), InvalidBuffer, P_IGNORE, P_ISMETA, P_NONE, P_RIGHTMOST, pfree(), RelationData::rd_amcache, and RelationGetRelationName.

Referenced by _bt_get_endpoint().

◆ _bt_initmetapage()

void _bt_initmetapage ( Page  page,
BlockNumber  rootbknum,
uint32  level,
bool  allequalimage 
)
extern

Definition at line 68 of file nbtpage.c.

70{
73
74 _bt_pageinit(page, BLCKSZ);
75
76 metad = BTPageGetMeta(page);
77 metad->btm_magic = BTREE_MAGIC;
78 metad->btm_version = BTREE_VERSION;
79 metad->btm_root = rootbknum;
80 metad->btm_level = level;
81 metad->btm_fastroot = rootbknum;
82 metad->btm_fastlevel = level;
83 metad->btm_last_cleanup_num_delpages = 0;
84 metad->btm_last_cleanup_num_heap_tuples = -1.0;
85 metad->btm_allequalimage = allequalimage;
86
88 metaopaque->btpo_flags = BTP_META;
89
90 /*
91 * Set pd_lower just past the end of the metadata. This is essential,
92 * because without doing so, metadata will be lost if xlog.c compresses
93 * the page.
94 */
95 ((PageHeader) page)->pd_lower =
96 ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
97}
PageHeaderData * PageHeader
Definition bufpage.h:199
#define BTP_META
Definition nbtree.h:80

References _bt_pageinit(), BTP_META, BTPageGetMeta, BTPageGetOpaque, BTREE_MAGIC, BTREE_VERSION, and fb().

Referenced by _bt_uppershutdown(), and btbuildempty().

◆ _bt_keep_natts_fast()

int _bt_keep_natts_fast ( Relation  rel,
IndexTuple  lastleft,
IndexTuple  firstright 
)
extern

Definition at line 911 of file nbtutils.c.

912{
915 int keepnatts;
916
917 keepnatts = 1;
918 for (int attnum = 1; attnum <= keysz; attnum++)
919 {
920 Datum datum1,
921 datum2;
922 bool isNull1,
923 isNull2;
924 CompactAttribute *att;
925
926 datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
929
930 if (isNull1 != isNull2)
931 break;
932
933 if (!isNull1 &&
934 !datum_image_eq(datum1, datum2, att->attbyval, att->attlen))
935 break;
936
937 keepnatts++;
938 }
939
940 return keepnatts;
941}
bool datum_image_eq(Datum value1, Datum value2, bool typByVal, int typLen)
Definition datum.c:271
int16 attnum
static CompactAttribute * TupleDescCompactAttr(TupleDesc tupdesc, int i)
Definition tupdesc.h:195

References CompactAttribute::attbyval, CompactAttribute::attlen, attnum, datum_image_eq(), fb(), index_getattr(), IndexRelationGetNumberOfKeyAttributes, RelationGetDescr, and TupleDescCompactAttr().

Referenced by _bt_afternewitemoff(), _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_do_singleval(), _bt_keep_natts(), _bt_load(), _bt_set_startikey(), _bt_split_penalty(), and _bt_strategy().

◆ _bt_killitems()

void _bt_killitems ( IndexScanDesc  scan)
extern

Definition at line 191 of file nbtutils.c.

192{
193 Relation rel = scan->indexRelation;
195 Page page;
196 BTPageOpaque opaque;
197 OffsetNumber minoff;
198 OffsetNumber maxoff;
199 int numKilled = so->numKilled;
200 bool killedsomething = false;
201 Buffer buf;
202
203 Assert(numKilled > 0);
204 Assert(BTScanPosIsValid(so->currPos));
205 Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */
206
207 /* Always invalidate so->killedItems[] before leaving so->currPos */
208 so->numKilled = 0;
209
210 /*
211 * We need to iterate through so->killedItems[] in leaf page order; the
212 * loop below expects this (when marking posting list tuples, at least).
213 * so->killedItems[] is now in whatever order the scan returned items in.
214 * Scrollable cursor scans might have even saved the same item/TID twice.
215 *
216 * Sort and unique-ify so->killedItems[] to deal with all this.
217 */
218 if (numKilled > 1)
219 {
220 qsort(so->killedItems, numKilled, sizeof(int), _bt_compare_int);
221 numKilled = qunique(so->killedItems, numKilled, sizeof(int),
223 }
224
225 if (!so->dropPin)
226 {
227 /*
228 * We have held the pin on this page since we read the index tuples,
229 * so all we need to do is lock it. The pin will have prevented
230 * concurrent VACUUMs from recycling any of the TIDs on the page.
231 */
232 Assert(BTScanPosIsPinned(so->currPos));
233 buf = so->currPos.buf;
234 _bt_lockbuf(rel, buf, BT_READ);
235 }
236 else
237 {
239
240 Assert(!BTScanPosIsPinned(so->currPos));
241 buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
242
244 Assert(so->currPos.lsn <= latestlsn);
245 if (so->currPos.lsn != latestlsn)
246 {
247 /* Modified, give up on hinting */
248 _bt_relbuf(rel, buf);
249 return;
250 }
251
252 /* Unmodified, hinting is safe */
253 }
254
255 page = BufferGetPage(buf);
256 opaque = BTPageGetOpaque(page);
257 minoff = P_FIRSTDATAKEY(opaque);
258 maxoff = PageGetMaxOffsetNumber(page);
259
260 /* Iterate through so->killedItems[] in leaf page order */
261 for (int i = 0; i < numKilled; i++)
262 {
263 int itemIndex = so->killedItems[i];
264 BTScanPosItem *kitem = &so->currPos.items[itemIndex];
266
267 Assert(itemIndex >= so->currPos.firstItem &&
268 itemIndex <= so->currPos.lastItem);
269 Assert(i == 0 ||
270 offnum >= so->currPos.items[so->killedItems[i - 1]].indexOffset);
271
272 if (offnum < minoff)
273 continue; /* pure paranoia */
274 while (offnum <= maxoff)
275 {
276 ItemId iid = PageGetItemId(page, offnum);
278 bool killtuple = false;
279
281 {
282 int pi = i + 1;
284 int j;
285
286 /*
287 * Note that the page may have been modified in almost any way
288 * since we first read it (in the !so->dropPin case), so it's
289 * possible that this posting list tuple wasn't a posting list
290 * tuple when we first encountered its heap TIDs.
291 */
292 for (j = 0; j < nposting; j++)
293 {
295
296 if (!ItemPointerEquals(item, &kitem->heapTid))
297 break; /* out of posting list loop */
298
299 /*
300 * kitem must have matching offnum when heap TIDs match,
301 * though only in the common case where the page can't
302 * have been concurrently modified
303 */
304 Assert(kitem->indexOffset == offnum || !so->dropPin);
305
306 /*
307 * Read-ahead to later kitems here.
308 *
309 * We rely on the assumption that not advancing kitem here
310 * will prevent us from considering the posting list tuple
311 * fully dead by not matching its next heap TID in next
312 * loop iteration.
313 *
314 * If, on the other hand, this is the final heap TID in
315 * the posting list tuple, then tuple gets killed
316 * regardless (i.e. we handle the case where the last
317 * kitem is also the last heap TID in the last index tuple
318 * correctly -- posting tuple still gets killed).
319 */
320 if (pi < numKilled)
321 kitem = &so->currPos.items[so->killedItems[pi++]];
322 }
323
324 /*
325 * Don't bother advancing the outermost loop's int iterator to
326 * avoid processing killed items that relate to the same
327 * offnum/posting list tuple. This micro-optimization hardly
328 * seems worth it. (Further iterations of the outermost loop
329 * will fail to match on this same posting list's first heap
330 * TID instead, so we'll advance to the next offnum/index
331 * tuple pretty quickly.)
332 */
333 if (j == nposting)
334 killtuple = true;
335 }
336 else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
337 killtuple = true;
338
339 /*
340 * Mark index item as dead, if it isn't already. Since this
341 * happens while holding a buffer lock possibly in shared mode,
342 * it's possible that multiple processes attempt to do this
343 * simultaneously, leading to multiple full-page images being sent
344 * to WAL (if wal_log_hints or data checksums are enabled), which
345 * is undesirable.
346 */
347 if (killtuple && !ItemIdIsDead(iid))
348 {
349 if (!killedsomething)
350 {
351 /*
352 * Use the hint bit infrastructure to check if we can
353 * update the page while just holding a share lock. If we
354 * are not allowed, there's no point continuing.
355 */
357 goto unlock_page;
358 }
359
360 /* found the item/all posting list items */
362 killedsomething = true;
363 break; /* out of inner search loop */
364 }
365 offnum = OffsetNumberNext(offnum);
366 }
367 }
368
369 /*
370 * Since this can be redone later if needed, mark as dirty hint.
371 *
372 * Whenever we mark anything LP_DEAD, we also set the page's
373 * BTP_HAS_GARBAGE flag, which is likewise just a hint. (Note that we
374 * only rely on the page-level flag in !heapkeyspace indexes.)
375 */
376 if (killedsomething)
377 {
378 opaque->btpo_flags |= BTP_HAS_GARBAGE;
379 BufferFinishSetHintBits(buf, true, true);
380 }
381
383 if (!so->dropPin)
384 _bt_unlockbuf(rel, buf);
385 else
386 _bt_relbuf(rel, buf);
387}
void BufferFinishSetHintBits(Buffer buffer, bool mark_dirty, bool buffer_std)
Definition bufmgr.c:7070
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition bufmgr.c:4713
bool BufferBeginSetHintBits(Buffer buffer)
Definition bufmgr.c:7042
int j
Definition isn.c:78
#define ItemIdMarkDead(itemId)
Definition itemid.h:179
#define BTP_HAS_GARBAGE
Definition nbtree.h:83
static int _bt_compare_int(const void *va, const void *vb)
Definition nbtutils.c:153
static size_t qunique(void *array, size_t elements, size_t width, int(*compare)(const void *, const void *))
Definition qunique.h:21
OffsetNumber indexOffset
Definition nbtree.h:958
Relation heapRelation
Definition relscan.h:149

References _bt_compare_int(), _bt_getbuf(), _bt_lockbuf(), _bt_relbuf(), _bt_unlockbuf(), Assert, BT_READ, BTP_HAS_GARBAGE, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), BTScanPosIsPinned, BTScanPosIsValid, buf, BufferBeginSetHintBits(), BufferFinishSetHintBits(), BufferGetLSNAtomic(), BufferGetPage(), fb(), IndexScanDescData::heapRelation, i, BTScanPosItem::indexOffset, IndexScanDescData::indexRelation, ItemIdIsDead, ItemIdMarkDead, ItemPointerEquals(), j, OffsetNumberNext, IndexScanDescData::opaque, P_FIRSTDATAKEY, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), qsort, and qunique().

Referenced by _bt_steppage(), btendscan(), btrescan(), and btrestrpos().

◆ _bt_lockbuf()

void _bt_lockbuf ( Relation  rel,
Buffer  buf,
int  access 
)
extern

Definition at line 1067 of file nbtpage.c.

1068{
1069 /* LockBuffer() asserts that pin is held by this backend */
1071
1072 /*
1073 * It doesn't matter that _bt_unlockbuf() won't get called in the event of
1074 * an nbtree error (e.g. a unique violation error). That won't cause
1075 * Valgrind false positives.
1076 *
1077 * The nbtree client requests are superimposed on top of the bufmgr.c
1078 * buffer pin client requests. In the event of an nbtree error the buffer
1079 * will certainly get marked as defined when the backend once again
1080 * acquires its first pin on the buffer. (Of course, if the backend never
1081 * touches the buffer again then it doesn't matter that it remains
1082 * non-accessible to Valgrind.)
1083 *
1084 * Note: When an IndexTuple C pointer gets computed using an ItemId read
1085 * from a page while a lock was held, the C pointer becomes unsafe to
1086 * dereference forever as soon as the lock is released. Valgrind can only
1087 * detect cases where the pointer gets dereferenced with no _current_
1088 * lock/pin held, though.
1089 */
1090 if (!RelationUsesLocalBuffers(rel))
1092}
static void LockBuffer(Buffer buffer, BufferLockMode mode)
Definition bufmgr.h:334

References buf, BufferGetPage(), fb(), LockBuffer(), RelationUsesLocalBuffers, and VALGRIND_MAKE_MEM_DEFINED.

Referenced by _bt_getbuf(), _bt_getroot(), _bt_killitems(), _bt_moveright(), _bt_pagedel(), _bt_relandgetbuf(), _bt_search(), _bt_set_cleanup_info(), _bt_unlink_halfdead_page(), and btvacuumpage().

◆ _bt_metaversion()

void _bt_metaversion ( Relation  rel,
bool heapkeyspace,
bool allequalimage 
)
extern

Definition at line 744 of file nbtpage.c.

745{
747
748 if (rel->rd_amcache == NULL)
749 {
751
753 metad = _bt_getmeta(rel, metabuf);
754
755 /*
756 * If there's no root page yet, _bt_getroot() doesn't expect a cache
757 * to be made, so just stop here. (XXX perhaps _bt_getroot() should
758 * be changed to allow this case.)
759 */
760 if (metad->btm_root == P_NONE)
761 {
762 *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
763 *allequalimage = metad->btm_allequalimage;
764
765 _bt_relbuf(rel, metabuf);
766 return;
767 }
768
769 /*
770 * Cache the metapage data for next time
771 *
772 * An on-the-fly version upgrade performed by _bt_upgrademetapage()
773 * can change the nbtree version for an index without invalidating any
774 * local cache. This is okay because it can only happen when moving
775 * from version 2 to version 3, both of which are !heapkeyspace
776 * versions.
777 */
779 sizeof(BTMetaPageData));
780 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
781 _bt_relbuf(rel, metabuf);
782 }
783
784 /* Get cached page */
786 /* We shouldn't have cached it if any of these fail */
787 Assert(metad->btm_magic == BTREE_MAGIC);
788 Assert(metad->btm_version >= BTREE_MIN_VERSION);
789 Assert(metad->btm_version <= BTREE_VERSION);
790 Assert(!metad->btm_allequalimage ||
791 metad->btm_version > BTREE_NOVAC_VERSION);
792 Assert(metad->btm_fastroot != P_NONE);
793
794 *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
795 *allequalimage = metad->btm_allequalimage;
796}

References _bt_getbuf(), _bt_getmeta(), _bt_relbuf(), Assert, BT_READ, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, fb(), memcpy(), MemoryContextAlloc(), P_NONE, RelationData::rd_amcache, and RelationData::rd_indexcxt.

Referenced by _bt_first(), _bt_mkscankey(), and bt_index_check_callback().

◆ _bt_mkscankey()

BTScanInsert _bt_mkscankey ( Relation  rel,
IndexTuple  itup 
)
extern

Definition at line 61 of file nbtutils.c.

62{
66 int indnkeyatts;
68 int tupnatts;
69 int i;
70
74 tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0;
75
77
78 /*
79 * We'll execute search using scan key constructed on key columns.
80 * Truncated attributes and non-key attributes are omitted from the final
81 * scan key.
82 */
83 key = palloc(offsetof(BTScanInsertData, scankeys) +
84 sizeof(ScanKeyData) * indnkeyatts);
85 if (itup)
86 _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage);
87 else
88 {
89 /* Utility statement callers can set these fields themselves */
90 key->heapkeyspace = true;
91 key->allequalimage = false;
92 }
93 key->anynullkeys = false; /* initial assumption */
94 key->nextkey = false; /* usual case, required by btinsert */
95 key->backward = false; /* usual case, required by btinsert */
96 key->keysz = Min(indnkeyatts, tupnatts);
97 key->scantid = key->heapkeyspace && itup ?
99 skey = key->scankeys;
100 for (i = 0; i < indnkeyatts; i++)
101 {
103 Datum arg;
104 bool null;
105 int flags;
106
107 /*
108 * We can use the cached (default) support procs since no cross-type
109 * comparison can be needed.
110 */
112
113 /*
114 * Key arguments built from truncated attributes (or when caller
115 * provides no tuple) are defensively represented as NULL values. They
116 * should never be used.
117 */
118 if (i < tupnatts)
119 arg = index_getattr(itup, i + 1, itupdesc, &null);
120 else
121 {
122 arg = (Datum) 0;
123 null = true;
124 }
125 flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
127 flags,
128 (AttrNumber) (i + 1),
131 rel->rd_indcollation[i],
132 procinfo,
133 arg);
134 /* Record if any key attribute is NULL (or truncated) */
135 if (null)
136 key->anynullkeys = true;
137 }
138
139 /*
140 * In NULLS NOT DISTINCT mode, we pretend that there are no null keys, so
141 * that full uniqueness check is done.
142 */
143 if (rel->rd_index->indnullsnotdistinct)
144 key->anynullkeys = false;
145
146 return key;
147}
#define SK_BT_INDOPTION_SHIFT
Definition nbtree.h:1115
int16 * rd_indoption
Definition rel.h:211
Form_pg_index rd_index
Definition rel.h:192

References _bt_metaversion(), arg, Assert, BTORDER_PROC, BTreeTupleGetHeapTID(), BTreeTupleGetNAtts, fb(), i, index_getattr(), index_getprocinfo(), IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, InvalidOid, InvalidStrategy, Min, palloc(), RelationData::rd_indcollation, RelationData::rd_index, RelationData::rd_indoption, RelationGetDescr, ScanKeyEntryInitializeWithInfo(), SK_BT_INDOPTION_SHIFT, and SK_ISNULL.

Referenced by _bt_doinsert(), _bt_leafbuild(), _bt_pagedel(), bt_mkscankey_pivotsearch(), bt_rootdescend(), tuplesort_begin_cluster(), and tuplesort_begin_index_btree().

◆ _bt_next()

bool _bt_next ( IndexScanDesc  scan,
ScanDirection  dir 
)
extern

Definition at line 1586 of file nbtsearch.c.

1587{
1589
1590 Assert(BTScanPosIsValid(so->currPos));
1591
1592 /*
1593 * Advance to next tuple on current page; or if there's no more, try to
1594 * step to the next page with data.
1595 */
1596 if (ScanDirectionIsForward(dir))
1597 {
1598 if (++so->currPos.itemIndex > so->currPos.lastItem)
1599 {
1600 if (!_bt_steppage(scan, dir))
1601 return false;
1602 }
1603 }
1604 else
1605 {
1606 if (--so->currPos.itemIndex < so->currPos.firstItem)
1607 {
1608 if (!_bt_steppage(scan, dir))
1609 return false;
1610 }
1611 }
1612
1613 _bt_returnitem(scan, so);
1614 return true;
1615}
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition nbtsearch.c:1647

References _bt_returnitem(), _bt_steppage(), Assert, BTScanPosIsValid, fb(), IndexScanDescData::opaque, and ScanDirectionIsForward.

Referenced by btgetbitmap(), and btgettuple().

◆ _bt_pagedel()

void _bt_pagedel ( Relation  rel,
Buffer  leafbuf,
BTVacState vstate 
)
extern

Definition at line 1832 of file nbtpage.c.

1833{
1834 BlockNumber rightsib;
1835 bool rightsib_empty;
1836 Page page;
1837 BTPageOpaque opaque;
1838
1839 /*
1840 * Save original leafbuf block number from caller. Only deleted blocks
1841 * that are <= scanblkno are added to bulk delete stat's pages_deleted
1842 * count.
1843 */
1845
1846 /*
1847 * "stack" is a search stack leading (approximately) to the target page.
1848 * It is initially NULL, but when iterating, we keep it to avoid
1849 * duplicated search effort.
1850 *
1851 * Also, when "stack" is not NULL, we have already checked that the
1852 * current page is not the right half of an incomplete split, i.e. the
1853 * left sibling does not have its INCOMPLETE_SPLIT flag set, including
1854 * when the current target page is to the right of caller's initial page
1855 * (the scanblkno page).
1856 */
1857 BTStack stack = NULL;
1858
1859 for (;;)
1860 {
1861 page = BufferGetPage(leafbuf);
1862 opaque = BTPageGetOpaque(page);
1863
1864 /*
1865 * Internal pages are never deleted directly, only as part of deleting
1866 * the whole subtree all the way down to leaf level.
1867 *
1868 * Also check for deleted pages here. Caller never passes us a fully
1869 * deleted page. Only VACUUM can delete pages, so there can't have
1870 * been a concurrent deletion. Assume that we reached any deleted
1871 * page encountered here by following a sibling link, and that the
1872 * index is corrupt.
1873 */
1874 Assert(!P_ISDELETED(opaque));
1875 if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
1876 {
1877 /*
1878 * Pre-9.4 page deletion only marked internal pages as half-dead,
1879 * but now we only use that flag on leaf pages. The old algorithm
1880 * was never supposed to leave half-dead pages in the tree, it was
1881 * just a transient state, but it was nevertheless possible in
1882 * error scenarios. We don't know how to deal with them here. They
1883 * are harmless as far as searches are considered, but inserts
1884 * into the deleted keyspace could add out-of-order downlinks in
1885 * the upper levels. Log a notice, hopefully the admin will notice
1886 * and reindex.
1887 */
1888 if (P_ISHALFDEAD(opaque))
1889 ereport(LOG,
1891 errmsg("index \"%s\" contains a half-dead internal page",
1893 errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
1894
1895 if (P_ISDELETED(opaque))
1896 ereport(LOG,
1898 errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
1900 scanblkno,
1902
1903 _bt_relbuf(rel, leafbuf);
1904 return;
1905 }
1906
1907 /*
1908 * We can never delete rightmost pages nor root pages. While at it,
1909 * check that page is empty, since it's possible that the leafbuf page
1910 * was empty a moment ago, but has since had some inserts.
1911 *
1912 * To keep the algorithm simple, we also never delete an incompletely
1913 * split page (they should be rare enough that this doesn't make any
1914 * meaningful difference to disk usage):
1915 *
1916 * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
1917 * left half of an incomplete split, but ensuring that it's not the
1918 * right half is more complicated. For that, we have to check that
1919 * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
1920 * _bt_leftsib_splitflag(). On the first iteration, we temporarily
1921 * release the lock on scanblkno/leafbuf, check the left sibling, and
1922 * construct a search stack to scanblkno. On subsequent iterations,
1923 * we know we stepped right from a page that passed these tests, so
1924 * it's OK.
1925 */
1926 if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
1927 P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
1928 P_INCOMPLETE_SPLIT(opaque))
1929 {
1930 /* Should never fail to delete a half-dead page */
1931 Assert(!P_ISHALFDEAD(opaque));
1932
1933 _bt_relbuf(rel, leafbuf);
1934 return;
1935 }
1936
1937 /*
1938 * First, remove downlink pointing to the page (or a parent of the
1939 * page, if we are going to delete a taller subtree), and mark the
1940 * leafbuf page half-dead
1941 */
1942 if (!P_ISHALFDEAD(opaque))
1943 {
1944 /*
1945 * We need an approximate pointer to the page's parent page. We
1946 * use a variant of the standard search mechanism to search for
1947 * the page's high key; this will give us a link to either the
1948 * current parent or someplace to its left (if there are multiple
1949 * equal high keys, which is possible with !heapkeyspace indexes).
1950 *
1951 * Also check if this is the right-half of an incomplete split
1952 * (see comment above).
1953 */
1954 if (!stack)
1955 {
1956 BTScanInsert itup_key;
1957 ItemId itemid;
1959 BlockNumber leftsib,
1960 leafblkno;
1962
1963 itemid = PageGetItemId(page, P_HIKEY);
1965
1966 leftsib = opaque->btpo_prev;
1968
1969 /*
1970 * To avoid deadlocks, we'd better drop the leaf page lock
1971 * before going further.
1972 */
1973 _bt_unlockbuf(rel, leafbuf);
1974
1975 /*
1976 * Check that the left sibling of leafbuf (if any) is not
1977 * marked with INCOMPLETE_SPLIT flag before proceeding
1978 */
1980 if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
1981 {
1983 return;
1984 }
1985
1986 /*
1987 * We need an insertion scan key, so build one.
1988 *
1989 * _bt_search searches for the leaf page that contains any
1990 * matching non-pivot tuples, but we need it to "search" for
1991 * the high key pivot from the page that we're set to delete.
1992 * Compensate for the mismatch by having _bt_search locate the
1993 * last position < equal-to-untruncated-prefix non-pivots.
1994 */
1995 itup_key = _bt_mkscankey(rel, targetkey);
1996
1997 /* Set up a BTLessStrategyNumber-like insertion scan key */
1998 itup_key->nextkey = false;
1999 itup_key->backward = true;
2000 stack = _bt_search(rel, NULL, itup_key, &sleafbuf, BT_READ, true);
2001 /* won't need a second lock or pin on leafbuf */
2002 _bt_relbuf(rel, sleafbuf);
2003
2004 /*
2005 * Re-lock the leaf page, and start over to use our stack
2006 * within _bt_mark_page_halfdead. We must do it that way
2007 * because it's possible that leafbuf can no longer be
2008 * deleted. We need to recheck.
2009 *
2010 * Note: We can't simply hold on to the sleafbuf lock instead,
2011 * because it's barely possible that sleafbuf is not the same
2012 * page as leafbuf. This happens when leafbuf split after our
2013 * original lock was dropped, but before _bt_search finished
2014 * its descent. We rely on the assumption that we'll find
2015 * leafbuf isn't safe to delete anymore in this scenario.
2016 * (Page deletion can cope with the stack being to the left of
2017 * leafbuf, but not to the right of leafbuf.)
2018 */
2020 continue;
2021 }
2022
2023 /*
2024 * See if it's safe to delete the leaf page, and determine how
2025 * many parent/internal pages above the leaf level will be
2026 * deleted. If it's safe then _bt_mark_page_halfdead will also
2027 * perform the first phase of deletion, which includes marking the
2028 * leafbuf page half-dead.
2029 */
2030 Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
2031 if (!_bt_mark_page_halfdead(rel, vstate->info->heaprel, leafbuf,
2032 stack))
2033 {
2034 _bt_relbuf(rel, leafbuf);
2035 return;
2036 }
2037 }
2038 else
2039 {
2040 INJECTION_POINT("nbtree-finish-half-dead-page-vacuum", NULL);
2041 }
2042
2043 /*
2044 * Then unlink it from its siblings. Each call to
2045 * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
2046 * making it shallower. Iterate until the leafbuf page is deleted.
2047 */
2048 rightsib_empty = false;
2049 Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
2050 while (P_ISHALFDEAD(opaque))
2051 {
2052 /* Check for interrupts in _bt_unlink_halfdead_page */
2055 {
2056 /*
2057 * _bt_unlink_halfdead_page should never fail, since we
2058 * established that deletion is generally safe in
2059 * _bt_mark_page_halfdead -- index must be corrupt.
2060 *
2061 * Note that _bt_unlink_halfdead_page already released the
2062 * lock and pin on leafbuf for us.
2063 */
2064 Assert(false);
2065 return;
2066 }
2067 }
2068
2069 Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
2070
2071 rightsib = opaque->btpo_next;
2072
2073 _bt_relbuf(rel, leafbuf);
2074
2075 /*
2076 * Check here, as calling loops will have locks held, preventing
2077 * interrupts from being processed.
2078 */
2080
2081 /*
2082 * The page has now been deleted. If its right sibling is completely
2083 * empty, it's possible that the reason we haven't deleted it earlier
2084 * is that it was the rightmost child of the parent. Now that we
2085 * removed the downlink for this page, the right sibling might now be
2086 * the only child of the parent, and could be removed. It would be
2087 * picked up by the next vacuum anyway, but might as well try to
2088 * remove it now, so loop back to process the right sibling.
2089 *
2090 * Note: This relies on the assumption that _bt_getstackbuf() will be
2091 * able to reuse our original descent stack with a different child
2092 * block (provided that the child block is to the right of the
2093 * original leaf page reached by _bt_search()). It will even update
2094 * the descent stack each time we loop around, avoiding repeated work.
2095 */
2096 if (!rightsib_empty)
2097 break;
2098
2099 leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2100 }
2101}
#define LOG
Definition elog.h:32
IndexTuple CopyIndexTuple(IndexTuple source)
Definition indextuple.c:479
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:125
static bool _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
Definition nbtpage.c:1725
static bool _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf, BTStack stack)
Definition nbtpage.c:2122
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, bool *rightsib_empty, BTVacState *vstate)
Definition nbtpage.c:2349
#define P_ISHALFDEAD(opaque)
Definition nbtree.h:225
#define P_ISDELETED(opaque)
Definition nbtree.h:223
#define P_ISROOT(opaque)
Definition nbtree.h:222
BlockNumber btpo_prev
Definition nbtree.h:65

References _bt_getbuf(), _bt_leftsib_splitflag(), _bt_lockbuf(), _bt_mark_page_halfdead(), _bt_mkscankey(), _bt_relbuf(), _bt_search(), _bt_unlink_halfdead_page(), _bt_unlockbuf(), Assert, BTScanInsertData::backward, BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BufferGetBlockNumber(), BufferGetPage(), CHECK_FOR_INTERRUPTS, CopyIndexTuple(), ereport, errcode(), errhint(), errmsg, errmsg_internal(), fb(), INJECTION_POINT, LOG, BTScanInsertData::nextkey, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, P_ISROOT, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), RelationGetRelationName, and ReleaseBuffer().

Referenced by btvacuumpage().

◆ _bt_pageinit()

void _bt_pageinit ( Page  page,
Size  size 
)
extern

Definition at line 1157 of file nbtpage.c.

1158{
1159 PageInit(page, size, sizeof(BTPageOpaqueData));
1160}
void PageInit(Page page, Size pageSize, Size specialSize)
Definition bufpage.c:42

References PageInit().

Referenced by _bt_allocbuf(), _bt_blnewpage(), _bt_initmetapage(), _bt_restore_meta(), _bt_split(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), and btree_xlog_unlink_page().

◆ _bt_parallel_build_main()

void _bt_parallel_build_main ( dsm_segment seg,
shm_toc toc 
)
extern

Definition at line 1744 of file nbtsort.c.

1745{
1746 char *sharedquery;
1749 BTShared *btshared;
1750 Sharedsort *sharedsort;
1751 Sharedsort *sharedsort2;
1752 Relation heapRel;
1753 Relation indexRel;
1756 WalUsage *walusage;
1757 BufferUsage *bufferusage;
1758 int sortmem;
1759
1760#ifdef BTREE_BUILD_STATS
1762 ResetUsage();
1763#endif /* BTREE_BUILD_STATS */
1764
1765 /*
1766 * The only possible status flag that can be set to the parallel worker is
1767 * PROC_IN_SAFE_IC.
1768 */
1769 Assert((MyProc->statusFlags == 0) ||
1771
1772 /* Set debug_query_string for individual workers first */
1775
1776 /* Report the query string from leader */
1778
1779 /* Look up nbtree shared state */
1780 btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
1781
1782 /* Open relations using lock modes known to be obtained by index.c */
1783 if (!btshared->isconcurrent)
1784 {
1787 }
1788 else
1789 {
1792 }
1793
1794 /* Track query ID */
1795 pgstat_report_query_id(btshared->queryid, false);
1796
1797 /* Open relations within worker */
1798 heapRel = table_open(btshared->heaprelid, heapLockmode);
1799 indexRel = index_open(btshared->indexrelid, indexLockmode);
1800
1801 /* Initialize worker's own spool */
1803 btspool->heap = heapRel;
1804 btspool->index = indexRel;
1805 btspool->isunique = btshared->isunique;
1806 btspool->nulls_not_distinct = btshared->nulls_not_distinct;
1807
1808 /* Look up shared state private to tuplesort.c */
1809 sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
1810 tuplesort_attach_shared(sharedsort, seg);
1811 if (!btshared->isunique)
1812 {
1813 btspool2 = NULL;
1814 sharedsort2 = NULL;
1815 }
1816 else
1817 {
1818 /* Allocate memory for worker's own private secondary spool */
1820
1821 /* Initialize worker's own secondary spool */
1822 btspool2->heap = btspool->heap;
1823 btspool2->index = btspool->index;
1824 btspool2->isunique = false;
1825 /* Look up shared state private to tuplesort.c */
1826 sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
1827 tuplesort_attach_shared(sharedsort2, seg);
1828 }
1829
1830 /* Prepare to track buffer usage during parallel execution */
1832
1833 /* Perform sorting of spool, and possibly a spool2 */
1835 _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
1836 sharedsort2, sortmem, false);
1837
1838 /* Report WAL/buffer usage during parallel execution */
1839 bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
1840 walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
1842 &walusage[ParallelWorkerNumber]);
1843
1844#ifdef BTREE_BUILD_STATS
1846 {
1847 ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
1848 ResetUsage();
1849 }
1850#endif /* BTREE_BUILD_STATS */
1851
1852 index_close(indexRel, indexLockmode);
1853 table_close(heapRel, heapLockmode);
1854}
int ParallelWorkerNumber
Definition parallel.c:117
void pgstat_report_query_id(int64 query_id, bool force)
void pgstat_report_activity(BackendState state, const char *cmd_str)
@ STATE_RUNNING
#define palloc0_object(type)
Definition fe_memutils.h:75
int maintenance_work_mem
Definition globals.c:135
bool log_btree_build_stats
Definition guc_tables.c:553
void index_close(Relation relation, LOCKMODE lockmode)
Definition indexam.c:178
Relation index_open(Oid relationId, LOCKMODE lockmode)
Definition indexam.c:134
void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition instrument.c:287
void InstrStartParallelQuery(void)
Definition instrument.c:279
int LOCKMODE
Definition lockdefs.h:26
#define AccessExclusiveLock
Definition lockdefs.h:43
#define ShareUpdateExclusiveLock
Definition lockdefs.h:39
#define ShareLock
Definition lockdefs.h:40
#define RowExclusiveLock
Definition lockdefs.h:38
#define PARALLEL_KEY_BUFFER_USAGE
Definition nbtsort.c:70
#define PARALLEL_KEY_TUPLESORT_SPOOL2
Definition nbtsort.c:67
static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, BTShared *btshared, Sharedsort *sharedsort, Sharedsort *sharedsort2, int sortmem, bool progress)
Definition nbtsort.c:1869
#define PARALLEL_KEY_BTREE_SHARED
Definition nbtsort.c:65
#define PARALLEL_KEY_TUPLESORT
Definition nbtsort.c:66
#define PARALLEL_KEY_QUERY_TEXT
Definition nbtsort.c:68
#define PARALLEL_KEY_WAL_USAGE
Definition nbtsort.c:69
const char * debug_query_string
Definition postgres.c:94
void ShowUsage(const char *title)
Definition postgres.c:5137
void ResetUsage(void)
Definition postgres.c:5130
#define PROC_IN_SAFE_IC
Definition proc.h:63
void * shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
Definition shm_toc.c:239
PGPROC * MyProc
Definition proc.c:71
bool isconcurrent
Definition nbtsort.c:108
Oid heaprelid
Definition nbtsort.c:104
int64 queryid
Definition nbtsort.c:112
bool isunique
Definition nbtsort.c:106
int scantuplesortstates
Definition nbtsort.c:109
Oid indexrelid
Definition nbtsort.c:105
bool nulls_not_distinct
Definition nbtsort.c:107
uint8 statusFlags
Definition proc.h:210
void table_close(Relation relation, LOCKMODE lockmode)
Definition table.c:126
Relation table_open(Oid relationId, LOCKMODE lockmode)
Definition table.c:40
void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg)
Definition tuplesort.c:3269

References _bt_parallel_scan_and_sort(), AccessExclusiveLock, Assert, debug_query_string, fb(), BTShared::heaprelid, index_close(), index_open(), BTShared::indexrelid, InstrEndParallelQuery(), InstrStartParallelQuery(), BTShared::isconcurrent, BTShared::isunique, log_btree_build_stats, maintenance_work_mem, MyProc, BTShared::nulls_not_distinct, palloc0_object, PARALLEL_KEY_BTREE_SHARED, PARALLEL_KEY_BUFFER_USAGE, PARALLEL_KEY_QUERY_TEXT, PARALLEL_KEY_TUPLESORT, PARALLEL_KEY_TUPLESORT_SPOOL2, PARALLEL_KEY_WAL_USAGE, ParallelWorkerNumber, pgstat_report_activity(), pgstat_report_query_id(), PROC_IN_SAFE_IC, BTShared::queryid, ResetUsage(), RowExclusiveLock, BTShared::scantuplesortstates, ShareLock, ShareUpdateExclusiveLock, shm_toc_lookup(), ShowUsage(), STATE_RUNNING, PGPROC::statusFlags, table_close(), table_open(), and tuplesort_attach_shared().

◆ _bt_parallel_done()

void _bt_parallel_done ( IndexScanDesc  scan)
extern

Definition at line 1035 of file nbtree.c.

1036{
1038 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1040 bool status_changed = false;
1041
1042 Assert(!BTScanPosIsValid(so->currPos));
1043
1044 /* Do nothing, for non-parallel scans */
1045 if (parallel_scan == NULL)
1046 return;
1047
1048 /*
1049 * Should not mark parallel scan done when there's still a pending
1050 * primitive index scan
1051 */
1052 if (so->needPrimScan)
1053 return;
1054
1055 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1056 parallel_scan->ps_offset_am);
1057
1058 /*
1059 * Mark the parallel scan as done, unless some other process did so
1060 * already
1061 */
1062 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1063 Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
1064 if (btscan->btps_pageStatus != BTPARALLEL_DONE)
1065 {
1066 btscan->btps_pageStatus = BTPARALLEL_DONE;
1067 status_changed = true;
1068 }
1069 LWLockRelease(&btscan->btps_lock);
1070
1071 /* wake up all the workers associated with this parallel scan */
1072 if (status_changed)
1074}
#define OffsetToPointer(base, offset)
Definition c.h:855
void ConditionVariableBroadcast(ConditionVariable *cv)
@ BTPARALLEL_NEED_PRIMSCAN
Definition nbtree.c:59
@ BTPARALLEL_DONE
Definition nbtree.c:62
struct BTParallelScanDescData * BTParallelScanDesc
Definition nbtree.c:95

References Assert, BTPARALLEL_DONE, BTPARALLEL_NEED_PRIMSCAN, BTScanPosIsValid, ConditionVariableBroadcast(), fb(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), OffsetToPointer, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, and ParallelIndexScanDescData::ps_offset_am.

Referenced by _bt_endpoint(), _bt_first(), _bt_parallel_seize(), _bt_readnextpage(), and _bt_start_prim_scan().

◆ _bt_parallel_primscan_schedule()

void _bt_parallel_primscan_schedule ( IndexScanDesc  scan,
BlockNumber  curr_page 
)
extern

Definition at line 1085 of file nbtree.c.

1086{
1087 Relation rel = scan->indexRelation;
1089 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1091
1092 Assert(so->numArrayKeys);
1093
1094 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1095 parallel_scan->ps_offset_am);
1096
1097 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1098 if (btscan->btps_lastCurrPage == curr_page &&
1099 btscan->btps_pageStatus == BTPARALLEL_IDLE)
1100 {
1101 btscan->btps_nextScanPage = InvalidBlockNumber;
1102 btscan->btps_lastCurrPage = InvalidBlockNumber;
1103 btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
1104
1105 /* Serialize scan's current array keys */
1107 }
1108 LWLockRelease(&btscan->btps_lock);
1109}
@ BTPARALLEL_IDLE
Definition nbtree.c:61
static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
Definition nbtree.c:717

References _bt_parallel_serialize_arrays(), Assert, BTPARALLEL_IDLE, BTPARALLEL_NEED_PRIMSCAN, fb(), IndexScanDescData::indexRelation, InvalidBlockNumber, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), OffsetToPointer, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, and ParallelIndexScanDescData::ps_offset_am.

Referenced by _bt_advance_array_keys(), and _bt_readpage().

◆ _bt_parallel_release()

void _bt_parallel_release ( IndexScanDesc  scan,
BlockNumber  next_scan_page,
BlockNumber  curr_page 
)
extern

Definition at line 1008 of file nbtree.c.

1010{
1011 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1013
1015
1016 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1017 parallel_scan->ps_offset_am);
1018
1019 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1020 btscan->btps_nextScanPage = next_scan_page;
1021 btscan->btps_lastCurrPage = curr_page;
1022 btscan->btps_pageStatus = BTPARALLEL_IDLE;
1023 LWLockRelease(&btscan->btps_lock);
1025}
void ConditionVariableSignal(ConditionVariable *cv)

References Assert, BlockNumberIsValid(), BTPARALLEL_IDLE, ConditionVariableSignal(), fb(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), OffsetToPointer, IndexScanDescData::parallel_scan, and ParallelIndexScanDescData::ps_offset_am.

Referenced by _bt_readnextpage(), and _bt_readpage().

◆ _bt_parallel_seize()

bool _bt_parallel_seize ( IndexScanDesc  scan,
BlockNumber next_scan_page,
BlockNumber last_curr_page,
bool  first 
)
extern

Definition at line 870 of file nbtree.c.

872{
873 Relation rel = scan->indexRelation;
875 bool exit_loop = false,
876 status = true,
877 endscan = false;
878 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
880
883
884 /*
885 * Reset so->currPos, and initialize moreLeft/moreRight such that the next
886 * call to _bt_readnextpage treats this backend similarly to a serial
887 * backend that steps from *last_curr_page to *next_scan_page (unless this
888 * backend's so->currPos is initialized by _bt_readfirstpage before then).
889 */
890 BTScanPosInvalidate(so->currPos);
891 so->currPos.moreLeft = so->currPos.moreRight = true;
892
893 if (first)
894 {
895 /*
896 * Initialize array related state when called from _bt_first, assuming
897 * that this will be the first primitive index scan for the scan
898 */
899 so->needPrimScan = false;
900 so->scanBehind = false;
901 so->oppositeDirCheck = false;
902 }
903 else
904 {
905 /*
906 * Don't attempt to seize the scan when it requires another primitive
907 * index scan, since caller's backend cannot start it right now
908 */
909 if (so->needPrimScan)
910 return false;
911 }
912
913 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
914 parallel_scan->ps_offset_am);
915
916 while (1)
917 {
918 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
919
920 if (btscan->btps_pageStatus == BTPARALLEL_DONE)
921 {
922 /* We're done with this parallel index scan */
923 status = false;
924 }
925 else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
926 btscan->btps_nextScanPage == P_NONE)
927 {
928 /* End this parallel index scan */
929 status = false;
930 endscan = true;
931 }
932 else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
933 {
934 Assert(so->numArrayKeys);
935
936 if (first)
937 {
938 /* Can start scheduled primitive scan right away, so do so */
939 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
940
941 /* Restore scan's array keys from serialized values */
943 exit_loop = true;
944 }
945 else
946 {
947 /*
948 * Don't attempt to seize the scan when it requires another
949 * primitive index scan, since caller's backend cannot start
950 * it right now
951 */
952 status = false;
953 }
954
955 /*
956 * Either way, update backend local state to indicate that a
957 * pending primitive scan is required
958 */
959 so->needPrimScan = true;
960 so->scanBehind = false;
961 so->oppositeDirCheck = false;
962 }
963 else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
964 {
965 /*
966 * We have successfully seized control of the scan for the purpose
967 * of advancing it to a new page!
968 */
969 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
970 Assert(btscan->btps_nextScanPage != P_NONE);
971 *next_scan_page = btscan->btps_nextScanPage;
972 *last_curr_page = btscan->btps_lastCurrPage;
973 exit_loop = true;
974 }
975 LWLockRelease(&btscan->btps_lock);
976 if (exit_loop || !status)
977 break;
979 }
981
982 /* When the scan has reached the rightmost (or leftmost) page, end it */
983 if (endscan)
984 _bt_parallel_done(scan);
985
986 return status;
987}
bool ConditionVariableCancelSleep(void)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
@ BTPARALLEL_ADVANCING
Definition nbtree.c:60
static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
Definition nbtree.c:760
#define BTScanPosInvalidate(scanpos)
Definition nbtree.h:1027

References _bt_parallel_done(), _bt_parallel_restore_arrays(), Assert, BTPARALLEL_ADVANCING, BTPARALLEL_DONE, BTPARALLEL_IDLE, BTPARALLEL_NEED_PRIMSCAN, BTScanPosInvalidate, ConditionVariableCancelSleep(), ConditionVariableSleep(), fb(), IndexScanDescData::indexRelation, InvalidBlockNumber, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), OffsetToPointer, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, and ParallelIndexScanDescData::ps_offset_am.

Referenced by _bt_first(), and _bt_readnextpage().

◆ _bt_pendingfsm_finalize()

void _bt_pendingfsm_finalize ( Relation  rel,
BTVacState vstate 
)
extern

Definition at line 3033 of file nbtpage.c.

3034{
3035 IndexBulkDeleteResult *stats = vstate->stats;
3036 Relation heaprel = vstate->info->heaprel;
3037
3038 Assert(stats->pages_newly_deleted >= vstate->npendingpages);
3039 Assert(heaprel != NULL);
3040
3041 if (vstate->npendingpages == 0)
3042 {
3043 /* Just free memory when nothing to do */
3044 if (vstate->pendingpages)
3045 pfree(vstate->pendingpages);
3046
3047 return;
3048 }
3049
3050#ifdef DEBUG_BTREE_PENDING_FSM
3051
3052 /*
3053 * Debugging aid: Sleep for 5 seconds to greatly increase the chances of
3054 * placing pending pages in the FSM. Note that the optimization will
3055 * never be effective without some other backend concurrently consuming an
3056 * XID.
3057 */
3058 pg_usleep(5000000L);
3059#endif
3060
3061 /*
3062 * Recompute VACUUM XID boundaries.
3063 *
3064 * We don't actually care about the oldest non-removable XID. Computing
3065 * the oldest such XID has a useful side-effect that we rely on: it
3066 * forcibly updates the XID horizon state for this backend. This step is
3067 * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize
3068 * that it is now safe to recycle newly deleted pages without this step.
3069 */
3071
3072 for (int i = 0; i < vstate->npendingpages; i++)
3073 {
3074 BlockNumber target = vstate->pendingpages[i].target;
3075 FullTransactionId safexid = vstate->pendingpages[i].safexid;
3076
3077 /*
3078 * Do the equivalent of checking BTPageIsRecyclable(), but without
3079 * accessing the page again a second time.
3080 *
3081 * Give up on finding the first non-recyclable page -- all later pages
3082 * must be non-recyclable too, since _bt_pendingfsm_add() adds pages
3083 * to the array in safexid order.
3084 */
3085 if (!GlobalVisCheckRemovableFullXid(heaprel, safexid))
3086 break;
3087
3088 RecordFreeIndexPage(rel, target);
3089 stats->pages_free++;
3090 }
3091
3092 pfree(vstate->pendingpages);
3093}
void RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
Definition indexfsm.c:52
TransactionId GetOldestNonRemovableTransactionId(Relation rel)
Definition procarray.c:1944
bool GlobalVisCheckRemovableFullXid(Relation rel, FullTransactionId fxid)
Definition procarray.c:4339
void pg_usleep(long microsec)
Definition signal.c:53
BlockNumber pages_newly_deleted
Definition genam.h:89
BlockNumber pages_free
Definition genam.h:91

References Assert, fb(), GetOldestNonRemovableTransactionId(), GlobalVisCheckRemovableFullXid(), i, IndexBulkDeleteResult::pages_free, IndexBulkDeleteResult::pages_newly_deleted, pfree(), pg_usleep(), and RecordFreeIndexPage().

Referenced by btvacuumscan().

◆ _bt_pendingfsm_init()

void _bt_pendingfsm_init ( Relation  rel,
BTVacState vstate,
bool  cleanuponly 
)
extern

Definition at line 2991 of file nbtpage.c.

2992{
2993 Size maxbufsize;
2994
2995 /*
2996 * Don't bother with optimization in cleanup-only case -- we don't expect
2997 * any newly deleted pages. Besides, cleanup-only calls to btvacuumscan()
2998 * can only take place because this optimization didn't work out during
2999 * the last VACUUM.
3000 */
3001 if (cleanuponly)
3002 return;
3003
3004 /*
3005 * Cap maximum size of array so that we always respect work_mem. Avoid
3006 * int overflow here.
3007 */
3008 vstate->bufsize = 256;
3009 maxbufsize = (work_mem * (Size) 1024) / sizeof(BTPendingFSM);
3010 maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM));
3011 /* BTVacState.maxbufsize has type int */
3012 maxbufsize = Min(maxbufsize, INT_MAX);
3013 /* Stay sane with small work_mem */
3014 maxbufsize = Max(maxbufsize, vstate->bufsize);
3015 vstate->maxbufsize = (int) maxbufsize;
3016
3017 /* Allocate buffer, indicate that there are currently 0 pending pages */
3018 vstate->pendingpages = palloc_array(BTPendingFSM, vstate->bufsize);
3019 vstate->npendingpages = 0;
3020}
#define MaxAllocSize
Definition fe_memutils.h:22
int work_mem
Definition globals.c:133

References fb(), Max, MaxAllocSize, Min, palloc_array, and work_mem.

Referenced by btvacuumscan().

◆ _bt_preprocess_keys()

void _bt_preprocess_keys ( IndexScanDesc  scan)
extern

Definition at line 203 of file nbtpreprocesskeys.c.

204{
206 int numberOfKeys = scan->numberOfKeys;
212 bool test_result,
213 redundant_key_kept = false;
214 AttrNumber attno;
216 int *keyDataMap = NULL;
217 int arrayidx = 0;
218
219 if (so->numberOfKeys > 0)
220 {
221 /*
222 * Only need to do preprocessing once per btrescan, at most. All
223 * calls after the first are handled as no-ops.
224 */
225 return;
226 }
227
228 /* initialize result variables */
229 so->qual_ok = true;
230 so->numberOfKeys = 0;
231
232 if (numberOfKeys < 1)
233 return; /* done if qual-less scan */
234
235 /* If any keys are SK_SEARCHARRAY type, set up array-key info */
236 arrayKeyData = _bt_preprocess_array_keys(scan, &numberOfKeys);
237 if (!so->qual_ok)
238 {
239 /* unmatchable array, so give up */
240 return;
241 }
242
243 /*
244 * Treat arrayKeyData[] (a partially preprocessed copy of scan->keyData[])
245 * as our input if _bt_preprocess_array_keys just allocated it, else just
246 * use scan->keyData[]
247 */
248 if (arrayKeyData)
249 {
251
252 /* Also maintain keyDataMap for remapping so->orderProcs[] later */
253 keyDataMap = MemoryContextAlloc(so->arrayContext,
254 numberOfKeys * sizeof(int));
255
256 /*
257 * Also enlarge output array when it might otherwise not have room for
258 * a skip array's scan key
259 */
260 if (numberOfKeys > scan->numberOfKeys)
261 so->keyData = repalloc(so->keyData,
262 numberOfKeys * sizeof(ScanKeyData));
263 }
264 else
265 inkeys = scan->keyData;
266
267 /* we check that input keys are correctly ordered */
268 if (inkeys[0].sk_attno < 1)
269 elog(ERROR, "btree index keys must be ordered by attribute");
270
271 /* We can short-circuit most of the work if there's just one key */
272 if (numberOfKeys == 1)
273 {
274 /* Apply indoption to scankey (might change sk_strategy!) */
276 so->qual_ok = false;
277 memcpy(&so->keyData[0], &inkeys[0], sizeof(ScanKeyData));
278 so->numberOfKeys = 1;
279 /* We can mark the qual as required if it's for first index col */
280 if (inkeys[0].sk_attno == 1)
281 _bt_mark_scankey_required(&so->keyData[0]);
282 if (arrayKeyData)
283 {
284 /*
285 * Don't call _bt_preprocess_array_keys_final in this fast path
286 * (we'll miss out on the single value array transformation, but
287 * that's not nearly as important when there's only one scan key)
288 */
289 Assert(so->keyData[0].sk_flags & SK_SEARCHARRAY);
290 Assert(so->keyData[0].sk_strategy != BTEqualStrategyNumber ||
291 (so->arrayKeys[0].scan_key == 0 &&
292 !(so->keyData[0].sk_flags & SK_BT_SKIP) &&
293 OidIsValid(so->orderProcs[0].fn_oid)));
294 }
295
296 return;
297 }
298
299 /*
300 * Otherwise, do the full set of pushups.
301 */
304
305 /*
306 * Initialize for processing of keys for attr 1.
307 *
308 * xform[i] points to the currently best scan key of strategy type i+1; it
309 * is NULL if we haven't yet found such a key for this attr.
310 */
311 attno = 1;
312 memset(xform, 0, sizeof(xform));
313
314 /*
315 * Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to
316 * handle after-last-key processing. Actual exit from the loop is at the
317 * "break" statement below.
318 */
319 for (int i = 0;; i++)
320 {
321 ScanKey inkey = inkeys + i;
322 int j;
323
324 if (i < numberOfKeys)
325 {
326 /* Apply indoption to scankey (might change sk_strategy!) */
328 {
329 /* NULL can't be matched, so give up */
330 so->qual_ok = false;
331 return;
332 }
333 }
334
335 /*
336 * If we are at the end of the keys for a particular attr, finish up
337 * processing and emit the cleaned-up keys.
338 */
339 if (i == numberOfKeys || inkey->sk_attno != attno)
340 {
342
343 /* check input keys are correctly ordered */
344 if (i < numberOfKeys && inkey->sk_attno < attno)
345 elog(ERROR, "btree index keys must be ordered by attribute");
346
347 /*
348 * If = has been specified, all other keys can be eliminated as
349 * redundant. Note that this is no less true if the = key is
350 * SEARCHARRAY; the only real difference is that the inequality
351 * key _becomes_ redundant by making _bt_compare_scankey_args
352 * eliminate the subset of elements that won't need to be matched
353 * (with SAOP arrays and skip arrays alike).
354 *
355 * If we have a case like "key = 1 AND key > 2", we set qual_ok to
356 * false and abandon further processing. We'll do the same thing
357 * given a case like "key IN (0, 1) AND key > 2".
358 *
359 * We also have to deal with the case of "key IS NULL", which is
360 * unsatisfiable in combination with any other index condition. By
361 * the time we get here, that's been classified as an equality
362 * check, and we've rejected any combination of it with a regular
363 * equality condition; but not with other types of conditions.
364 */
365 if (xform[BTEqualStrategyNumber - 1].inkey)
366 {
368 BTArrayKeyInfo *array = NULL;
370
371 if (arrayKeyData && (eq->sk_flags & SK_SEARCHARRAY))
372 {
373 int eq_in_ikey,
375
378 array = &so->arrayKeys[eq_arrayidx - 1];
379 orderproc = so->orderProcs + eq_in_ikey;
380
381 Assert(array->scan_key == eq_in_ikey);
382 Assert(OidIsValid(orderproc->fn_oid));
383 }
384
385 for (j = BTMaxStrategyNumber; --j >= 0;)
386 {
387 ScanKey chk = xform[j].inkey;
388
389 if (!chk || j == (BTEqualStrategyNumber - 1))
390 continue;
391
392 if (eq->sk_flags & SK_SEARCHNULL)
393 {
394 /* IS NULL is contradictory to anything else */
395 so->qual_ok = false;
396 return;
397 }
398
400 array, orderproc,
401 &test_result))
402 {
403 if (!test_result)
404 {
405 /* keys proven mutually contradictory */
406 so->qual_ok = false;
407 return;
408 }
409 /* else discard the redundant non-equality key */
410 xform[j].inkey = NULL;
411 xform[j].inkeyi = -1;
412 }
413 else
414 redundant_key_kept = true;
415 }
416 /* track number of attrs for which we have "=" keys */
418 }
419
420 /* try to keep only one of <, <= */
421 if (xform[BTLessStrategyNumber - 1].inkey &&
423 {
424 ScanKey lt = xform[BTLessStrategyNumber - 1].inkey;
426
427 if (_bt_compare_scankey_args(scan, le, lt, le, NULL, NULL,
428 &test_result))
429 {
430 if (test_result)
432 else
433 xform[BTLessStrategyNumber - 1].inkey = NULL;
434 }
435 else
436 redundant_key_kept = true;
437 }
438
439 /* try to keep only one of >, >= */
440 if (xform[BTGreaterStrategyNumber - 1].inkey &&
442 {
445
447 &test_result))
448 {
449 if (test_result)
451 else
453 }
454 else
455 redundant_key_kept = true;
456 }
457
458 /*
459 * Emit the cleaned-up keys into the so->keyData[] array, and then
460 * mark them if they are required. They are required (possibly
461 * only in one direction) if all attrs before this one had "=".
462 *
463 * In practice we'll rarely output non-required scan keys here;
464 * typically, _bt_preprocess_array_keys has already added "=" keys
465 * sufficient to form an unbroken series of "=" constraints on all
466 * attrs prior to the attr from the final scan->keyData[] key.
467 */
468 for (j = BTMaxStrategyNumber; --j >= 0;)
469 {
470 if (xform[j].inkey)
471 {
472 ScanKey outkey = &so->keyData[new_numberOfKeys++];
473
474 memcpy(outkey, xform[j].inkey, sizeof(ScanKeyData));
475 if (arrayKeyData)
476 keyDataMap[new_numberOfKeys - 1] = xform[j].inkeyi;
477 if (priorNumberOfEqualCols == attno - 1)
479 }
480 }
481
482 /*
483 * Exit loop here if done.
484 */
485 if (i == numberOfKeys)
486 break;
487
488 /* Re-initialize for new attno */
489 attno = inkey->sk_attno;
490 memset(xform, 0, sizeof(xform));
491 }
492
493 /* check strategy this key's operator corresponds to */
494 j = inkey->sk_strategy - 1;
495
496 if (inkey->sk_strategy == BTEqualStrategyNumber &&
497 (inkey->sk_flags & SK_SEARCHARRAY))
498 {
499 /* must track how input scan keys map to arrays */
501 arrayidx++;
502 }
503
504 /*
505 * have we seen a scan key for this same attribute and using this same
506 * operator strategy before now?
507 */
508 if (xform[j].inkey == NULL)
509 {
510 /* nope, so this scan key wins by default (at least for now) */
511 xform[j].inkey = inkey;
512 xform[j].inkeyi = i;
513 xform[j].arrayidx = arrayidx;
514 }
515 else
516 {
518 BTArrayKeyInfo *array = NULL;
519
520 /*
521 * Seen one of these before, so keep only the more restrictive key
522 * if possible
523 */
524 if (j == (BTEqualStrategyNumber - 1) && arrayKeyData)
525 {
526 /*
527 * Have to set up array keys
528 */
529 if (inkey->sk_flags & SK_SEARCHARRAY)
530 {
531 array = &so->arrayKeys[arrayidx - 1];
532 orderproc = so->orderProcs + i;
533
534 Assert(array->scan_key == i);
535 Assert(OidIsValid(orderproc->fn_oid));
536 Assert(!(inkey->sk_flags & SK_BT_SKIP));
537 }
538 else if (xform[j].inkey->sk_flags & SK_SEARCHARRAY)
539 {
540 array = &so->arrayKeys[xform[j].arrayidx - 1];
541 orderproc = so->orderProcs + xform[j].inkeyi;
542
543 Assert(array->scan_key == xform[j].inkeyi);
544 Assert(OidIsValid(orderproc->fn_oid));
545 Assert(!(xform[j].inkey->sk_flags & SK_BT_SKIP));
546 }
547
548 /*
549 * Both scan keys might have arrays, in which case we'll
550 * arbitrarily pass only one of the arrays. That won't
551 * matter, since _bt_compare_scankey_args is aware that two
552 * SEARCHARRAY scan keys mean that _bt_preprocess_array_keys
553 * failed to eliminate redundant arrays through array merging.
554 * _bt_compare_scankey_args just returns false when it sees
555 * this; it won't even try to examine either array.
556 */
557 }
558
559 if (_bt_compare_scankey_args(scan, inkey, inkey, xform[j].inkey,
560 array, orderproc, &test_result))
561 {
562 /* Have all we need to determine redundancy */
563 if (test_result)
564 {
565 /*
566 * New key is more restrictive, and so replaces old key...
567 */
568 if (j != (BTEqualStrategyNumber - 1) ||
569 !(xform[j].inkey->sk_flags & SK_SEARCHARRAY))
570 {
571 xform[j].inkey = inkey;
572 xform[j].inkeyi = i;
573 xform[j].arrayidx = arrayidx;
574 }
575 else
576 {
577 /*
578 * ...unless we have to keep the old key because it's
579 * an array that rendered the new key redundant. We
580 * need to make sure that we don't throw away an array
581 * scan key. _bt_preprocess_array_keys_final expects
582 * us to keep all of the arrays that weren't already
583 * eliminated by _bt_preprocess_array_keys earlier on.
584 */
585 Assert(!(inkey->sk_flags & SK_SEARCHARRAY));
586 }
587 }
588 else if (j == (BTEqualStrategyNumber - 1))
589 {
590 /* key == a && key == b, but a != b */
591 so->qual_ok = false;
592 return;
593 }
594 /* else old key is more restrictive, keep it */
595 }
596 else
597 {
598 /*
599 * We can't determine which key is more restrictive. Push
600 * xform[j] directly to the output array, then set xform[j] to
601 * the new scan key.
602 *
603 * Note: We do things this way around so that our arrays are
604 * always in the same order as their corresponding scan keys.
605 * _bt_preprocess_array_keys_final expects this.
606 */
607 ScanKey outkey = &so->keyData[new_numberOfKeys++];
608
609 memcpy(outkey, xform[j].inkey, sizeof(ScanKeyData));
610 if (arrayKeyData)
611 keyDataMap[new_numberOfKeys - 1] = xform[j].inkeyi;
612 if (numberOfEqualCols == attno - 1)
614 xform[j].inkey = inkey;
615 xform[j].inkeyi = i;
616 xform[j].arrayidx = arrayidx;
617 redundant_key_kept = true;
618 }
619 }
620 }
621
622 so->numberOfKeys = new_numberOfKeys;
623
624 /*
625 * Now that we've built a temporary mapping from so->keyData[] (output
626 * scan keys) to arrayKeyData[] (our input scan keys), fix array->scan_key
627 * references. Also consolidate the so->orderProcs[] array such that it
628 * can be subscripted using so->keyData[]-wise offsets.
629 */
630 if (arrayKeyData)
632
633 /*
634 * If there are remaining redundant inequality keys, we must make sure
635 * that each index attribute has no more than one required >/>= key, and
636 * no more than one required </<= key. Attributes that have one or more
637 * required = keys now must keep only one required key (the first = key).
638 */
639 if (unlikely(redundant_key_kept) && so->qual_ok)
641
642 /* Could pfree arrayKeyData/keyDataMap now, but not worth the cycles */
643}
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
static void _bt_unmark_keys(IndexScanDesc scan, int *keyDataMap)
static void _bt_mark_scankey_required(ScanKey skey)
static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys)
static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap)
static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, ScanKey leftarg, ScanKey rightarg, BTArrayKeyInfo *array, FmgrInfo *orderproc, bool *result)
#define SK_SEARCHNULL
Definition skey.h:121
#define BTMaxStrategyNumber
Definition stratnum.h:35
struct ScanKeyData * keyData
Definition relscan.h:154
AttrNumber sk_attno
Definition skey.h:67

References _bt_compare_scankey_args(), _bt_fix_scankey_strategy(), _bt_mark_scankey_required(), _bt_preprocess_array_keys(), _bt_preprocess_array_keys_final(), _bt_unmark_keys(), Assert, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTMaxStrategyNumber, elog, ERROR, fb(), i, IndexScanDescData::indexRelation, j, IndexScanDescData::keyData, memcpy(), MemoryContextAlloc(), IndexScanDescData::numberOfKeys, OidIsValid, IndexScanDescData::opaque, RelationData::rd_indoption, repalloc(), BTArrayKeyInfo::scan_key, ScanKeyData::sk_attno, SK_BT_SKIP, ScanKeyData::sk_flags, SK_SEARCHARRAY, SK_SEARCHNULL, ScanKeyData::sk_strategy, and unlikely.

Referenced by _bt_first().

◆ _bt_readpage()

bool _bt_readpage ( IndexScanDesc  scan,
ScanDirection  dir,
OffsetNumber  offnum,
bool  firstpage 
)
extern

Definition at line 134 of file nbtreadpage.c.

136{
137 Relation rel = scan->indexRelation;
139 Page page;
140 BTPageOpaque opaque;
141 OffsetNumber minoff;
142 OffsetNumber maxoff;
143 BTReadPageState pstate;
144 bool arrayKeys,
145 ignore_killed_tuples = scan->ignore_killed_tuples;
146 int itemIndex,
147 indnatts;
148
149 /* save the page/buffer block number, along with its sibling links */
150 page = BufferGetPage(so->currPos.buf);
151 opaque = BTPageGetOpaque(page);
152 so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
153 so->currPos.prevPage = opaque->btpo_prev;
154 so->currPos.nextPage = opaque->btpo_next;
155 /* delay setting so->currPos.lsn until _bt_drop_lock_and_maybe_pin */
156 pstate.dir = so->currPos.dir = dir;
157 so->currPos.nextTupleOffset = 0;
158
159 /* either moreRight or moreLeft should be set now (may be unset later) */
160 Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :
161 so->currPos.moreLeft);
162 Assert(!P_IGNORE(opaque));
163 Assert(BTScanPosIsPinned(so->currPos));
164 Assert(!so->needPrimScan);
165
166 /* initialize local variables */
168 arrayKeys = so->numArrayKeys != 0;
169 minoff = P_FIRSTDATAKEY(opaque);
170 maxoff = PageGetMaxOffsetNumber(page);
171
172 /* initialize page-level state that we'll pass to _bt_checkkeys */
173 pstate.minoff = minoff;
174 pstate.maxoff = maxoff;
175 pstate.finaltup = NULL;
176 pstate.page = page;
177 pstate.firstpage = firstpage;
178 pstate.forcenonrequired = false;
179 pstate.startikey = 0;
181 pstate.skip = InvalidOffsetNumber;
182 pstate.continuescan = true; /* default assumption */
183 pstate.rechecks = 0;
184 pstate.targetdistance = 0;
185 pstate.nskipadvances = 0;
186
187 if (scan->parallel_scan)
188 {
189 /* allow next/prev page to be read by other worker without delay */
190 if (ScanDirectionIsForward(dir))
191 _bt_parallel_release(scan, so->currPos.nextPage,
192 so->currPos.currPage);
193 else
194 _bt_parallel_release(scan, so->currPos.prevPage,
195 so->currPos.currPage);
196 }
197
198 PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot);
199
200 if (ScanDirectionIsForward(dir))
201 {
202 /* SK_SEARCHARRAY forward scans must provide high key up front */
203 if (arrayKeys)
204 {
205 if (!P_RIGHTMOST(opaque))
206 {
208
209 pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
210
211 if (unlikely(so->scanBehind) &&
212 !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
213 {
214 /* Schedule another primitive index scan after all */
215 so->currPos.moreRight = false;
216 so->needPrimScan = true;
217 if (scan->parallel_scan)
219 so->currPos.currPage);
220 return false;
221 }
222 }
223
224 so->scanBehind = so->oppositeDirCheck = false; /* reset */
225 }
226
227 /*
228 * Consider pstate.startikey optimization once the ongoing primitive
229 * index scan has already read at least one page
230 */
231 if (!pstate.firstpage && minoff < maxoff)
232 _bt_set_startikey(scan, &pstate);
233
234 /* load items[] in ascending order */
235 itemIndex = 0;
236
237 offnum = Max(offnum, minoff);
238
239 while (offnum <= maxoff)
240 {
241 ItemId iid = PageGetItemId(page, offnum);
242 IndexTuple itup;
243 bool passes_quals;
244
245 /*
246 * If the scan specifies not to return killed tuples, then we
247 * treat a killed tuple as not passing the qual
248 */
249 if (ignore_killed_tuples && ItemIdIsDead(iid))
250 {
251 offnum = OffsetNumberNext(offnum);
252 continue;
253 }
254
255 itup = (IndexTuple) PageGetItem(page, iid);
257
258 pstate.offnum = offnum;
259 passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
260 itup, indnatts);
261
262 /*
263 * Check if we need to skip ahead to a later tuple (only possible
264 * when the scan uses array keys)
265 */
266 if (arrayKeys && OffsetNumberIsValid(pstate.skip))
267 {
268 Assert(!passes_quals && pstate.continuescan);
269 Assert(offnum < pstate.skip);
270 Assert(!pstate.forcenonrequired);
271
272 offnum = pstate.skip;
273 pstate.skip = InvalidOffsetNumber;
274 continue;
275 }
276
277 if (passes_quals)
278 {
279 /* tuple passes all scan key conditions */
280 if (!BTreeTupleIsPosting(itup))
281 {
282 /* Remember it */
283 _bt_saveitem(so, itemIndex, offnum, itup);
284 itemIndex++;
285 }
286 else
287 {
288 int tupleOffset;
289
290 /* Set up posting list state (and remember first TID) */
291 tupleOffset =
292 _bt_setuppostingitems(so, itemIndex, offnum,
293 BTreeTupleGetPostingN(itup, 0),
294 itup);
295 itemIndex++;
296
297 /* Remember all later TIDs (must be at least one) */
298 for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
299 {
300 _bt_savepostingitem(so, itemIndex, offnum,
302 tupleOffset);
303 itemIndex++;
304 }
305 }
306 }
307 /* When !continuescan, there can't be any more matches, so stop */
308 if (!pstate.continuescan)
309 break;
310
311 offnum = OffsetNumberNext(offnum);
312 }
313
314 /*
315 * We don't need to visit page to the right when the high key
316 * indicates that no more matches will be found there.
317 *
318 * Checking the high key like this works out more often than you might
319 * think. Leaf page splits pick a split point between the two most
320 * dissimilar tuples (this is weighed against the need to evenly share
321 * free space). Leaf pages with high key attribute values that can
322 * only appear on non-pivot tuples on the right sibling page are
323 * common.
324 */
325 if (pstate.continuescan && !so->scanBehind && !P_RIGHTMOST(opaque))
326 {
328 IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
329 int truncatt;
330
331 /* Reset arrays, per _bt_set_startikey contract */
332 if (pstate.forcenonrequired)
333 _bt_start_array_keys(scan, dir);
334 pstate.forcenonrequired = false;
335 pstate.startikey = 0; /* _bt_set_startikey ignores P_HIKEY */
336
337 truncatt = BTreeTupleGetNAtts(itup, rel);
338 _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
339 }
340
341 if (!pstate.continuescan)
342 so->currPos.moreRight = false;
343
344 Assert(itemIndex <= MaxTIDsPerBTreePage);
345 so->currPos.firstItem = 0;
346 so->currPos.lastItem = itemIndex - 1;
347 so->currPos.itemIndex = 0;
348 }
349 else
350 {
351 /* SK_SEARCHARRAY backward scans must provide final tuple up front */
352 if (arrayKeys)
353 {
354 if (minoff <= maxoff && !P_LEFTMOST(opaque))
355 {
356 ItemId iid = PageGetItemId(page, minoff);
357
358 pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
359
360 if (unlikely(so->scanBehind) &&
361 !_bt_scanbehind_checkkeys(scan, dir, pstate.finaltup))
362 {
363 /* Schedule another primitive index scan after all */
364 so->currPos.moreLeft = false;
365 so->needPrimScan = true;
366 if (scan->parallel_scan)
368 so->currPos.currPage);
369 return false;
370 }
371 }
372
373 so->scanBehind = so->oppositeDirCheck = false; /* reset */
374 }
375
376 /*
377 * Consider pstate.startikey optimization once the ongoing primitive
378 * index scan has already read at least one page
379 */
380 if (!pstate.firstpage && minoff < maxoff)
381 _bt_set_startikey(scan, &pstate);
382
383 /* load items[] in descending order */
384 itemIndex = MaxTIDsPerBTreePage;
385
386 offnum = Min(offnum, maxoff);
387
388 while (offnum >= minoff)
389 {
390 ItemId iid = PageGetItemId(page, offnum);
391 IndexTuple itup;
392 bool tuple_alive;
393 bool passes_quals;
394
395 /*
396 * If the scan specifies not to return killed tuples, then we
397 * treat a killed tuple as not passing the qual. Most of the
398 * time, it's a win to not bother examining the tuple's index
399 * keys, but just skip to the next tuple (previous, actually,
400 * since we're scanning backwards). However, if this is the first
401 * tuple on the page, we do check the index keys, to prevent
402 * uselessly advancing to the page to the left. This is similar
403 * to the high key optimization used by forward scans.
404 */
405 if (ignore_killed_tuples && ItemIdIsDead(iid))
406 {
407 if (offnum > minoff)
408 {
409 offnum = OffsetNumberPrev(offnum);
410 continue;
411 }
412
413 tuple_alive = false;
414 }
415 else
416 tuple_alive = true;
417
418 itup = (IndexTuple) PageGetItem(page, iid);
420
421 pstate.offnum = offnum;
422 if (arrayKeys && offnum == minoff && pstate.forcenonrequired)
423 {
424 /* Reset arrays, per _bt_set_startikey contract */
425 pstate.forcenonrequired = false;
426 pstate.startikey = 0;
427 _bt_start_array_keys(scan, dir);
428 }
429 passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
430 itup, indnatts);
431
432 if (arrayKeys && so->scanBehind)
433 {
434 /*
435 * Done scanning this page, but not done with the current
436 * primscan.
437 *
438 * Note: Forward scans don't check this explicitly, since they
439 * prefer to reuse pstate.skip for this instead.
440 */
441 Assert(!passes_quals && pstate.continuescan);
442 Assert(!pstate.forcenonrequired);
443
444 break;
445 }
446
447 /*
448 * Check if we need to skip ahead to a later tuple (only possible
449 * when the scan uses array keys)
450 */
451 if (arrayKeys && OffsetNumberIsValid(pstate.skip))
452 {
453 Assert(!passes_quals && pstate.continuescan);
454 Assert(offnum > pstate.skip);
455 Assert(!pstate.forcenonrequired);
456
457 offnum = pstate.skip;
458 pstate.skip = InvalidOffsetNumber;
459 continue;
460 }
461
463 {
464 /* tuple passes all scan key conditions */
465 if (!BTreeTupleIsPosting(itup))
466 {
467 /* Remember it */
468 itemIndex--;
469 _bt_saveitem(so, itemIndex, offnum, itup);
470 }
471 else
472 {
474 int tupleOffset;
475
476 /* Set up posting list state (and remember last TID) */
477 itemIndex--;
478 tupleOffset =
479 _bt_setuppostingitems(so, itemIndex, offnum,
480 BTreeTupleGetPostingN(itup, nitems - 1),
481 itup);
482
483 /* Remember all prior TIDs (must be at least one) */
484 for (int i = nitems - 2; i >= 0; i--)
485 {
486 itemIndex--;
487 _bt_savepostingitem(so, itemIndex, offnum,
489 tupleOffset);
490 }
491 }
492 }
493 /* When !continuescan, there can't be any more matches, so stop */
494 if (!pstate.continuescan)
495 break;
496
497 offnum = OffsetNumberPrev(offnum);
498 }
499
500 /*
501 * We don't need to visit page to the left when no more matches will
502 * be found there
503 */
504 if (!pstate.continuescan)
505 so->currPos.moreLeft = false;
506
507 Assert(itemIndex >= 0);
508 so->currPos.firstItem = itemIndex;
509 so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
510 so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
511 }
512
513 /*
514 * If _bt_set_startikey told us to temporarily treat the scan's keys as
515 * nonrequired (possible only during scans with array keys), there must be
516 * no lasting consequences for the scan's array keys. The scan's arrays
517 * should now have exactly the same elements as they would have had if the
518 * nonrequired behavior had never been used. (In general, a scan's arrays
519 * are expected to track its progress through the index's key space.)
520 *
521 * We are required (by _bt_set_startikey) to call _bt_checkkeys against
522 * pstate.finaltup with pstate.forcenonrequired=false to allow the scan's
523 * arrays to recover. Assert that that step hasn't been missed.
524 */
525 Assert(!pstate.forcenonrequired);
526
527 return (so->currPos.firstItem <= so->currPos.lastItem);
528}
#define nitems(x)
Definition indent.h:31
static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, const ItemPointerData *heapTid, IndexTuple itup)
static bool _bt_scanbehind_checkkeys(IndexScanDesc scan, ScanDirection dir, IndexTuple finaltup)
static void _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset)
static void _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate)
static bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, IndexTuple tuple, int tupnatts)
void _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page)
Definition nbtree.c:1085
void _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
Definition nbtree.c:1008
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition predicate.c:2529
IndexTuple finaltup
Definition nbtreadpage.c:37
ScanDirection dir
Definition nbtreadpage.c:34
OffsetNumber minoff
Definition nbtreadpage.c:35
OffsetNumber offnum
Definition nbtreadpage.c:44
OffsetNumber skip
Definition nbtreadpage.c:47
OffsetNumber maxoff
Definition nbtreadpage.c:36
bool ignore_killed_tuples
Definition relscan.h:161

References _bt_checkkeys(), _bt_parallel_primscan_schedule(), _bt_parallel_release(), _bt_saveitem(), _bt_savepostingitem(), _bt_scanbehind_checkkeys(), _bt_set_startikey(), _bt_setuppostingitems(), _bt_start_array_keys(), Assert, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTreeTupleGetNAtts, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTScanPosIsPinned, BufferGetBlockNumber(), BufferGetPage(), BTReadPageState::continuescan, BTReadPageState::dir, fb(), BTReadPageState::finaltup, BTReadPageState::firstpage, BTReadPageState::forcenonrequired, i, IndexScanDescData::ignore_killed_tuples, IndexScanDescData::indexRelation, IndexRelationGetNumberOfAttributes, InvalidOffsetNumber, ItemIdIsDead, Max, BTReadPageState::maxoff, MaxTIDsPerBTreePage, Min, BTReadPageState::minoff, nitems, BTReadPageState::nskipadvances, BTReadPageState::offnum, OffsetNumberIsValid, OffsetNumberNext, OffsetNumberPrev, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_LEFTMOST, P_RIGHTMOST, BTReadPageState::page, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), IndexScanDescData::parallel_scan, PredicateLockPage(), BTReadPageState::rechecks, ScanDirectionIsForward, BTReadPageState::skip, BTReadPageState::startikey, BTReadPageState::targetdistance, unlikely, and IndexScanDescData::xs_snapshot.

Referenced by _bt_readfirstpage(), and _bt_readnextpage().

◆ _bt_relandgetbuf()

/*
 *	_bt_relandgetbuf() -- swap the lock/pin on one buffer for another.
 *
 * Releases the lock (and, when moving to a different block, the pin) on
 * obuf, then returns blkno pinned and locked in the requested access mode.
 * When blkno is the block obuf already holds, the pin is retained and only
 * the lock is traded in.  obuf, if valid, is assumed to be pinned and
 * locked by the caller (we unlock it here).  InvalidBuffer is accepted for
 * obuf, in which case this degenerates to a plain read-pin-lock sequence.
 * The page read is sanity-checked via _bt_checkpage before returning.
 */
Buffer
_bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
{
	Buffer		nbuf;

	Assert(BlockNumberIsValid(blkno));

	if (!BufferIsValid(obuf))
	{
		/* nothing held yet: just read the requested block */
		nbuf = ReadBuffer(rel, blkno);
	}
	else if (BufferGetBlockNumber(obuf) == blkno)
	{
		/* same page: keep the pin, trade the old lock for a new one */
		_bt_unlockbuf(rel, obuf);
		nbuf = obuf;
	}
	else
	{
		/* drop lock and pin in one call; a bit more efficient */
		_bt_relbuf(rel, obuf);
		nbuf = ReadBuffer(rel, blkno);
	}

	_bt_lockbuf(rel, nbuf, access);
	_bt_checkpage(rel, nbuf);

	return nbuf;
}

References _bt_checkpage(), _bt_lockbuf(), _bt_relbuf(), _bt_unlockbuf(), Assert, BlockNumberIsValid(), buf, BufferGetBlockNumber(), BufferIsValid(), fb(), and ReadBuffer().

Referenced by _bt_check_unique(), _bt_get_endpoint(), _bt_getroot(), _bt_gettrueroot(), _bt_lock_and_validate_left(), _bt_moveright(), _bt_search(), and _bt_stepright().

◆ _bt_relbuf()

void _bt_relbuf ( Relation  rel,
Buffer  buf 
)
extern

Definition at line 1044 of file nbtpage.c.

1045{
1046 /*
1047 * Buffer is pinned and locked, which means that it is expected to be
1048 * defined and addressable. Check that proactively.
1049 */
1051 if (!RelationUsesLocalBuffers(rel))
1053
1055}
void UnlockReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5603
#define VALGRIND_CHECK_MEM_IS_DEFINED(addr, size)
Definition memdebug.h:23
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition memdebug.h:27

References buf, BufferGetPage(), fb(), RelationUsesLocalBuffers, UnlockReleaseBuffer(), VALGRIND_CHECK_MEM_IS_DEFINED, and VALGRIND_MAKE_MEM_NOACCESS.

Referenced by _bt_allocbuf(), _bt_check_unique(), _bt_doinsert(), _bt_drop_lock_and_maybe_pin(), _bt_finish_split(), _bt_getroot(), _bt_getrootheight(), _bt_getstackbuf(), _bt_gettrueroot(), _bt_insert_parent(), _bt_insertonpg(), _bt_killitems(), _bt_leftsib_splitflag(), _bt_lock_and_validate_left(), _bt_lock_subtree_parent(), _bt_mark_page_halfdead(), _bt_metaversion(), _bt_moveright(), _bt_newlevel(), _bt_pagedel(), _bt_readnextpage(), _bt_relandgetbuf(), _bt_rightsib_halfdeadflag(), _bt_search_insert(), _bt_set_cleanup_info(), _bt_split(), _bt_stepright(), _bt_unlink_halfdead_page(), _bt_vacuum_needs_cleanup(), bt_rootdescend(), btvacuumpage(), and pgstat_btree_page().

◆ _bt_search()

BTStack _bt_search ( Relation  rel,
Relation  heaprel,
BTScanInsert  key,
Buffer bufP,
int  access,
bool  returnstack 
)
extern

Definition at line 100 of file nbtsearch.c.

102{
104 int page_access = BT_READ;
105
106 /* heaprel must be set whenever _bt_allocbuf is reachable */
108 Assert(access == BT_READ || heaprel != NULL);
109
110 /* Get the root page to start with */
111 *bufP = _bt_getroot(rel, heaprel, access);
112
113 /* If index is empty and access = BT_READ, no root page is created. */
114 if (!BufferIsValid(*bufP))
115 return (BTStack) NULL;
116
117 /* Loop iterates once per level descended in the tree */
118 for (;;)
119 {
120 Page page;
121 BTPageOpaque opaque;
122 OffsetNumber offnum;
123 ItemId itemid;
124 IndexTuple itup;
125 BlockNumber child;
127
128 /*
129 * Race -- the page we just grabbed may have split since we read its
130 * downlink in its parent page (or the metapage). If it has, we may
131 * need to move right to its new sibling. Do that.
132 *
133 * In write-mode, allow _bt_moveright to finish any incomplete splits
134 * along the way. Strictly speaking, we'd only need to finish an
135 * incomplete split on the leaf page we're about to insert to, not on
136 * any of the upper levels (internal pages with incomplete splits are
137 * also taken care of in _bt_getstackbuf). But this is a good
138 * opportunity to finish splits of internal pages too.
139 */
140 *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
142
143 /* if this is a leaf page, we're done */
144 page = BufferGetPage(*bufP);
145 opaque = BTPageGetOpaque(page);
146 if (P_ISLEAF(opaque))
147 break;
148
149 /*
150 * Find the appropriate pivot tuple on this page. Its downlink points
151 * to the child page that we're about to descend to.
152 */
153 offnum = _bt_binsrch(rel, key, *bufP);
154 itemid = PageGetItemId(page, offnum);
155 itup = (IndexTuple) PageGetItem(page, itemid);
156 Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
157 child = BTreeTupleGetDownLink(itup);
158
159 /*
160 * We need to save the location of the pivot tuple we chose in a new
161 * stack entry for this page/level. If caller ends up splitting a
162 * page one level down, it usually ends up inserting a new pivot
163 * tuple/downlink immediately after the location recorded here.
164 */
165 if (returnstack)
166 {
168 new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
169 new_stack->bts_offset = offnum;
170 new_stack->bts_parent = stack_in;
172 }
173
174 /*
175 * Page level 1 is lowest non-leaf page level prior to leaves. So, if
176 * we're on the level 1 and asked to lock leaf page in write mode,
177 * then lock next page in write mode, because it must be a leaf.
178 */
179 if (opaque->btpo_level == 1 && access == BT_WRITE)
181
182 /* drop the read lock on the page, then acquire one on its child */
183 *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
184
185 /* okay, all set to move down a level */
186 }
187
188 /*
189 * If we're asked to lock leaf in write mode, but didn't manage to, then
190 * relock. This should only happen when the root page is a leaf page (and
191 * the only page in the index other than the metapage).
192 */
193 if (access == BT_WRITE && page_access == BT_READ)
194 {
195 /* trade in our read lock for a write lock */
196 _bt_unlockbuf(rel, *bufP);
197 _bt_lockbuf(rel, *bufP, BT_WRITE);
198
199 /*
200 * Race -- the leaf page may have split after we dropped the read lock
201 * but before we acquired a write lock. If it has, we may need to
202 * move right to its new sibling. Do that.
203 */
204 *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
205 }
206
207 return stack_in;
208}
BTStackData * BTStack
Definition nbtree.h:750
static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access)
Definition nbtsearch.c:242

References _bt_binsrch(), _bt_getroot(), _bt_lockbuf(), _bt_moveright(), _bt_relandgetbuf(), _bt_unlockbuf(), Assert, BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTreeTupleGetDownLink(), BTreeTupleIsPivot(), BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), fb(), P_ISLEAF, PageGetItem(), PageGetItemId(), and palloc_object.

Referenced by _bt_first(), _bt_pagedel(), _bt_search_insert(), and bt_rootdescend().

◆ _bt_set_cleanup_info()

void _bt_set_cleanup_info ( Relation  rel,
BlockNumber  num_delpages 
)
extern

Definition at line 233 of file nbtpage.c.

234{
236 Page metapg;
239
240 /*
241 * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
242 * field started out as a TransactionId field called btm_oldest_btpo_xact.
243 * Both "versions" are just uint32 fields. It was convenient to repurpose
244 * the field when we began to use 64-bit XIDs in deleted pages.
245 *
246 * It's possible that a pg_upgrade'd database will contain an XID value in
247 * what is now recognized as the metapage's btm_last_cleanup_num_delpages
248 * field. _bt_vacuum_needs_cleanup() may even believe that this value
249 * indicates that there are lots of pages that it needs to recycle, when
250 * in reality there are only one or two. The worst that can happen is
251 * that there will be a call to btvacuumscan a little earlier, which will
252 * set btm_last_cleanup_num_delpages to a sane value when we're called.
253 *
254 * Note also that the metapage's btm_last_cleanup_num_heap_tuples field is
255 * no longer used as of PostgreSQL 14. We set it to -1.0 on rewrite, just
256 * to be consistent.
257 */
261
262 /* Don't miss chance to upgrade index/metapage when BTREE_MIN_VERSION */
263 if (metad->btm_version >= BTREE_NOVAC_VERSION &&
264 metad->btm_last_cleanup_num_delpages == num_delpages)
265 {
266 /* Usually means index continues to have num_delpages of 0 */
267 _bt_relbuf(rel, metabuf);
268 return;
269 }
270
271 /* trade in our read lock for a write lock */
274
276
277 /* upgrade meta-page if needed */
278 if (metad->btm_version < BTREE_NOVAC_VERSION)
280
281 /* update cleanup-related information */
282 metad->btm_last_cleanup_num_delpages = num_delpages;
283 metad->btm_last_cleanup_num_heap_tuples = -1.0;
285
286 /* write wal record if needed */
287 if (RelationNeedsWAL(rel))
288 {
290
293
294 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
295 md.version = metad->btm_version;
296 md.root = metad->btm_root;
297 md.level = metad->btm_level;
298 md.fastroot = metad->btm_fastroot;
299 md.fastlevel = metad->btm_fastlevel;
301 md.allequalimage = metad->btm_allequalimage;
302
303 XLogRegisterBufData(0, &md, sizeof(xl_btree_metadata));
304
306 }
307 else
308 recptr = XLogGetFakeLSN(rel);
309
311
313
314 _bt_relbuf(rel, metabuf);
315}
#define XLOG_BTREE_META_CLEANUP
Definition nbtxlog.h:41

References _bt_getbuf(), _bt_lockbuf(), _bt_relbuf(), _bt_unlockbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert, BT_READ, BT_WRITE, BTPageGetMeta, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BufferGetPage(), END_CRIT_SECTION, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, fb(), xl_btree_metadata::last_cleanup_num_delpages, xl_btree_metadata::level, MarkBufferDirty(), PageSetLSN(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationNeedsWAL, xl_btree_metadata::root, START_CRIT_SECTION, xl_btree_metadata::version, XLOG_BTREE_META_CLEANUP, XLogBeginInsert(), XLogGetFakeLSN(), XLogInsert(), XLogRegisterBufData(), and XLogRegisterBuffer().

Referenced by btvacuumcleanup().

◆ _bt_start_array_keys()

void _bt_start_array_keys ( IndexScanDesc  scan,
ScanDirection  dir 
)
extern

Definition at line 537 of file nbtreadpage.c.

538{
539 Relation rel = scan->indexRelation;
541
542 Assert(so->numArrayKeys);
543 Assert(so->qual_ok);
544
545 for (int i = 0; i < so->numArrayKeys; i++)
546 {
547 BTArrayKeyInfo *array = &so->arrayKeys[i];
548 ScanKey skey = &so->keyData[array->scan_key];
549
550 Assert(skey->sk_flags & SK_SEARCHARRAY);
551
554 }
555 so->scanBehind = so->oppositeDirCheck = false; /* reset */
556}
static void _bt_array_set_low_or_high(Relation rel, ScanKey skey, BTArrayKeyInfo *array, bool low_not_high)

References _bt_array_set_low_or_high(), Assert, fb(), i, IndexScanDescData::indexRelation, IndexScanDescData::opaque, BTArrayKeyInfo::scan_key, ScanDirectionIsForward, and SK_SEARCHARRAY.

Referenced by _bt_advance_array_keys_increment(), _bt_first(), _bt_readpage(), and btrestrpos().

◆ _bt_start_vacuum()

BTCycleId _bt_start_vacuum ( Relation  rel)
extern

Definition at line 473 of file nbtutils.c.

474{
476 int i;
477 BTOneVacInfo *vac;
478
480
481 /*
482 * Assign the next cycle ID, being careful to avoid zero as well as the
483 * reserved high values.
484 */
486 if (result == 0 || result > MAX_BT_CYCLE_ID)
488
489 /* Let's just make sure there's no entry already for this index */
490 for (i = 0; i < btvacinfo->num_vacuums; i++)
491 {
492 vac = &btvacinfo->vacuums[i];
493 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
494 vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
495 {
496 /*
497 * Unlike most places in the backend, we have to explicitly
498 * release our LWLock before throwing an error. This is because
499 * we expect _bt_end_vacuum() to be called before transaction
500 * abort cleanup can run to release LWLocks.
501 */
503 elog(ERROR, "multiple active vacuums for index \"%s\"",
505 }
506 }
507
508 /* OK, add an entry */
510 {
512 elog(ERROR, "out of btvacinfo slots");
513 }
515 vac->relid = rel->rd_lockInfo.lockRelId;
516 vac->cycleid = result;
518
520 return result;
521}
#define MAX_BT_CYCLE_ID
Definition nbtree.h:94
uint16 BTCycleId
Definition nbtree.h:30
BTCycleId cycleid
Definition nbtutils.c:408
BTCycleId cycle_ctr
Definition nbtutils.c:413
int max_vacuums
Definition nbtutils.c:415

References btvacinfo, BTVacInfo::cycle_ctr, BTOneVacInfo::cycleid, LockRelId::dbId, elog, ERROR, fb(), i, LockInfoData::lockRelId, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), MAX_BT_CYCLE_ID, BTVacInfo::max_vacuums, BTVacInfo::num_vacuums, RelationData::rd_lockInfo, RelationGetRelationName, BTOneVacInfo::relid, LockRelId::relId, result, and BTVacInfo::vacuums.

Referenced by btbulkdelete().

◆ _bt_swap_posting()

IndexTuple _bt_swap_posting ( IndexTuple  newitem,
IndexTuple  oposting,
int  postingoff 
)
extern

Definition at line 1022 of file nbtdedup.c.

1023{
1024 int nhtids;
1025 char *replacepos;
1026 char *replaceposright;
1029
1032
1033 /*
1034 * The postingoff argument originated as a _bt_binsrch_posting() return
1035 * value. It will be 0 in the event of corruption that makes a leaf page
1036 * contain a non-pivot tuple that's somehow identical to newitem (no two
1037 * non-pivot tuples should ever have the same TID). This has been known
1038 * to happen in the field from time to time.
1039 *
1040 * Perform a basic sanity check to catch this case now.
1041 */
1042 if (!(postingoff > 0 && postingoff < nhtids))
1043 elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
1044 nhtids, postingoff);
1045
1046 /*
1047 * Move item pointers in posting list to make a gap for the new item's
1048 * heap TID. We shift TIDs one place to the right, losing original
1049 * rightmost TID. (nmovebytes must not include TIDs to the left of
1050 * postingoff, nor the existing rightmost/max TID that gets overwritten.)
1051 */
1053 replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
1054 replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
1055 nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
1057
1058 /* Fill the gap at postingoff with TID of new item (original new TID) */
1059 Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
1061
1062 /* Now copy oposting's rightmost/max TID into new item (final new TID) */
1064
1066 BTreeTupleGetHeapTID(newitem)) < 0);
1068
1069 return nposting;
1070}

References Assert, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), CopyIndexTuple(), elog, ERROR, fb(), ItemPointerCompare(), ItemPointerCopy(), and IndexTupleData::t_tid.

Referenced by _bt_insertonpg(), btree_xlog_insert(), and btree_xlog_split().

◆ _bt_truncate()

IndexTuple _bt_truncate ( Relation  rel,
IndexTuple  lastleft,
IndexTuple  firstright,
BTScanInsert  itup_key 
)
extern

Definition at line 692 of file nbtutils.c.

694{
697 int keepnatts;
702
703 /*
704 * We should only ever truncate non-pivot tuples from leaf pages. It's
705 * never okay to truncate when splitting an internal page.
706 */
708
709 /* Determine how many attributes must be kept in truncated tuple */
710 keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key);
711
712#ifdef DEBUG_NO_TRUNCATE
713 /* Force truncation to be ineffective for testing purposes */
714 keepnatts = nkeyatts + 1;
715#endif
716
719
721 {
722 /*
723 * index_truncate_tuple() just returns a straight copy of firstright
724 * when it has no attributes to truncate. When that happens, we may
725 * need to truncate away a posting list here instead.
726 */
729 pivot->t_info &= ~INDEX_SIZE_MASK;
731 }
732
733 /*
734 * If there is a distinguishing key attribute within pivot tuple, we're
735 * done
736 */
737 if (keepnatts <= nkeyatts)
738 {
740 return pivot;
741 }
742
743 /*
744 * We have to store a heap TID in the new pivot tuple, since no non-TID
745 * key attribute value in firstright distinguishes the right side of the
746 * split from the left side. nbtree conceptualizes this case as an
747 * inability to truncate away any key attributes, since heap TID is
748 * treated as just another key attribute (despite lacking a pg_attribute
749 * entry).
750 *
751 * Use enlarged space that holds a copy of pivot. We need the extra space
752 * to store a heap TID at the end (using the special pivot tuple
753 * representation). Note that the original pivot already has firstright's
754 * possible posting list/non-key attribute values removed at this point.
755 */
759 /* Cannot leak memory here */
760 pfree(pivot);
761
762 /*
763 * Store all of firstright's key attribute values plus a tiebreaker heap
764 * TID value in enlarged pivot tuple
765 */
766 tidpivot->t_info &= ~INDEX_SIZE_MASK;
767 tidpivot->t_info |= newsize;
770
771 /*
772 * Lehman & Yao use lastleft as the leaf high key in all cases, but don't
773 * consider suffix truncation. It seems like a good idea to follow that
774 * example in cases where no truncation takes place -- use lastleft's heap
775 * TID. (This is also the closest value to negative infinity that's
776 * legally usable.)
777 */
779
780 /*
781 * We're done. Assert() that heap TID invariants hold before returning.
782 *
783 * Lehman and Yao require that the downlink to the right page, which is to
784 * be inserted into the parent page in the second phase of a page split be
785 * a strict lower bound on items on the right page, and a non-strict upper
786 * bound for items on the left page. Assert that heap TIDs follow these
787 * invariants, since a heap TID value is apparently needed as a
788 * tiebreaker.
789 */
790#ifndef DEBUG_NO_TRUNCATE
794 BTreeTupleGetHeapTID(lastleft)) >= 0);
797#else
798
799 /*
800 * Those invariants aren't guaranteed to hold for lastleft + firstright
801 * heap TID attribute values when they're considered here only because
802 * DEBUG_NO_TRUNCATE is defined (a heap TID is probably not actually
803 * needed as a tiebreaker). DEBUG_NO_TRUNCATE must therefore use a heap
804 * TID value that always works as a strict lower bound for items to the
805 * right. In particular, it must avoid using firstright's leading key
806 * attribute values along with lastleft's heap TID value when lastleft's
807 * TID happens to be greater than firstright's TID.
808 */
810
811 /*
812 * Pivot heap TID should never be fully equal to firstright. Note that
813 * the pivot heap TID will still end up equal to lastleft's heap TID when
814 * that's the only usable value.
815 */
820#endif
821
822 return tidpivot;
823}
IndexTuple index_truncate_tuple(TupleDesc sourceDescriptor, IndexTuple source, int leavenatts)
Definition indextuple.c:508
static void ItemPointerSetOffsetNumber(ItemPointerData *pointer, OffsetNumber offsetNumber)
Definition itemptr.h:158
static void BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid)
Definition nbtree.h:596
static int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
Definition nbtutils.c:837

References _bt_keep_natts(), Assert, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTreeTupleSetNAtts(), fb(), index_truncate_tuple(), IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, IndexTupleSize(), ItemPointerCompare(), ItemPointerCopy(), ItemPointerGetOffsetNumber(), ItemPointerSetOffsetNumber(), MAXALIGN, memcpy(), Min, OffsetNumberPrev, palloc0(), pfree(), and RelationGetDescr.

Referenced by _bt_buildadd(), and _bt_split().

◆ _bt_unlockbuf()

void _bt_unlockbuf ( Relation  rel,
Buffer  buf 
)
extern

Definition at line 1098 of file nbtpage.c.

1099{
1100 /*
1101 * Buffer is pinned and locked, which means that it is expected to be
1102 * defined and addressable. Check that proactively.
1103 */
1105
1106 /* LockBuffer() asserts that pin is held by this backend */
1108
1109 if (!RelationUsesLocalBuffers(rel))
1111}
@ BUFFER_LOCK_UNLOCK
Definition bufmgr.h:207

References buf, BUFFER_LOCK_UNLOCK, BufferGetPage(), fb(), LockBuffer(), RelationUsesLocalBuffers, VALGRIND_CHECK_MEM_IS_DEFINED, and VALGRIND_MAKE_MEM_NOACCESS.

Referenced by _bt_drop_lock_and_maybe_pin(), _bt_getroot(), _bt_killitems(), _bt_moveright(), _bt_pagedel(), _bt_readfirstpage(), _bt_relandgetbuf(), _bt_search(), _bt_set_cleanup_info(), and _bt_unlink_halfdead_page().

◆ _bt_update_posting()

void _bt_update_posting ( BTVacuumPosting  vacposting)
extern

Definition at line 924 of file nbtdedup.c.

925{
927 uint32 keysize,
928 newsize;
929 IndexTuple itup;
930 int nhtids;
931 int ui,
932 d;
933 ItemPointer htids;
934
935 nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
936
938 Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
939
940 /*
941 * Determine final size of new tuple.
942 *
943 * This calculation needs to match the code used within _bt_form_posting()
944 * for new posting list tuples. We avoid calling _bt_form_posting() here
945 * to save ourselves a second memory allocation for a htids workspace.
946 */
948 if (nhtids > 1)
949 newsize = MAXALIGN(keysize +
950 nhtids * sizeof(ItemPointerData));
951 else
952 newsize = keysize;
953
956
957 /* Allocate memory using palloc0() (matches index_form_tuple()) */
958 itup = palloc0(newsize);
959 memcpy(itup, origtuple, keysize);
960 itup->t_info &= ~INDEX_SIZE_MASK;
961 itup->t_info |= newsize;
962
963 if (nhtids > 1)
964 {
965 /* Form posting list tuple */
966 BTreeTupleSetPosting(itup, nhtids, keysize);
967 htids = BTreeTupleGetPosting(itup);
968 }
969 else
970 {
971 /* Form standard non-pivot tuple */
973 htids = &itup->t_tid;
974 }
975
976 ui = 0;
977 d = 0;
978 for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
979 {
980 if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
981 {
982 d++;
983 continue;
984 }
985 htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
986 }
987 Assert(ui == nhtids);
988 Assert(d == vacposting->ndeletedtids);
989 Assert(nhtids == 1 || _bt_posting_valid(itup));
990 Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));
991
992 /* vacposting arg's itup will now point to updated version */
993 vacposting->itup = itup;
994}

References Assert, BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleGetPostingN(), BTreeTupleGetPostingOffset(), BTreeTupleSetPosting(), fb(), i, INDEX_SIZE_MASK, ItemPointerIsValid(), MAXALIGN, memcpy(), palloc0(), IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_delitems_update(), and btree_xlog_updates().

◆ _bt_upgradelockbufcleanup()

void _bt_upgradelockbufcleanup ( Relation  rel,
Buffer  buf 
)
extern

Definition at line 1137 of file nbtpage.c.

1138{
1139 /*
1140 * Buffer is pinned and locked, which means that it is expected to be
1141 * defined and addressable. Check that proactively.
1142 */
1144
1145 /* LockBuffer() asserts that pin is held by this backend */
1148}
void LockBufferForCleanup(Buffer buffer)
Definition bufmgr.c:6670

References buf, BUFFER_LOCK_UNLOCK, BufferGetPage(), fb(), LockBuffer(), LockBufferForCleanup(), and VALGRIND_CHECK_MEM_IS_DEFINED.

Referenced by btvacuumpage().

◆ _bt_upgrademetapage()

void _bt_upgrademetapage ( Page  page)
extern

Definition at line 108 of file nbtpage.c.

109{
112
113 metad = BTPageGetMeta(page);
115
116 /* It must be really a meta page of upgradable version */
117 Assert(metaopaque->btpo_flags & BTP_META);
118 Assert(metad->btm_version < BTREE_NOVAC_VERSION);
119 Assert(metad->btm_version >= BTREE_MIN_VERSION);
120
121 /* Set version number and fill extra fields added into version 3 */
122 metad->btm_version = BTREE_NOVAC_VERSION;
123 metad->btm_last_cleanup_num_delpages = 0;
124 metad->btm_last_cleanup_num_heap_tuples = -1.0;
125 /* Only a REINDEX can set this field */
126 Assert(!metad->btm_allequalimage);
127 metad->btm_allequalimage = false;
128
129 /* Adjust pd_lower (see _bt_initmetapage() for details) */
130 ((PageHeader) page)->pd_lower =
131 ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
132}

References Assert, BTP_META, BTPageGetMeta, BTPageGetOpaque, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, fb(), and PG_USED_FOR_ASSERTS_ONLY.

Referenced by _bt_getroot(), _bt_insertonpg(), _bt_newlevel(), _bt_set_cleanup_info(), and _bt_unlink_halfdead_page().

◆ _bt_vacuum_cycleid()

BTCycleId _bt_vacuum_cycleid ( Relation  rel)
extern

Definition at line 439 of file nbtutils.c.

440{
441 BTCycleId result = 0;
442 int i;
443
444 /* Share lock is enough since this is a read-only operation */
446
447 for (i = 0; i < btvacinfo->num_vacuums; i++)
448 {
450
451 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
452 vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
453 {
454 result = vac->cycleid;
455 break;
456 }
457 }
458
460 return result;
461}
@ LW_SHARED
Definition lwlock.h:105

References btvacinfo, BTOneVacInfo::cycleid, LockRelId::dbId, fb(), i, LockInfoData::lockRelId, LW_SHARED, LWLockAcquire(), LWLockRelease(), BTVacInfo::num_vacuums, RelationData::rd_lockInfo, BTOneVacInfo::relid, LockRelId::relId, result, and BTVacInfo::vacuums.

Referenced by _bt_split().

◆ _bt_vacuum_needs_cleanup()

bool _bt_vacuum_needs_cleanup ( Relation  rel)
extern

Definition at line 180 of file nbtpage.c.

181{
183 Page metapg;
185 uint32 btm_version;
187
188 /*
189 * Copy details from metapage to local variables quickly.
190 *
191 * Note that we deliberately avoid using cached version of metapage here.
192 */
196 btm_version = metad->btm_version;
197
198 if (btm_version < BTREE_NOVAC_VERSION)
199 {
200 /*
201 * Metapage needs to be dynamically upgraded to store fields that are
202 * only present when btm_version >= BTREE_NOVAC_VERSION
203 */
204 _bt_relbuf(rel, metabuf);
205 return true;
206 }
207
208 prev_num_delpages = metad->btm_last_cleanup_num_delpages;
209 _bt_relbuf(rel, metabuf);
210
211 /*
212 * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
213 * total size of the index. We can reasonably expect (though are not
214 * guaranteed) to be able to recycle this many pages if we decide to do a
215 * btvacuumscan call during the ongoing btvacuumcleanup. For further
216 * details see the nbtree/README section on placing deleted pages in the
217 * FSM.
218 */
219 if (prev_num_delpages > 0 &&
221 return true;
222
223 return false;
224}
#define RelationGetNumberOfBlocks(reln)
Definition bufmgr.h:309

References _bt_getbuf(), _bt_relbuf(), BT_READ, BTPageGetMeta, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BufferGetPage(), fb(), and RelationGetNumberOfBlocks.

Referenced by btvacuumcleanup().

◆ btadjustmembers()

void btadjustmembers ( Oid  opfamilyoid,
Oid  opclassoid,
List operators,
List functions 
)
extern

Definition at line 288 of file nbtvalidate.c.

292{
293 Oid opcintype;
294 ListCell *lc;
295
296 /*
297 * Btree operators and comparison support functions are always "loose"
298 * members of the opfamily if they are cross-type. If they are not
299 * cross-type, we prefer to tie them to the appropriate opclass ... but if
300 * the user hasn't created one, we can't do that, and must fall back to
301 * using the opfamily dependency. (We mustn't force creation of an
 302 * opclass in such a case, as leaving an incomplete opclass lying about
303 * would be bad. Throwing an error is another undesirable alternative.)
304 *
305 * This behavior results in a bit of a dump/reload hazard, in that the
306 * order of restoring objects could affect what dependencies we end up
307 * with. pg_dump's existing behavior will preserve the dependency choices
308 * in most cases, but not if a cross-type operator has been bound tightly
309 * into an opclass. That's a mistake anyway, so silently "fixing" it
310 * isn't awful.
311 *
312 * Optional support functions are always "loose" family members.
313 *
314 * To avoid repeated lookups, we remember the most recently used opclass's
315 * input type.
316 */
317 if (OidIsValid(opclassoid))
318 {
319 /* During CREATE OPERATOR CLASS, need CCI to see the pg_opclass row */
321 opcintype = get_opclass_input_type(opclassoid);
322 }
323 else
324 opcintype = InvalidOid;
325
326 /*
327 * We handle operators and support functions almost identically, so rather
328 * than duplicate this code block, just join the lists.
329 */
330 foreach(lc, list_concat_copy(operators, functions))
331 {
333
334 if (op->is_func && op->number != BTORDER_PROC)
335 {
336 /* Optional support proc, so always a soft family dependency */
337 op->ref_is_hard = false;
338 op->ref_is_family = true;
339 op->refobjid = opfamilyoid;
340 }
341 else if (op->lefttype != op->righttype)
342 {
343 /* Cross-type, so always a soft family dependency */
344 op->ref_is_hard = false;
345 op->ref_is_family = true;
346 op->refobjid = opfamilyoid;
347 }
348 else
349 {
350 /* Not cross-type; is there a suitable opclass? */
351 if (op->lefttype != opcintype)
352 {
353 /* Avoid repeating this expensive lookup, even if it fails */
354 opcintype = op->lefttype;
357 opcintype);
358 }
359 if (OidIsValid(opclassoid))
360 {
361 /* Hard dependency on opclass */
362 op->ref_is_hard = true;
363 op->ref_is_family = false;
364 op->refobjid = opclassoid;
365 }
366 else
367 {
368 /* We're stuck, so make a soft dependency on the opfamily */
369 op->ref_is_hard = false;
370 op->ref_is_family = true;
371 op->refobjid = opfamilyoid;
372 }
373 }
374 }
375}
Oid opclass_for_family_datatype(Oid amoid, Oid opfamilyoid, Oid datatypeoid)
Definition amvalidate.c:236
List * list_concat_copy(const List *list1, const List *list2)
Definition list.c:598
Oid get_opclass_input_type(Oid opclass)
Definition lsyscache.c:1384
#define lfirst(lc)
Definition pg_list.h:172
static const struct fns functions
Definition regcomp.c:358
Oid refobjid
Definition amapi.h:98
Oid lefttype
Definition amapi.h:93
bool ref_is_family
Definition amapi.h:97
Oid righttype
Definition amapi.h:94
bool is_func
Definition amapi.h:90
bool ref_is_hard
Definition amapi.h:96
void CommandCounterIncrement(void)
Definition xact.c:1130

References BTORDER_PROC, CommandCounterIncrement(), fb(), functions, get_opclass_input_type(), InvalidOid, OpFamilyMember::is_func, OpFamilyMember::lefttype, lfirst, list_concat_copy(), OpFamilyMember::number, OidIsValid, opclass_for_family_datatype(), OpFamilyMember::ref_is_family, OpFamilyMember::ref_is_hard, OpFamilyMember::refobjid, and OpFamilyMember::righttype.

Referenced by bthandler().

◆ btbeginscan()

IndexScanDesc btbeginscan ( Relation  rel,
int  nkeys,
int  norderbys 
)
extern

Definition at line 339 of file nbtree.c.

340{
341 IndexScanDesc scan;
343
344 /* no order by operators allowed */
345 Assert(norderbys == 0);
346
347 /* get the scan */
348 scan = RelationGetIndexScan(rel, nkeys, norderbys);
349
350 /* allocate private workspace */
352 BTScanPosInvalidate(so->currPos);
353 BTScanPosInvalidate(so->markPos);
354 if (scan->numberOfKeys > 0)
355 so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
356 else
357 so->keyData = NULL;
358
359 so->skipScan = false;
360 so->needPrimScan = false;
361 so->scanBehind = false;
362 so->oppositeDirCheck = false;
363 so->arrayKeys = NULL;
364 so->orderProcs = NULL;
365 so->arrayContext = NULL;
366
367 so->killedItems = NULL; /* until needed */
368 so->numKilled = 0;
369
370 /*
371 * We don't know yet whether the scan will be index-only, so we do not
372 * allocate the tuple workspace arrays until btrescan. However, we set up
373 * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
374 */
375 so->currTuples = so->markTuples = NULL;
376
377 scan->xs_itupdesc = RelationGetDescr(rel);
378
379 scan->opaque = so;
380
381 return scan;
382}
IndexScanDesc RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
Definition genam.c:80
struct TupleDescData * xs_itupdesc
Definition relscan.h:181

References Assert, BTScanPosInvalidate, fb(), IndexScanDescData::numberOfKeys, IndexScanDescData::opaque, palloc(), palloc_object, RelationGetDescr, RelationGetIndexScan(), and IndexScanDescData::xs_itupdesc.

Referenced by bthandler().

◆ btbuild()

IndexBuildResult * btbuild ( Relation  heap,
Relation  index,
struct IndexInfo indexInfo 
)
extern

Definition at line 299 of file nbtsort.c.

300{
303 double reltuples;
304
305#ifdef BTREE_BUILD_STATS
307 ResetUsage();
308#endif /* BTREE_BUILD_STATS */
309
310 buildstate.isunique = indexInfo->ii_Unique;
311 buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
312 buildstate.havedead = false;
313 buildstate.heap = heap;
314 buildstate.spool = NULL;
315 buildstate.spool2 = NULL;
316 buildstate.indtuples = 0;
317 buildstate.btleader = NULL;
318
319 /*
320 * We expect to be called exactly once for any index relation. If that's
321 * not the case, big trouble's what we have.
322 */
324 elog(ERROR, "index \"%s\" already contains data",
326
327 reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
328
329 /*
330 * Finish the build by (1) completing the sort of the spool file, (2)
331 * inserting the sorted tuples into btree pages and (3) building the upper
332 * levels. Finally, it may also be necessary to end use of parallelism.
333 */
334 _bt_leafbuild(buildstate.spool, buildstate.spool2);
336 if (buildstate.spool2)
338 if (buildstate.btleader)
340
342
343 result->heap_tuples = reltuples;
344 result->index_tuples = buildstate.indtuples;
345
346#ifdef BTREE_BUILD_STATS
348 {
349 ShowUsage("BTREE BUILD STATS");
350 ResetUsage();
351 }
352#endif /* BTREE_BUILD_STATS */
353
354 return result;
355}
static void _bt_end_parallel(BTLeader *btleader)
Definition nbtsort.c:1611
static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
Definition nbtsort.c:542
static double _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, IndexInfo *indexInfo)
Definition nbtsort.c:369
static void _bt_spooldestroy(BTSpool *btspool)
Definition nbtsort.c:521
bool ii_Unique
Definition execnodes.h:214
bool ii_NullsNotDistinct
Definition execnodes.h:216
Definition type.h:96

References _bt_end_parallel(), _bt_leafbuild(), _bt_spooldestroy(), _bt_spools_heapscan(), elog, ERROR, fb(), IndexInfo::ii_NullsNotDistinct, IndexInfo::ii_Unique, log_btree_build_stats, palloc_object, RelationGetNumberOfBlocks, RelationGetRelationName, ResetUsage(), result, and ShowUsage().

Referenced by bthandler().

◆ btbuildempty()

void btbuildempty ( Relation  index)
extern

Definition at line 183 of file nbtree.c.

184{
185 bool allequalimage = _bt_allequalimage(index, false);
186 BulkWriteState *bulkstate;
188
190
191 /* Construct metapage. */
192 metabuf = smgr_bulk_get_buf(bulkstate);
193 _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
194 smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
195
196 smgr_bulk_finish(bulkstate);
197}
BulkWriteState * smgr_bulk_start_rel(Relation rel, ForkNumber forknum)
Definition bulk_write.c:87
void smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std)
Definition bulk_write.c:323
BulkWriteBuffer smgr_bulk_get_buf(BulkWriteState *bulkstate)
Definition bulk_write.c:347
void smgr_bulk_finish(BulkWriteState *bulkstate)
Definition bulk_write.c:130
void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
Definition nbtpage.c:68
bool _bt_allequalimage(Relation rel, bool debugmessage)
Definition nbtutils.c:1175
@ INIT_FORKNUM
Definition relpath.h:61

References _bt_allequalimage(), _bt_initmetapage(), BTREE_METAPAGE, fb(), INIT_FORKNUM, P_NONE, smgr_bulk_finish(), smgr_bulk_get_buf(), smgr_bulk_start_rel(), and smgr_bulk_write().

Referenced by bthandler().

◆ btbuildphasename()

char * btbuildphasename ( int64  phasenum)
extern

Definition at line 644 of file nbtutils.c.

645{
646 switch (phasenum)
647 {
649 return "initializing";
651 return "scanning table";
653 return "sorting live tuples";
655 return "sorting dead tuples";
657 return "loading tuples in tree";
658 default:
659 return NULL;
660 }
661}
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2
Definition nbtree.h:1148
#define PROGRESS_BTREE_PHASE_LEAF_LOAD
Definition nbtree.h:1149
#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN
Definition nbtree.h:1146
#define PROGRESS_BTREE_PHASE_PERFORMSORT_1
Definition nbtree.h:1147
#define PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE
Definition progress.h:135

References fb(), PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN, PROGRESS_BTREE_PHASE_LEAF_LOAD, PROGRESS_BTREE_PHASE_PERFORMSORT_1, PROGRESS_BTREE_PHASE_PERFORMSORT_2, and PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE.

Referenced by bthandler().

◆ btbulkdelete()

IndexBulkDeleteResult * btbulkdelete ( IndexVacuumInfo info,
IndexBulkDeleteResult stats,
IndexBulkDeleteCallback  callback,
void callback_state 
)
extern

Definition at line 1119 of file nbtree.c.

1121{
1122 Relation rel = info->index;
1123 BTCycleId cycleid;
1124
1125 /* allocate stats if first time through, else re-use existing struct */
1126 if (stats == NULL)
1128
1129 /* Establish the vacuum cycle ID to use for this scan */
1130 /* The ENSURE stuff ensures we clean up shared memory on failure */
1132 {
1133 cycleid = _bt_start_vacuum(rel);
1134
1135 btvacuumscan(info, stats, callback, callback_state, cycleid);
1136 }
1138 _bt_end_vacuum(rel);
1139
1140 return stats;
1141}
#define PG_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition ipc.h:47
#define PG_END_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition ipc.h:52
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid)
Definition nbtree.c:1237
void _bt_end_vacuum_callback(int code, Datum arg)
Definition nbtutils.c:558
BTCycleId _bt_start_vacuum(Relation rel)
Definition nbtutils.c:473
static Datum PointerGetDatum(const void *X)
Definition postgres.h:342
Relation index
Definition genam.h:54
static void callback(struct sockaddr *addr, struct sockaddr *mask, void *unused)

References _bt_end_vacuum(), _bt_end_vacuum_callback(), _bt_start_vacuum(), btvacuumscan(), callback(), fb(), IndexVacuumInfo::index, palloc0_object, PG_END_ENSURE_ERROR_CLEANUP, PG_ENSURE_ERROR_CLEANUP, and PointerGetDatum().

Referenced by bthandler().

◆ btcanreturn()

bool btcanreturn ( Relation  index,
int  attno 
)
extern

Definition at line 1799 of file nbtree.c.

1800{
1801 return true;
1802}

Referenced by bthandler().

◆ btendscan()

void btendscan ( IndexScanDesc  scan)
extern

Definition at line 455 of file nbtree.c.

456{
458
459 /* we aren't holding any read locks, but gotta drop the pins */
460 if (BTScanPosIsValid(so->currPos))
461 {
462 /* Before leaving current page, deal with any killed items */
463 if (so->numKilled > 0)
464 _bt_killitems(scan);
465 BTScanPosUnpinIfPinned(so->currPos);
466 }
467
468 so->markItemIndex = -1;
469 BTScanPosUnpinIfPinned(so->markPos);
470
471 /* No need to invalidate positions, the RAM is about to be freed. */
472
473 /* Release storage */
474 if (so->keyData != NULL)
475 pfree(so->keyData);
476 /* so->arrayKeys and so->orderProcs are in arrayContext */
477 if (so->arrayContext != NULL)
478 MemoryContextDelete(so->arrayContext);
479 if (so->killedItems != NULL)
480 pfree(so->killedItems);
481 if (so->currTuples != NULL)
482 pfree(so->currTuples);
483 /* so->markTuples should not be pfree'd, see btrescan */
484 pfree(so);
485}
void MemoryContextDelete(MemoryContext context)
Definition mcxt.c:472
#define BTScanPosUnpinIfPinned(scanpos)
Definition nbtree.h:1015
void _bt_killitems(IndexScanDesc scan)
Definition nbtutils.c:191

References _bt_killitems(), BTScanPosIsValid, BTScanPosUnpinIfPinned, fb(), MemoryContextDelete(), IndexScanDescData::opaque, and pfree().

Referenced by bthandler().

◆ btestimateparallelscan()

Size btestimateparallelscan ( Relation  rel,
int  nkeys,
int  norderbys 
)
extern

Definition at line 575 of file nbtree.c.

576{
580
581 /*
582 * Pessimistically assume that every input scan key will be output with
583 * its own SAOP array
584 */
586 sizeof(int) * nkeys;
587
588 /* Single column indexes cannot possibly use a skip array */
589 if (nkeyatts == 1)
590 return estnbtreeshared;
591
592 /*
593 * Pessimistically assume that all attributes prior to the least
594 * significant attribute require a skip array (and an associated key)
595 */
596 genericattrspace = datumEstimateSpace((Datum) 0, false, true,
597 sizeof(Datum));
598 for (int attnum = 1; attnum < nkeyatts; attnum++)
599 {
600 CompactAttribute *attr;
601
602 /*
603 * We make the conservative assumption that every index column will
604 * also require a skip array.
605 *
606 * Every skip array must have space to store its scan key's sk_flags.
607 */
609
610 /* Consider space required to store a datum of opclass input type */
611 attr = TupleDescCompactAttr(rel->rd_att, attnum - 1);
612 if (attr->attbyval)
613 {
614 /* This index attribute stores pass-by-value datums */
616 true, attr->attlen);
617
619 continue;
620 }
621
622 /*
623 * This index attribute stores pass-by-reference datums.
624 *
625 * Assume that serializing this array will use just as much space as a
626 * pass-by-value datum, in addition to space for the largest possible
627 * whole index tuple (this is not just a per-datum portion of the
628 * largest possible tuple because that'd be almost as large anyway).
629 *
630 * This is quite conservative, but it's not clear how we could do much
631 * better. The executor requires an up-front storage request size
632 * that reliably covers the scan's high watermark memory usage. We
633 * can't be sure of the real high watermark until the scan is over.
634 */
637 }
638
639 return estnbtreeshared;
640}
Size datumEstimateSpace(Datum value, bool isnull, bool typByVal, int typLen)
Definition datum.c:450
Size add_size(Size s1, Size s2)
Definition shmem.c:1048
TupleDesc rd_att
Definition rel.h:112

References add_size(), CompactAttribute::attbyval, CompactAttribute::attlen, attnum, BTMaxItemSize, BTParallelScanDescData::btps_arrElems, datumEstimateSpace(), fb(), IndexRelationGetNumberOfKeyAttributes, RelationData::rd_att, and TupleDescCompactAttr().

Referenced by bthandler().

◆ btgetbitmap()

int64 btgetbitmap ( IndexScanDesc  scan,
TIDBitmap tbm 
)
extern

Definition at line 291 of file nbtree.c.

292{
294 int64 ntids = 0;
295 ItemPointer heapTid;
296
297 Assert(scan->heapRelation == NULL);
298
299 /* Each loop iteration performs another primitive index scan */
300 do
301 {
302 /* Fetch the first page & tuple */
304 {
305 /* Save tuple ID, and continue scanning */
306 heapTid = &scan->xs_heaptid;
307 tbm_add_tuples(tbm, heapTid, 1, false);
308 ntids++;
309
310 for (;;)
311 {
312 /*
313 * Advance to next tuple within page. This is the same as the
314 * easy case in _bt_next().
315 */
316 if (++so->currPos.itemIndex > so->currPos.lastItem)
317 {
318 /* let _bt_next do the heavy lifting */
319 if (!_bt_next(scan, ForwardScanDirection))
320 break;
321 }
322
323 /* Save tuple ID, and continue scanning */
324 heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
325 tbm_add_tuples(tbm, heapTid, 1, false);
326 ntids++;
327 }
328 }
329 /* Now see if we need another primitive index scan */
330 } while (so->numArrayKeys && _bt_start_prim_scan(scan));
331
332 return ntids;
333}
int64_t int64
Definition c.h:621
static bool _bt_start_prim_scan(IndexScanDesc scan)
Definition nbtree.c:653
bool _bt_first(IndexScanDesc scan, ScanDirection dir)
Definition nbtsearch.c:883
bool _bt_next(IndexScanDesc scan, ScanDirection dir)
Definition nbtsearch.c:1586
@ ForwardScanDirection
Definition sdir.h:28
ItemPointerData xs_heaptid
Definition relscan.h:185
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointerData *tids, int ntids, bool recheck)
Definition tidbitmap.c:367

References _bt_first(), _bt_next(), _bt_start_prim_scan(), Assert, fb(), ForwardScanDirection, IndexScanDescData::heapRelation, IndexScanDescData::opaque, tbm_add_tuples(), and IndexScanDescData::xs_heaptid.

Referenced by bthandler().

◆ btgettreeheight()

int btgettreeheight ( Relation  rel)
extern

Definition at line 1808 of file nbtree.c.

1809{
1810 return _bt_getrootheight(rel);
1811}
int _bt_getrootheight(Relation rel)
Definition nbtpage.c:680

References _bt_getrootheight().

Referenced by bthandler().

◆ btgettuple()

bool btgettuple ( IndexScanDesc  scan,
ScanDirection  dir 
)
extern

Definition at line 230 of file nbtree.c.

231{
233 bool res;
234
235 Assert(scan->heapRelation != NULL);
236
237 /* btree indexes are never lossy */
238 scan->xs_recheck = false;
239
240 /* Each loop iteration performs another primitive index scan */
241 do
242 {
243 /*
244 * If we've already initialized this scan, we can just advance it in
245 * the appropriate direction. If we haven't done so yet, we call
246 * _bt_first() to get the first item in the scan.
247 */
248 if (!BTScanPosIsValid(so->currPos))
249 res = _bt_first(scan, dir);
250 else
251 {
252 /*
253 * Check to see if we should kill the previously-fetched tuple.
254 */
255 if (scan->kill_prior_tuple)
256 {
257 /*
258 * Yes, remember it for later. (We'll deal with all such
259 * tuples at once right before leaving the index page.) The
260 * test for numKilled overrun is not just paranoia: if the
261 * caller reverses direction in the indexscan then the same
262 * item might get entered multiple times. It's not worth
263 * trying to optimize that, so we don't detect it, but instead
264 * just forget any excess entries.
265 */
266 if (so->killedItems == NULL)
267 so->killedItems = palloc_array(int, MaxTIDsPerBTreePage);
268 if (so->numKilled < MaxTIDsPerBTreePage)
269 so->killedItems[so->numKilled++] = so->currPos.itemIndex;
270 }
271
272 /*
273 * Now continue the scan.
274 */
275 res = _bt_next(scan, dir);
276 }
277
278 /* If we have a tuple, return it ... */
279 if (res)
280 break;
281 /* ... otherwise see if we need another primitive index scan */
282 } while (so->numArrayKeys && _bt_start_prim_scan(scan));
283
284 return res;
285}
bool kill_prior_tuple
Definition relscan.h:160

References _bt_first(), _bt_next(), _bt_start_prim_scan(), Assert, BTScanPosIsValid, fb(), IndexScanDescData::heapRelation, IndexScanDescData::kill_prior_tuple, MaxTIDsPerBTreePage, IndexScanDescData::opaque, palloc_array, and IndexScanDescData::xs_recheck.

Referenced by bthandler().

◆ btinitparallelscan()

void btinitparallelscan ( void target)
extern

Definition at line 811 of file nbtree.c.

812{
814
815 LWLockInitialize(&bt_target->btps_lock,
817 bt_target->btps_nextScanPage = InvalidBlockNumber;
818 bt_target->btps_lastCurrPage = InvalidBlockNumber;
819 bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
821}
void ConditionVariableInit(ConditionVariable *cv)
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition lwlock.c:670
@ BTPARALLEL_NOT_INITIALIZED
Definition nbtree.c:58

References BTPARALLEL_NOT_INITIALIZED, ConditionVariableInit(), fb(), InvalidBlockNumber, and LWLockInitialize().

Referenced by bthandler().

◆ btinsert()

bool btinsert ( Relation  rel,
Datum values,
bool isnull,
ItemPointer  ht_ctid,
Relation  heapRel,
IndexUniqueCheck  checkUnique,
bool  indexUnchanged,
struct IndexInfo indexInfo 
)
extern

Definition at line 206 of file nbtree.c.

211{
212 bool result;
213 IndexTuple itup;
214
215 /* generate an index tuple */
216 itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
217 itup->t_tid = *ht_ctid;
218
219 result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
220
221 pfree(itup);
222
223 return result;
224}
static Datum values[MAXATTR]
Definition bootstrap.c:190
IndexTuple index_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
Definition indextuple.c:44
bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
Definition nbtinsert.c:105

References _bt_doinsert(), fb(), index_form_tuple(), pfree(), RelationGetDescr, result, IndexTupleData::t_tid, and values.

Referenced by bthandler().

◆ btmarkpos()

void btmarkpos ( IndexScanDesc  scan)
extern

Definition at line 491 of file nbtree.c.

492{
494
495 /* There may be an old mark with a pin (but no lock). */
496 BTScanPosUnpinIfPinned(so->markPos);
497
498 /*
499 * Just record the current itemIndex. If we later step to next page
500 * before releasing the marked position, _bt_steppage makes a full copy of
501 * the currPos struct in markPos. If (as often happens) the mark is moved
502 * before we leave the page, we don't have to do that work.
503 */
504 if (BTScanPosIsValid(so->currPos))
505 so->markItemIndex = so->currPos.itemIndex;
506 else
507 {
508 BTScanPosInvalidate(so->markPos);
509 so->markItemIndex = -1;
510 }
511}

References BTScanPosInvalidate, BTScanPosIsValid, BTScanPosUnpinIfPinned, fb(), and IndexScanDescData::opaque.

Referenced by bthandler().

◆ btoptions()

bytea * btoptions ( Datum  reloptions,
bool  validate 
)
extern

Definition at line 598 of file nbtutils.c.

599{
600 static const relopt_parse_elt tab[] = {
601 {"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)},
602 {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
603 offsetof(BTOptions, vacuum_cleanup_index_scale_factor)},
604 {"deduplicate_items", RELOPT_TYPE_BOOL,
605 offsetof(BTOptions, deduplicate_items)}
606 };
607
608 return (bytea *) build_reloptions(reloptions, validate,
610 sizeof(BTOptions),
611 tab, lengthof(tab));
612}
static bool validate(Port *port, const char *auth, const char **logdetail)
Definition auth-oauth.c:672
#define lengthof(array)
Definition c.h:873
static int fillfactor
Definition pgbench.c:188
void * build_reloptions(Datum reloptions, bool validate, relopt_kind kind, Size relopt_struct_size, const relopt_parse_elt *relopt_elems, int num_relopt_elems)
@ RELOPT_KIND_BTREE
Definition reloptions.h:44
@ RELOPT_TYPE_INT
Definition reloptions.h:32
@ RELOPT_TYPE_BOOL
Definition reloptions.h:30
@ RELOPT_TYPE_REAL
Definition reloptions.h:33
Definition c.h:776

References build_reloptions(), fb(), fillfactor, lengthof, RELOPT_KIND_BTREE, RELOPT_TYPE_BOOL, RELOPT_TYPE_INT, RELOPT_TYPE_REAL, and validate().

Referenced by bthandler().

◆ BTPageGetDeleteXid()

static FullTransactionId BTPageGetDeleteXid ( Page  page)
inlinestatic

Definition at line 261 of file nbtree.h.

262{
263 BTPageOpaque opaque;
264 BTDeletedPageData *contents;
265
266 /* We only expect to be called with a deleted page */
267 Assert(!PageIsNew(page));
268 opaque = BTPageGetOpaque(page);
269 Assert(P_ISDELETED(opaque));
270
271 /* pg_upgrade'd deleted page -- must be safe to recycle now */
272 if (!P_HAS_FULLXID(opaque))
274
275 /* Get safexid from deleted page */
276 contents = ((BTDeletedPageData *) PageGetContents(page));
277 return contents->safexid;
278}
static char * PageGetContents(Page page)
Definition bufpage.h:282
#define P_HAS_FULLXID(opaque)
Definition nbtree.h:229
FullTransactionId safexid
Definition nbtree.h:236
#define FirstNormalFullTransactionId
Definition transam.h:57

References Assert, BTPageGetOpaque, FirstNormalFullTransactionId, P_HAS_FULLXID, P_ISDELETED, PageGetContents(), PageIsNew(), and BTDeletedPageData::safexid.

Referenced by _bt_allocbuf(), BTPageIsRecyclable(), and GetBTPageStatistics().

◆ BTPageIsRecyclable()

static bool BTPageIsRecyclable ( Page  page,
Relation  heaprel 
)
inlinestatic

Definition at line 292 of file nbtree.h.

293{
294 BTPageOpaque opaque;
295
296 Assert(!PageIsNew(page));
297 Assert(heaprel != NULL);
298
299 /* Recycling okay iff page is deleted and safexid is old enough */
300 opaque = BTPageGetOpaque(page);
301 if (P_ISDELETED(opaque))
302 {
303 FullTransactionId safexid = BTPageGetDeleteXid(page);
304
305 /*
306 * The page was deleted, but when? If it was just deleted, a scan
307 * might have seen the downlink to it, and will read the page later.
308 * As long as that can happen, we must keep the deleted page around as
309 * a tombstone.
310 *
311 * For that check if the deletion XID could still be visible to
312 * anyone. If not, then no scan that's still in progress could have
313 * seen its downlink, and we can recycle it.
314 */
315 return GlobalVisCheckRemovableFullXid(heaprel, safexid);
316 }
317
318 return false;
319}

References Assert, BTPageGetDeleteXid(), BTPageGetOpaque, fb(), GlobalVisCheckRemovableFullXid(), P_ISDELETED, and PageIsNew().

Referenced by _bt_allocbuf(), and btvacuumpage().

◆ BTPageSetDeleted()

static void BTPageSetDeleted ( Page  page,
FullTransactionId  safexid 
)
inlinestatic

Definition at line 240 of file nbtree.h.

241{
242 BTPageOpaque opaque;
243 PageHeader header;
244 BTDeletedPageData *contents;
245
246 opaque = BTPageGetOpaque(page);
247 header = ((PageHeader) page);
248
249 opaque->btpo_flags &= ~BTP_HALF_DEAD;
250 opaque->btpo_flags |= BTP_DELETED | BTP_HAS_FULLXID;
251 header->pd_lower = MAXALIGN(SizeOfPageHeaderData) +
252 sizeof(BTDeletedPageData);
253 header->pd_upper = header->pd_special;
254
255 /* Set safexid in deleted page */
256 contents = ((BTDeletedPageData *) PageGetContents(page));
257 contents->safexid = safexid;
258}
#define BTP_HAS_FULLXID
Definition nbtree.h:85
#define BTP_DELETED
Definition nbtree.h:79
LocationIndex pd_special
Definition bufpage.h:193
LocationIndex pd_upper
Definition bufpage.h:192
LocationIndex pd_lower
Definition bufpage.h:191

References BTP_DELETED, BTP_HAS_FULLXID, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, fb(), MAXALIGN, PageGetContents(), PageHeaderData::pd_lower, PageHeaderData::pd_special, PageHeaderData::pd_upper, BTDeletedPageData::safexid, and SizeOfPageHeaderData.

Referenced by _bt_unlink_halfdead_page(), and btree_xlog_unlink_page().

◆ btparallelrescan()

void btparallelrescan ( IndexScanDesc  scan)
extern

Definition at line 827 of file nbtree.c.

828{
829 BTParallelScanDesc btscan;
830 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
831
832 Assert(parallel_scan);
833
834 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
835 parallel_scan->ps_offset_am);
836
837 /*
838 * In theory, we don't need to acquire the LWLock here, because there
839 * shouldn't be any other workers running at this point, but we do so for
840 * consistency.
841 */
842 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
843 btscan->btps_nextScanPage = InvalidBlockNumber;
844 btscan->btps_lastCurrPage = InvalidBlockNumber;
845 btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
846 LWLockRelease(&btscan->btps_lock);
847}

References Assert, BTPARALLEL_NOT_INITIALIZED, fb(), InvalidBlockNumber, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), OffsetToPointer, IndexScanDescData::parallel_scan, and ParallelIndexScanDescData::ps_offset_am.

Referenced by bthandler().

◆ btproperty()

bool btproperty ( Oid  index_oid,
int  attno,
IndexAMProperty  prop,
const char *  propname,
bool *  res,
bool *  isnull 
)
extern

Definition at line 621 of file nbtutils.c.

624{
625 switch (prop)
626 {
627 case AMPROP_RETURNABLE:
628 /* answer only for columns, not AM or whole index */
629 if (attno == 0)
630 return false;
631 /* otherwise, btree can always return data */
632 *res = true;
633 return true;
634
635 default:
636 return false; /* punt to generic code */
637 }
638}
@ AMPROP_RETURNABLE
Definition amapi.h:47

References AMPROP_RETURNABLE.

Referenced by bthandler().

◆ BTreeTupleGetDownLink()

static BlockNumber BTreeTupleGetDownLink ( IndexTuple  pivot)
inlinestatic

◆ BTreeTupleGetHeapTID()

static ItemPointer BTreeTupleGetHeapTID ( IndexTuple  itup)
inlinestatic

◆ BTreeTupleGetMaxHeapTID()

static ItemPointer BTreeTupleGetMaxHeapTID ( IndexTuple  itup)
inlinestatic

◆ BTreeTupleGetNPosting()

◆ BTreeTupleGetPosting()

◆ BTreeTupleGetPostingN()

◆ BTreeTupleGetPostingOffset()

◆ BTreeTupleGetTopParent()

static BlockNumber BTreeTupleGetTopParent ( IndexTuple  leafhikey)
inlinestatic

Definition at line 621 of file nbtree.h.

622{
623 return ItemPointerGetBlockNumberNoCheck(&leafhikey->t_tid);
624}

References fb(), and ItemPointerGetBlockNumberNoCheck().

Referenced by _bt_unlink_halfdead_page(), and bt_downlink_missing_check().

◆ BTreeTupleIsPivot()

◆ BTreeTupleIsPosting()

◆ BTreeTupleSetDownLink()

static void BTreeTupleSetDownLink ( IndexTuple  pivot,
BlockNumber  blkno 
)
inlinestatic

Definition at line 563 of file nbtree.h.

564{
565 ItemPointerSetBlockNumber(&pivot->t_tid, blkno);
566}
static void ItemPointerSetBlockNumber(ItemPointerData *pointer, BlockNumber blockNumber)
Definition itemptr.h:147

References fb(), and ItemPointerSetBlockNumber().

Referenced by _bt_buildadd(), _bt_insert_parent(), _bt_mark_page_halfdead(), _bt_newlevel(), _bt_uppershutdown(), and btree_xlog_mark_page_halfdead().

◆ BTreeTupleSetNAtts()

static void BTreeTupleSetNAtts ( IndexTuple  itup,
uint16  nkeyatts,
bool  heaptid 
)
inlinestatic

◆ BTreeTupleSetPosting()

◆ BTreeTupleSetTopParent()

static void BTreeTupleSetTopParent ( IndexTuple  leafhikey,
BlockNumber  blkno 
)
inlinestatic

◆ btrescan()

void btrescan ( IndexScanDesc  scan,
ScanKey  scankey,
int  nscankeys,
ScanKey  orderbys,
int  norderbys 
)
extern

Definition at line 388 of file nbtree.c.

390{
391 BTScanOpaque so = (BTScanOpaque) scan->opaque;
392
393 /* we aren't holding any read locks, but gotta drop the pins */
394 if (BTScanPosIsValid(so->currPos))
395 {
396 /* Before leaving current page, deal with any killed items */
397 if (so->numKilled > 0)
398 _bt_killitems(scan);
399 BTScanPosUnpinIfPinned(so->currPos);
400 BTScanPosInvalidate(so->currPos);
401 }
402
403 /*
404 * We prefer to eagerly drop leaf page pins before btgettuple returns.
405 * This avoids making VACUUM wait to acquire a cleanup lock on the page.
406 *
407 * We cannot safely drop leaf page pins during index-only scans due to a
408 * race condition involving VACUUM setting pages all-visible in the VM.
409 * It's also unsafe for plain index scans that use a non-MVCC snapshot.
410 *
411 * Also opt out of dropping leaf page pins eagerly during bitmap scans.
412 * Pins cannot be held for more than an instant during bitmap scans either
413 * way, so we might as well avoid wasting cycles on acquiring page LSNs.
414 *
415 * See nbtree/README section on making concurrent TID recycling safe.
416 *
417 * Note: so->dropPin should never change across rescans.
418 */
419 so->dropPin = (!scan->xs_want_itup &&
420 IsMVCCLikeSnapshot(scan->xs_snapshot) &&
421 scan->heapRelation != NULL);
422
423 so->markItemIndex = -1;
424 so->needPrimScan = false;
425 so->scanBehind = false;
426 so->oppositeDirCheck = false;
427 BTScanPosUnpinIfPinned(so->markPos);
428 BTScanPosInvalidate(so->markPos);
429
430 /*
431 * Allocate tuple workspace arrays, if needed for an index-only scan and
432 * not already done in a previous rescan call. To save on palloc
433 * overhead, both workspaces are allocated as one palloc block; only this
434 * function and btendscan know that.
435 */
436 if (scan->xs_want_itup && so->currTuples == NULL)
437 {
438 so->currTuples = (char *) palloc(BLCKSZ * 2);
439 so->markTuples = so->currTuples + BLCKSZ;
440 }
441
442 /*
443 * Reset the scan keys
444 */
445 if (scankey && scan->numberOfKeys > 0)
446 memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
447 so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
448 so->numArrayKeys = 0; /* ditto */
449}
#define IsMVCCLikeSnapshot(snapshot)
Definition snapmgr.h:74

References _bt_killitems(), BTScanPosInvalidate, BTScanPosIsValid, BTScanPosUnpinIfPinned, fb(), IndexScanDescData::heapRelation, IsMVCCLikeSnapshot, IndexScanDescData::keyData, memcpy(), IndexScanDescData::numberOfKeys, IndexScanDescData::opaque, palloc(), IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by bthandler().

◆ btrestrpos()

void btrestrpos ( IndexScanDesc  scan)
extern

Definition at line 517 of file nbtree.c.

518{
519 BTScanOpaque so = (BTScanOpaque) scan->opaque;
520
521 if (so->markItemIndex >= 0)
522 {
523 /*
524 * The scan has never moved to a new page since the last mark. Just
525 * restore the itemIndex.
526 *
527 * NB: In this case we can't count on anything in so->markPos to be
528 * accurate.
529 */
530 so->currPos.itemIndex = so->markItemIndex;
531 }
532 else
533 {
534 /*
535 * The scan moved to a new page after last mark or restore, and we are
536 * now restoring to the marked page. We aren't holding any read
537 * locks, but if we're still holding the pin for the current position,
538 * we must drop it.
539 */
540 if (BTScanPosIsValid(so->currPos))
541 {
542 /* Before leaving current page, deal with any killed items */
543 if (so->numKilled > 0)
544 _bt_killitems(scan);
545 BTScanPosUnpinIfPinned(so->currPos);
546 }
547
548 if (BTScanPosIsValid(so->markPos))
549 {
550 /* bump pin on mark buffer for assignment to current buffer */
551 if (BTScanPosIsPinned(so->markPos))
552 IncrBufferRefCount(so->markPos.buf);
553 memcpy(&so->currPos, &so->markPos,
554 offsetof(BTScanPosData, items[1]) +
555 so->markPos.lastItem * sizeof(BTScanPosItem));
556 if (so->currTuples)
557 memcpy(so->currTuples, so->markTuples,
558 so->markPos.nextTupleOffset);
559 /* Reset the scan's array keys (see _bt_steppage for why) */
560 if (so->numArrayKeys)
561 {
562 _bt_start_array_keys(scan, so->currPos.dir);
563 so->needPrimScan = false;
564 }
565 }
566 else
567 BTScanPosInvalidate(so->currPos);
568 }
569}
void IncrBufferRefCount(Buffer buffer)
Definition bufmgr.c:5670
static ItemArray items

References _bt_killitems(), _bt_start_array_keys(), BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosIsValid, BTScanPosUnpinIfPinned, fb(), IncrBufferRefCount(), items, memcpy(), and IndexScanDescData::opaque.

Referenced by bthandler().

◆ bttranslatecmptype()

StrategyNumber bttranslatecmptype ( CompareType  cmptype,
Oid  opfamily 
)
extern

Definition at line 1834 of file nbtree.c.

1835{
1836 switch (cmptype)
1837 {
1838 case COMPARE_LT:
1839 return BTLessStrategyNumber;
1840 case COMPARE_LE:
1841 return BTLessEqualStrategyNumber;
1842 case COMPARE_EQ:
1843 return BTEqualStrategyNumber;
1844 case COMPARE_GE:
1845 return BTGreaterEqualStrategyNumber;
1846 case COMPARE_GT:
1847 return BTGreaterStrategyNumber;
1848 default:
1849 return InvalidStrategy;
1850 }
1851}
@ COMPARE_LE
Definition cmptype.h:35
@ COMPARE_GT
Definition cmptype.h:38
@ COMPARE_EQ
Definition cmptype.h:36
@ COMPARE_GE
Definition cmptype.h:37
@ COMPARE_LT
Definition cmptype.h:34

References BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, COMPARE_EQ, COMPARE_GE, COMPARE_GT, COMPARE_LE, COMPARE_LT, and InvalidStrategy.

Referenced by bthandler().

◆ bttranslatestrategy()

CompareType bttranslatestrategy ( StrategyNumber  strategy,
Oid  opfamily 
)
extern

Definition at line 1814 of file nbtree.c.

1815{
1816 switch (strategy)
1817 {
1818 case BTLessStrategyNumber:
1819 return COMPARE_LT;
1820 case BTLessEqualStrategyNumber:
1821 return COMPARE_LE;
1822 case BTEqualStrategyNumber:
1823 return COMPARE_EQ;
1824 case BTGreaterEqualStrategyNumber:
1825 return COMPARE_GE;
1826 case BTGreaterStrategyNumber:
1827 return COMPARE_GT;
1828 default:
1829 return COMPARE_INVALID;
1830 }
1831}
@ COMPARE_INVALID
Definition cmptype.h:33

References BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, COMPARE_EQ, COMPARE_GE, COMPARE_GT, COMPARE_INVALID, COMPARE_LE, and COMPARE_LT.

Referenced by bthandler().

◆ btvacuumcleanup()

IndexBulkDeleteResult * btvacuumcleanup ( IndexVacuumInfo info,
IndexBulkDeleteResult stats 
)
extern

Definition at line 1149 of file nbtree.c.

1150{
1151 BlockNumber num_delpages;
1152
1153 /* No-op in ANALYZE ONLY mode */
1154 if (info->analyze_only)
1155 return stats;
1156
1157 /*
1158 * If btbulkdelete was called, we need not do anything (we just maintain
1159 * the information used within _bt_vacuum_needs_cleanup() by calling
1160 * _bt_set_cleanup_info() below).
1161 *
1162 * If btbulkdelete was _not_ called, then we have a choice to make: we
1163 * must decide whether or not a btvacuumscan() call is needed now (i.e.
1164 * whether the ongoing VACUUM operation can entirely avoid a physical scan
1165 * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
1166 * now.
1167 */
1168 if (stats == NULL)
1169 {
1170 /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
1171 if (!_bt_vacuum_needs_cleanup(info->index))
1172 return NULL;
1173
1174 /*
1175 * Since we aren't going to actually delete any leaf items, there's no
1176 * need to go through all the vacuum-cycle-ID pushups here.
1177 *
1178 * Posting list tuples are a source of inaccuracy for cleanup-only
1179 * scans. btvacuumscan() will assume that the number of index tuples
1180 * from each page can be used as num_index_tuples, even though
1181 * num_index_tuples is supposed to represent the number of TIDs in the
1182 * index. This naive approach can underestimate the number of tuples
1183 * in the index significantly.
1184 *
1185 * We handle the problem by making num_index_tuples an estimate in
1186 * cleanup-only case.
1187 */
1188 stats = palloc0_object(IndexBulkDeleteResult);
1189 btvacuumscan(info, stats, NULL, NULL, 0);
1190 stats->estimated_count = true;
1191 }
1192
1193 /*
1194 * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
1195 *
1196 * num_delpages is the number of deleted pages now in the index that were
1197 * not safe to place in the FSM to be recycled just yet. num_delpages is
1198 * greater than 0 only when _bt_pagedel() actually deleted pages during
1199 * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
1200 * have failed to place any newly deleted pages in the FSM just moments
1201 * ago. (Actually, there are edge cases where recycling of the current
1202 * VACUUM's newly deleted pages does not even become safe by the time the
1203 * next VACUUM comes around. See nbtree/README.)
1204 */
1205 Assert(stats->pages_deleted >= stats->pages_free);
1206 num_delpages = stats->pages_deleted - stats->pages_free;
1207 _bt_set_cleanup_info(info->index, num_delpages);
1208
1209 /*
1210 * It's quite possible for us to be fooled by concurrent page splits into
1211 * double-counting some index tuples, so disbelieve any total that exceeds
1212 * the underlying heap's count ... if we know that accurately. Otherwise
1213 * this might just make matters worse.
1214 */
1215 if (!info->estimated_count)
1216 {
1217 if (stats->num_index_tuples > info->num_heap_tuples)
1218 stats->num_index_tuples = info->num_heap_tuples;
1219 }
1220
1221 return stats;
1222}
void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
Definition nbtpage.c:233
bool _bt_vacuum_needs_cleanup(Relation rel)
Definition nbtpage.c:180
BlockNumber pages_deleted
Definition genam.h:90
double num_index_tuples
Definition genam.h:87
double num_heap_tuples
Definition genam.h:60
bool analyze_only
Definition genam.h:56
bool estimated_count
Definition genam.h:58

References _bt_set_cleanup_info(), _bt_vacuum_needs_cleanup(), IndexVacuumInfo::analyze_only, Assert, btvacuumscan(), IndexVacuumInfo::estimated_count, IndexBulkDeleteResult::estimated_count, fb(), IndexVacuumInfo::index, IndexVacuumInfo::num_heap_tuples, IndexBulkDeleteResult::num_index_tuples, IndexBulkDeleteResult::pages_deleted, IndexBulkDeleteResult::pages_free, and palloc0_object.

Referenced by bthandler().

◆ btvalidate()

bool btvalidate ( Oid  opclassoid)
extern

Definition at line 40 of file nbtvalidate.c.

41{
42 bool result = true;
46 Oid opcintype;
47 char *opclassname;
48 char *opfamilyname;
50 *oprlist;
54 int usefulgroups;
55 int i;
56 ListCell *lc;
57
58 /* Fetch opclass information */
61 elog(ERROR, "cache lookup failed for operator class %u", opclassoid);
63
64 opfamilyoid = classform->opcfamily;
65 opcintype = classform->opcintype;
66 opclassname = NameStr(classform->opcname);
67
68 /* Fetch opfamily information */
69 opfamilyname = get_opfamily_name(opfamilyoid, false);
70
71 /* Fetch all operators and support functions of the opfamily */
74
75 /* Check individual support functions */
76 for (i = 0; i < proclist->n_members; i++)
77 {
78 HeapTuple proctup = &proclist->members[i]->tuple;
80 bool ok;
81
82 /* Check procedure numbers and function signatures */
83 switch (procform->amprocnum)
84 {
85 case BTORDER_PROC:
87 2, 2, procform->amproclefttype,
88 procform->amprocrighttype);
89 break;
92 1, 1, INTERNALOID);
93 break;
94 case BTINRANGE_PROC:
96 5, 5,
97 procform->amproclefttype,
98 procform->amproclefttype,
99 procform->amprocrighttype,
101 break;
103 ok = check_amproc_signature(procform->amproc, BOOLOID, true,
104 1, 1, OIDOID);
105 break;
106 case BTOPTIONS_PROC:
108 break;
110 ok = check_amproc_signature(procform->amproc, VOIDOID, true,
111 1, 1, INTERNALOID);
112 break;
113 default:
116 errmsg("operator family \"%s\" of access method %s contains function %s with invalid support number %d",
117 opfamilyname, "btree",
118 format_procedure(procform->amproc),
119 procform->amprocnum)));
120 result = false;
121 continue; /* don't want additional message */
122 }
123
124 if (!ok)
125 {
128 errmsg("operator family \"%s\" of access method %s contains function %s with wrong signature for support number %d",
129 opfamilyname, "btree",
130 format_procedure(procform->amproc),
131 procform->amprocnum)));
132 result = false;
133 }
134 }
135
136 /* Check individual operators */
137 for (i = 0; i < oprlist->n_members; i++)
138 {
139 HeapTuple oprtup = &oprlist->members[i]->tuple;
141
142 /* Check that only allowed strategy numbers exist */
143 if (oprform->amopstrategy < 1 ||
144 oprform->amopstrategy > BTMaxStrategyNumber)
145 {
148 errmsg("operator family \"%s\" of access method %s contains operator %s with invalid strategy number %d",
149 opfamilyname, "btree",
150 format_operator(oprform->amopopr),
151 oprform->amopstrategy)));
152 result = false;
153 }
154
155 /* btree doesn't support ORDER BY operators */
156 if (oprform->amoppurpose != AMOP_SEARCH ||
157 OidIsValid(oprform->amopsortfamily))
158 {
161 errmsg("operator family \"%s\" of access method %s contains invalid ORDER BY specification for operator %s",
162 opfamilyname, "btree",
163 format_operator(oprform->amopopr))));
164 result = false;
165 }
166
167 /* Check operator signature --- same for all btree strategies */
168 if (!check_amop_signature(oprform->amopopr, BOOLOID,
169 oprform->amoplefttype,
170 oprform->amoprighttype))
171 {
174 errmsg("operator family \"%s\" of access method %s contains operator %s with wrong signature",
175 opfamilyname, "btree",
176 format_operator(oprform->amopopr))));
177 result = false;
178 }
179 }
180
181 /* Now check for inconsistent groups of operators/functions */
183 usefulgroups = 0;
186 foreach(lc, grouplist)
187 {
189
190 /*
191 * It is possible for an in_range support function to have a RHS type
192 * that is otherwise irrelevant to the opfamily --- for instance, SQL
193 * requires the datetime_ops opclass to have range support with an
194 * interval offset. So, if this group appears to contain only an
195 * in_range function, ignore it: it doesn't represent a pair of
196 * supported types.
197 */
198 if (thisgroup->operatorset == 0 &&
199 thisgroup->functionset == (1 << BTINRANGE_PROC))
200 continue;
201
202 /* Else count it as a relevant group */
203 usefulgroups++;
204
205 /* Remember the group exactly matching the test opclass */
206 if (thisgroup->lefttype == opcintype &&
207 thisgroup->righttype == opcintype)
209
210 /*
211 * Identify all distinct data types handled in this opfamily. This
212 * implementation is O(N^2), but there aren't likely to be enough
213 * types in the family for it to matter.
214 */
217
218 /*
219 * Complain if there seems to be an incomplete set of either operators
220 * or support functions for this datatype pair. The sortsupport,
221 * in_range, and equalimage functions are considered optional.
222 */
223 if (thisgroup->operatorset !=
224 ((1 << BTLessStrategyNumber) |
226 (1 << BTEqualStrategyNumber) |
229 {
232 errmsg("operator family \"%s\" of access method %s is missing operator(s) for types %s and %s",
233 opfamilyname, "btree",
234 format_type_be(thisgroup->lefttype),
235 format_type_be(thisgroup->righttype))));
236 result = false;
237 }
238 if ((thisgroup->functionset & (1 << BTORDER_PROC)) == 0)
239 {
242 errmsg("operator family \"%s\" of access method %s is missing support function for types %s and %s",
243 opfamilyname, "btree",
244 format_type_be(thisgroup->lefttype),
245 format_type_be(thisgroup->righttype))));
246 result = false;
247 }
248 }
249
250 /* Check that the originally-named opclass is supported */
251 /* (if group is there, we already checked it adequately above) */
252 if (!opclassgroup)
253 {
256 errmsg("operator class \"%s\" of access method %s is missing operator(s)",
257 opclassname, "btree")));
258 result = false;
259 }
260
261 /*
262 * Complain if the opfamily doesn't have entries for all possible
263 * combinations of its supported datatypes. While missing cross-type
264 * operators are not fatal, they do limit the planner's ability to derive
265 * additional qual clauses from equivalence classes, so it seems
266 * reasonable to insist that all built-in btree opfamilies be complete.
267 */
269 {
272 errmsg("operator family \"%s\" of access method %s is missing cross-type operator(s)",
273 opfamilyname, "btree")));
274 result = false;
275 }
276
280
281 return result;
282}
bool check_amproc_signature(Oid funcid, Oid restype, bool exact, int minargs, int maxargs,...)
Definition amvalidate.c:152
bool check_amop_signature(Oid opno, Oid restype, Oid lefttype, Oid righttype)
Definition amvalidate.c:206
List * identify_opfamily_groups(CatCList *oprlist, CatCList *proclist)
Definition amvalidate.c:43
bool check_amoptsproc_signature(Oid funcid)
Definition amvalidate.c:192
#define NameStr(name)
Definition c.h:835
void ReleaseCatCacheList(CatCList *list)
Definition catcache.c:2114
#define INFO
Definition elog.h:35
char * format_type_be(Oid type_oid)
#define HeapTupleIsValid(tuple)
Definition htup.h:78
static void * GETSTRUCT(const HeapTupleData *tuple)
List * list_append_unique_oid(List *list, Oid datum)
Definition list.c:1380
char * get_opfamily_name(Oid opfid, bool missing_ok)
Definition lsyscache.c:1473
#define BTSKIPSUPPORT_PROC
Definition nbtree.h:722
#define BTSORTSUPPORT_PROC
Definition nbtree.h:718
#define BTINRANGE_PROC
Definition nbtree.h:719
#define BTOPTIONS_PROC
Definition nbtree.h:721
END_CATALOG_STRUCT typedef FormData_pg_amop * Form_pg_amop
Definition pg_amop.h:92
END_CATALOG_STRUCT typedef FormData_pg_amproc * Form_pg_amproc
Definition pg_amproc.h:72
static int list_length(const List *l)
Definition pg_list.h:152
#define NIL
Definition pg_list.h:68
END_CATALOG_STRUCT typedef FormData_pg_opclass * Form_pg_opclass
Definition pg_opclass.h:87
char * format_procedure(Oid procedure_oid)
Definition regproc.c:305
char * format_operator(Oid operator_oid)
Definition regproc.c:801
Definition pg_list.h:54
void ReleaseSysCache(HeapTuple tuple)
Definition syscache.c:265
HeapTuple SearchSysCache1(SysCacheIdentifier cacheId, Datum key1)
Definition syscache.c:221
#define SearchSysCacheList1(cacheId, key1)
Definition syscache.h:127

References BTEQUALIMAGE_PROC, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTINRANGE_PROC, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTMaxStrategyNumber, BTOPTIONS_PROC, BTORDER_PROC, BTSKIPSUPPORT_PROC, BTSORTSUPPORT_PROC, check_amop_signature(), check_amoptsproc_signature(), check_amproc_signature(), elog, ereport, errcode(), errmsg, ERROR, fb(), Form_pg_amop, Form_pg_amproc, Form_pg_opclass, format_operator(), format_procedure(), format_type_be(), get_opfamily_name(), GETSTRUCT(), HeapTupleIsValid, i, identify_opfamily_groups(), INFO, lfirst, list_append_unique_oid(), list_length(), NameStr, NIL, ObjectIdGetDatum(), OidIsValid, ReleaseCatCacheList(), ReleaseSysCache(), result, SearchSysCache1(), and SearchSysCacheList1.

Referenced by bthandler().

◆ StaticAssertDecl()

StaticAssertDecl ( BT_OFFSET_MASK >=  INDEX_MAX_KEYS,
"BT_OFFSET_MASK can't fit INDEX_MAX_KEYS"   
)