PostgreSQL Source Code git master
nbtree.h File Reference
#include "access/amapi.h"
#include "access/itup.h"
#include "access/sdir.h"
#include "access/tableam.h"
#include "access/xlogreader.h"
#include "catalog/pg_am_d.h"
#include "catalog/pg_index.h"
#include "lib/stringinfo.h"
#include "storage/bufmgr.h"
#include "storage/shm_toc.h"
#include "utils/skipsupport.h"
Include dependency graph for nbtree.h: (graph not shown)
Included-by dependency graph (files that directly or indirectly include this file): (graph not shown)


Data Structures

struct  BTPageOpaqueData
 
struct  BTMetaPageData
 
struct  BTDeletedPageData
 
struct  BTPendingFSM
 
struct  BTVacState
 
struct  BTStackData
 
struct  BTScanInsertData
 
struct  BTInsertStateData
 
struct  BTDedupInterval
 
struct  BTDedupStateData
 
struct  BTVacuumPostingData
 
struct  BTScanPosItem
 
struct  BTScanPosData
 
struct  BTArrayKeyInfo
 
struct  BTScanOpaqueData
 
struct  BTReadPageState
 
struct  BTOptions
 

Macros

#define BTPageGetOpaque(page)   ((BTPageOpaque) PageGetSpecialPointer(page))
 
#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */
 
#define BTP_ROOT   (1 << 1) /* root page (has no parent) */
 
#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */
 
#define BTP_META   (1 << 3) /* meta-page */
 
#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */
 
#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */
 
#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples (deprecated) */
 
#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */
 
#define BTP_HAS_FULLXID   (1 << 8) /* contains BTDeletedPageData */
 
#define MAX_BT_CYCLE_ID   0xFF7F
 
#define BTPageGetMeta(p)    ((BTMetaPageData *) PageGetContents(p))
 
#define BTREE_METAPAGE   0 /* first page is meta */
 
#define BTREE_MAGIC   0x053162 /* magic number in metapage */
 
#define BTREE_VERSION   4 /* current version number */
 
#define BTREE_MIN_VERSION   2 /* minimum supported version */
 
#define BTREE_NOVAC_VERSION   3 /* version with all meta fields set */
 
#define BTMaxItemSize
 
#define BTMaxItemSizeNoHeapTid
 
#define MaxTIDsPerBTreePage
 
#define BTREE_MIN_FILLFACTOR   10
 
#define BTREE_DEFAULT_FILLFACTOR   90
 
#define BTREE_NONLEAF_FILLFACTOR   70
 
#define BTREE_SINGLEVAL_FILLFACTOR   96
 
#define P_NONE   0
 
#define P_LEFTMOST(opaque)   ((opaque)->btpo_prev == P_NONE)
 
#define P_RIGHTMOST(opaque)   ((opaque)->btpo_next == P_NONE)
 
#define P_ISLEAF(opaque)   (((opaque)->btpo_flags & BTP_LEAF) != 0)
 
#define P_ISROOT(opaque)   (((opaque)->btpo_flags & BTP_ROOT) != 0)
 
#define P_ISDELETED(opaque)   (((opaque)->btpo_flags & BTP_DELETED) != 0)
 
#define P_ISMETA(opaque)   (((opaque)->btpo_flags & BTP_META) != 0)
 
#define P_ISHALFDEAD(opaque)   (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)
 
#define P_IGNORE(opaque)   (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
 
#define P_HAS_GARBAGE(opaque)   (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
 
#define P_INCOMPLETE_SPLIT(opaque)   (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
 
#define P_HAS_FULLXID(opaque)   (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
 
#define P_HIKEY   ((OffsetNumber) 1)
 
#define P_FIRSTKEY   ((OffsetNumber) 2)
 
#define P_FIRSTDATAKEY(opaque)   (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
 
#define INDEX_ALT_TID_MASK   INDEX_AM_RESERVED_BIT
 
#define BT_OFFSET_MASK   0x0FFF
 
#define BT_STATUS_OFFSET_MASK   0xF000
 
#define BT_PIVOT_HEAP_TID_ATTR   0x1000
 
#define BT_IS_POSTING   0x2000
 
#define BTreeTupleGetNAtts(itup, rel)
 
#define BTCommuteStrategyNumber(strat)   (BTMaxStrategyNumber + 1 - (strat))
 
#define BTORDER_PROC   1
 
#define BTSORTSUPPORT_PROC   2
 
#define BTINRANGE_PROC   3
 
#define BTEQUALIMAGE_PROC   4
 
#define BTOPTIONS_PROC   5
 
#define BTSKIPSUPPORT_PROC   6
 
#define BTNProcs   6
 
#define BT_READ   BUFFER_LOCK_SHARE
 
#define BT_WRITE   BUFFER_LOCK_EXCLUSIVE
 
#define BTScanPosIsPinned(scanpos)
 
#define BTScanPosUnpin(scanpos)
 
#define BTScanPosUnpinIfPinned(scanpos)
 
#define BTScanPosIsValid(scanpos)
 
#define BTScanPosInvalidate(scanpos)
 
#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */
 
#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */
 
#define SK_BT_SKIP   0x00040000 /* skip array on column without input = */
 
#define SK_BT_MINVAL   0x00080000 /* invalid sk_argument, use low_compare */
 
#define SK_BT_MAXVAL   0x00100000 /* invalid sk_argument, use high_compare */
 
#define SK_BT_NEXT   0x00200000 /* positions the scan > sk_argument */
 
#define SK_BT_PRIOR   0x00400000 /* positions the scan < sk_argument */
 
#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */
 
#define SK_BT_DESC   (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
 
#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
 
#define BTGetFillFactor(relation)
 
#define BTGetTargetPageFreeSpace(relation)    (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)
 
#define BTGetDeduplicateItems(relation)
 
#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3
 
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4
 
#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5
 

Typedefs

typedef uint16 BTCycleId
 
typedef struct BTPageOpaqueData BTPageOpaqueData
 
typedef BTPageOpaqueData * BTPageOpaque
 
typedef struct BTMetaPageData BTMetaPageData
 
typedef struct BTDeletedPageData BTDeletedPageData
 
typedef struct BTPendingFSM BTPendingFSM
 
typedef struct BTVacState BTVacState
 
typedef struct BTStackData BTStackData
 
typedef BTStackData * BTStack
 
typedef struct BTScanInsertData BTScanInsertData
 
typedef BTScanInsertData * BTScanInsert
 
typedef struct BTInsertStateData BTInsertStateData
 
typedef BTInsertStateData * BTInsertState
 
typedef struct BTDedupInterval BTDedupInterval
 
typedef struct BTDedupStateData BTDedupStateData
 
typedef BTDedupStateData * BTDedupState
 
typedef struct BTVacuumPostingData BTVacuumPostingData
 
typedef BTVacuumPostingData * BTVacuumPosting
 
typedef struct BTScanPosItem BTScanPosItem
 
typedef struct BTScanPosData BTScanPosData
 
typedef BTScanPosData * BTScanPos
 
typedef struct BTArrayKeyInfo BTArrayKeyInfo
 
typedef struct BTScanOpaqueData BTScanOpaqueData
 
typedef BTScanOpaqueData * BTScanOpaque
 
typedef struct BTReadPageState BTReadPageState
 
typedef struct BTOptions BTOptions
 

Functions

static void BTPageSetDeleted (Page page, FullTransactionId safexid)
 
static FullTransactionId BTPageGetDeleteXid (Page page)
 
static bool BTPageIsRecyclable (Page page, Relation heaprel)
 
 StaticAssertDecl (BT_OFFSET_MASK >= INDEX_MAX_KEYS, "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS")
 
static bool BTreeTupleIsPivot (IndexTuple itup)
 
static bool BTreeTupleIsPosting (IndexTuple itup)
 
static void BTreeTupleSetPosting (IndexTuple itup, uint16 nhtids, int postingoffset)
 
static uint16 BTreeTupleGetNPosting (IndexTuple posting)
 
static uint32 BTreeTupleGetPostingOffset (IndexTuple posting)
 
static ItemPointer BTreeTupleGetPosting (IndexTuple posting)
 
static ItemPointer BTreeTupleGetPostingN (IndexTuple posting, int n)
 
static BlockNumber BTreeTupleGetDownLink (IndexTuple pivot)
 
static void BTreeTupleSetDownLink (IndexTuple pivot, BlockNumber blkno)
 
static void BTreeTupleSetNAtts (IndexTuple itup, uint16 nkeyatts, bool heaptid)
 
static BlockNumber BTreeTupleGetTopParent (IndexTuple leafhikey)
 
static void BTreeTupleSetTopParent (IndexTuple leafhikey, BlockNumber blkno)
 
static ItemPointer BTreeTupleGetHeapTID (IndexTuple itup)
 
static ItemPointer BTreeTupleGetMaxHeapTID (IndexTuple itup)
 
void btbuildempty (Relation index)
 
bool btinsert (Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, struct IndexInfo *indexInfo)
 
IndexScanDesc btbeginscan (Relation rel, int nkeys, int norderbys)
 
Size btestimateparallelscan (Relation rel, int nkeys, int norderbys)
 
void btinitparallelscan (void *target)
 
bool btgettuple (IndexScanDesc scan, ScanDirection dir)
 
int64 btgetbitmap (IndexScanDesc scan, TIDBitmap *tbm)
 
void btrescan (IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
 
void btparallelrescan (IndexScanDesc scan)
 
void btendscan (IndexScanDesc scan)
 
void btmarkpos (IndexScanDesc scan)
 
void btrestrpos (IndexScanDesc scan)
 
IndexBulkDeleteResult * btbulkdelete (IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
 
IndexBulkDeleteResult * btvacuumcleanup (IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
bool btcanreturn (Relation index, int attno)
 
int btgettreeheight (Relation rel)
 
CompareType bttranslatestrategy (StrategyNumber strategy, Oid opfamily)
 
StrategyNumber bttranslatecmptype (CompareType cmptype, Oid opfamily)
 
bool _bt_parallel_seize (IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
 
void _bt_parallel_release (IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
 
void _bt_parallel_done (IndexScanDesc scan)
 
void _bt_parallel_primscan_schedule (IndexScanDesc scan, BlockNumber curr_page)
 
void _bt_dedup_pass (Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, bool bottomupdedup)
 
bool _bt_bottomupdel_pass (Relation rel, Buffer buf, Relation heapRel, Size newitemsz)
 
void _bt_dedup_start_pending (BTDedupState state, IndexTuple base, OffsetNumber baseoff)
 
bool _bt_dedup_save_htid (BTDedupState state, IndexTuple itup)
 
Size _bt_dedup_finish_pending (Page newpage, BTDedupState state)
 
IndexTuple _bt_form_posting (IndexTuple base, ItemPointer htids, int nhtids)
 
void _bt_update_posting (BTVacuumPosting vacposting)
 
IndexTuple _bt_swap_posting (IndexTuple newitem, IndexTuple oposting, int postingoff)
 
bool _bt_doinsert (Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
 
void _bt_finish_split (Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
 
Buffer _bt_getstackbuf (Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
 
OffsetNumber _bt_findsplitloc (Relation rel, Page origpage, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, bool *newitemonleft)
 
void _bt_initmetapage (Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
 
bool _bt_vacuum_needs_cleanup (Relation rel)
 
void _bt_set_cleanup_info (Relation rel, BlockNumber num_delpages)
 
void _bt_upgrademetapage (Page page)
 
Buffer _bt_getroot (Relation rel, Relation heaprel, int access)
 
Buffer _bt_gettrueroot (Relation rel)
 
int _bt_getrootheight (Relation rel)
 
void _bt_metaversion (Relation rel, bool *heapkeyspace, bool *allequalimage)
 
void _bt_checkpage (Relation rel, Buffer buf)
 
Buffer _bt_getbuf (Relation rel, BlockNumber blkno, int access)
 
Buffer _bt_allocbuf (Relation rel, Relation heaprel)
 
Buffer _bt_relandgetbuf (Relation rel, Buffer obuf, BlockNumber blkno, int access)
 
void _bt_relbuf (Relation rel, Buffer buf)
 
void _bt_lockbuf (Relation rel, Buffer buf, int access)
 
void _bt_unlockbuf (Relation rel, Buffer buf)
 
bool _bt_conditionallockbuf (Relation rel, Buffer buf)
 
void _bt_upgradelockbufcleanup (Relation rel, Buffer buf)
 
void _bt_pageinit (Page page, Size size)
 
void _bt_delitems_vacuum (Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
 
void _bt_delitems_delete_check (Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
 
void _bt_pagedel (Relation rel, Buffer leafbuf, BTVacState *vstate)
 
void _bt_pendingfsm_init (Relation rel, BTVacState *vstate, bool cleanuponly)
 
void _bt_pendingfsm_finalize (Relation rel, BTVacState *vstate)
 
void _bt_preprocess_keys (IndexScanDesc scan)
 
BTStack _bt_search (Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
 
OffsetNumber _bt_binsrch_insert (Relation rel, BTInsertState insertstate)
 
int32 _bt_compare (Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost)
 
BTScanInsert _bt_mkscankey (Relation rel, IndexTuple itup)
 
void _bt_freestack (BTStack stack)
 
bool _bt_start_prim_scan (IndexScanDesc scan, ScanDirection dir)
 
int _bt_binsrch_array_skey (FmgrInfo *orderproc, bool cur_elem_trig, ScanDirection dir, Datum tupdatum, bool tupnull, BTArrayKeyInfo *array, ScanKey cur, int32 *set_elem_result)
 
void _bt_start_array_keys (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_checkkeys (IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, IndexTuple tuple, int tupnatts)
 
bool _bt_scanbehind_checkkeys (IndexScanDesc scan, ScanDirection dir, IndexTuple finaltup)
 
void _bt_set_startikey (IndexScanDesc scan, BTReadPageState *pstate)
 
void _bt_killitems (IndexScanDesc scan)
 
BTCycleId _bt_vacuum_cycleid (Relation rel)
 
BTCycleId _bt_start_vacuum (Relation rel)
 
void _bt_end_vacuum (Relation rel)
 
void _bt_end_vacuum_callback (int code, Datum arg)
 
Size BTreeShmemSize (void)
 
void BTreeShmemInit (void)
 
bytea * btoptions (Datum reloptions, bool validate)
 
bool btproperty (Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull)
 
char * btbuildphasename (int64 phasenum)
 
IndexTuple _bt_truncate (Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
 
int _bt_keep_natts_fast (Relation rel, IndexTuple lastleft, IndexTuple firstright)
 
bool _bt_check_natts (Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
 
void _bt_check_third_page (Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup)
 
bool _bt_allequalimage (Relation rel, bool debugmessage)
 
bool btvalidate (Oid opclassoid)
 
void btadjustmembers (Oid opfamilyoid, Oid opclassoid, List *operators, List *functions)
 
IndexBuildResult * btbuild (Relation heap, Relation index, struct IndexInfo *indexInfo)
 
void _bt_parallel_build_main (dsm_segment *seg, shm_toc *toc)
 

Macro Definition Documentation

◆ BT_IS_POSTING

#define BT_IS_POSTING   0x2000

Definition at line 467 of file nbtree.h.

◆ BT_OFFSET_MASK

#define BT_OFFSET_MASK   0x0FFF

Definition at line 463 of file nbtree.h.

◆ BT_PIVOT_HEAP_TID_ATTR

#define BT_PIVOT_HEAP_TID_ATTR   0x1000

Definition at line 466 of file nbtree.h.

◆ BT_READ

#define BT_READ   BUFFER_LOCK_SHARE

Definition at line 730 of file nbtree.h.

◆ BT_STATUS_OFFSET_MASK

#define BT_STATUS_OFFSET_MASK   0xF000

Definition at line 464 of file nbtree.h.

◆ BT_WRITE

#define BT_WRITE   BUFFER_LOCK_EXCLUSIVE

Definition at line 731 of file nbtree.h.
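BT_READ and BT_WRITE are the two lock strengths accepted by the access argument of _bt_getbuf(), _bt_relandgetbuf(), _bt_lockbuf(), _bt_search() and _bt_getroot(). A minimal sketch of the usual read-only access pattern follows; the helper name is hypothetical and the block number is assumed to be already known and valid.

#include "postgres.h"
#include "access/nbtree.h"
#include "storage/bufmgr.h"

/* Hypothetical example: is the given block a leaf page? */
static bool
example_block_is_leaf(Relation rel, BlockNumber blkno)
{
    Buffer       buf;
    BTPageOpaque opaque;
    bool         isleaf;

    buf = _bt_getbuf(rel, blkno, BT_READ);        /* pin + BUFFER_LOCK_SHARE */
    opaque = BTPageGetOpaque(BufferGetPage(buf));
    isleaf = P_ISLEAF(opaque);
    _bt_relbuf(rel, buf);                         /* release lock and pin */

    return isleaf;
}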

◆ BTCommuteStrategyNumber

#define BTCommuteStrategyNumber (   strat)    (BTMaxStrategyNumber + 1 - (strat))

Definition at line 686 of file nbtree.h.
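The five btree strategy numbers are symmetric around equality (1 = less, 2 = less-or-equal, 3 = equal, 4 = greater-or-equal, 5 = greater), so commuting an operator's strategy is just a reflection around the middle value. Worked values, using the standard constants from stratnum.h:

/*
 * BTCommuteStrategyNumber(strat) == BTMaxStrategyNumber + 1 - (strat), so:
 *   BTCommuteStrategyNumber(BTLessStrategyNumber)      == BTGreaterStrategyNumber       (1 -> 5)
 *   BTCommuteStrategyNumber(BTLessEqualStrategyNumber) == BTGreaterEqualStrategyNumber  (2 -> 4)
 *   BTCommuteStrategyNumber(BTEqualStrategyNumber)     == BTEqualStrategyNumber         (3 -> 3)
 */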

◆ BTEQUALIMAGE_PROC

#define BTEQUALIMAGE_PROC   4

Definition at line 720 of file nbtree.h.

◆ BTGetDeduplicateItems

#define BTGetDeduplicateItems (   relation)
Value:
(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
relation->rd_rel->relam == BTREE_AM_OID), \
((relation)->rd_options ? \
((BTOptions *) (relation)->rd_options)->deduplicate_items : true))
#define AssertMacro(condition)
Definition: c.h:830

Definition at line 1165 of file nbtree.h.

◆ BTGetFillFactor

#define BTGetFillFactor (   relation)
Value:
(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
relation->rd_rel->relam == BTREE_AM_OID), \
(relation)->rd_options ? \
((BTOptions *) (relation)->rd_options)->fillfactor : \
BTREE_DEFAULT_FILLFACTOR)

Definition at line 1157 of file nbtree.h.

◆ BTGetTargetPageFreeSpace

#define BTGetTargetPageFreeSpace (   relation)     (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)

Definition at line 1163 of file nbtree.h.
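A worked example, assuming the default 8192-byte BLCKSZ:

/*
 * With the default leaf fillfactor of 90 (BTREE_DEFAULT_FILLFACTOR):
 *   BTGetTargetPageFreeSpace(rel) = 8192 * (100 - 90) / 100 = 819 bytes
 * i.e. leaf page splits in such an index aim to leave about 819 bytes unused.
 */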

◆ BTINRANGE_PROC

#define BTINRANGE_PROC   3

Definition at line 719 of file nbtree.h.

◆ BTMaxItemSize

#define BTMaxItemSize
Value:
(MAXALIGN_DOWN((BLCKSZ - \
MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3) - \
MAXALIGN(sizeof(ItemPointerData)))
#define SizeOfPageHeaderData
Definition: bufpage.h:217
#define MAXALIGN_DOWN(LEN)
Definition: c.h:794
#define MAXALIGN(LEN)
Definition: c.h:782

Definition at line 165 of file nbtree.h.

◆ BTMaxItemSizeNoHeapTid

#define BTMaxItemSizeNoHeapTid
Value:
MAXALIGN_DOWN((BLCKSZ - \
MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3)

Definition at line 170 of file nbtree.h.
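For reference, with an 8192-byte BLCKSZ and 8-byte maximum alignment these two limits work out as follows (a sketch of the arithmetic, not additional definitions):

/*
 * MAXALIGN(SizeOfPageHeaderData + 3 * sizeof(ItemIdData)) = MAXALIGN(24 + 12) = 40
 * MAXALIGN(sizeof(BTPageOpaqueData))                      = MAXALIGN(16)      = 16
 *
 * BTMaxItemSizeNoHeapTid = MAXALIGN_DOWN((8192 - 40 - 16) / 3)       = 2712
 * BTMaxItemSize          = 2712 - MAXALIGN(sizeof(ItemPointerData))  = 2712 - 8 = 2704
 *
 * 2704 is the version 4 limit quoted by _bt_check_third_page(); 2712 is the
 * corresponding limit for version 2 and 3 indexes, which reserve no heap TID
 * tiebreaker space in pivot tuples.
 */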

◆ BTNProcs

#define BTNProcs   6

Definition at line 723 of file nbtree.h.

◆ BTOPTIONS_PROC

#define BTOPTIONS_PROC   5

Definition at line 721 of file nbtree.h.

◆ BTORDER_PROC

#define BTORDER_PROC   1

Definition at line 717 of file nbtree.h.

◆ BTP_DELETED

#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */

Definition at line 79 of file nbtree.h.

◆ BTP_HALF_DEAD

#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */

Definition at line 81 of file nbtree.h.

◆ BTP_HAS_FULLXID

#define BTP_HAS_FULLXID   (1 << 8) /* contains BTDeletedPageData */

Definition at line 85 of file nbtree.h.

◆ BTP_HAS_GARBAGE

#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples (deprecated) */

Definition at line 83 of file nbtree.h.

◆ BTP_INCOMPLETE_SPLIT

#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */

Definition at line 84 of file nbtree.h.

◆ BTP_LEAF

#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */

Definition at line 77 of file nbtree.h.

◆ BTP_META

#define BTP_META   (1 << 3) /* meta-page */

Definition at line 80 of file nbtree.h.

◆ BTP_ROOT

#define BTP_ROOT   (1 << 1) /* root page (has no parent) */

Definition at line 78 of file nbtree.h.

◆ BTP_SPLIT_END

#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */

Definition at line 82 of file nbtree.h.

◆ BTPageGetMeta

#define BTPageGetMeta (   p)     ((BTMetaPageData *) PageGetContents(p))

Definition at line 122 of file nbtree.h.
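A hedged sketch of how the metapage is typically read (compare _bt_getroot() and _bt_metaversion(); the helper name below is hypothetical):

#include "postgres.h"
#include "access/nbtree.h"
#include "storage/bufmgr.h"

/* Hypothetical example: fetch the on-disk btree version from the metapage. */
static uint32
example_btree_version(Relation rel)
{
    Buffer          metabuf;
    BTMetaPageData *metad;
    uint32          version;

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metad = BTPageGetMeta(BufferGetPage(metabuf));

    Assert(metad->btm_magic == BTREE_MAGIC);     /* sanity check, as in _bt_getroot() */
    version = metad->btm_version;                /* BTREE_MIN_VERSION .. BTREE_VERSION */

    _bt_relbuf(rel, metabuf);
    return version;
}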

◆ BTPageGetOpaque

#define BTPageGetOpaque (   page)    ((BTPageOpaque) PageGetSpecialPointer(page))

Definition at line 74 of file nbtree.h.

◆ BTREE_DEFAULT_FILLFACTOR

#define BTREE_DEFAULT_FILLFACTOR   90

Definition at line 201 of file nbtree.h.

◆ BTREE_MAGIC

#define BTREE_MAGIC   0x053162 /* magic number in metapage */

Definition at line 150 of file nbtree.h.

◆ BTREE_METAPAGE

#define BTREE_METAPAGE   0 /* first page is meta */

Definition at line 149 of file nbtree.h.

◆ BTREE_MIN_FILLFACTOR

#define BTREE_MIN_FILLFACTOR   10

Definition at line 200 of file nbtree.h.

◆ BTREE_MIN_VERSION

#define BTREE_MIN_VERSION   2 /* minimum supported version */

Definition at line 152 of file nbtree.h.

◆ BTREE_NONLEAF_FILLFACTOR

#define BTREE_NONLEAF_FILLFACTOR   70

Definition at line 202 of file nbtree.h.

◆ BTREE_NOVAC_VERSION

#define BTREE_NOVAC_VERSION   3 /* version with all meta fields set */

Definition at line 153 of file nbtree.h.

◆ BTREE_SINGLEVAL_FILLFACTOR

#define BTREE_SINGLEVAL_FILLFACTOR   96

Definition at line 203 of file nbtree.h.

◆ BTREE_VERSION

#define BTREE_VERSION   4 /* current version number */

Definition at line 151 of file nbtree.h.

◆ BTreeTupleGetNAtts

#define BTreeTupleGetNAtts (   itup,
  rel 
)
Value:
( \
(BTreeTupleIsPivot(itup)) ? \
( \
ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_OFFSET_MASK \
) \
: \
IndexRelationGetNumberOfAttributes(rel) \
)
static OffsetNumber ItemPointerGetOffsetNumberNoCheck(const ItemPointerData *pointer)
Definition: itemptr.h:114
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:481
#define BT_OFFSET_MASK
Definition: nbtree.h:463
#define IndexRelationGetNumberOfAttributes(relation)
Definition: rel.h:528

Definition at line 578 of file nbtree.h.
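For example, a hypothetical helper that reports how many key columns are logically present in the tuple at a given page offset; the count is smaller than the index's attribute count only for truncated pivot tuples:

#include "postgres.h"
#include "access/nbtree.h"
#include "storage/bufpage.h"

static int
example_tuple_natts(Relation rel, Page page, OffsetNumber offnum)
{
    ItemId     itemid = PageGetItemId(page, offnum);
    IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);

    /*
     * Pivot tuples store their attribute count in the t_tid offset field
     * (masked with BT_OFFSET_MASK); non-pivot tuples always have all of the
     * index's attributes.
     */
    return BTreeTupleGetNAtts(itup, rel);
}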

◆ BTScanPosInvalidate

#define BTScanPosInvalidate (   scanpos)
Value:
do { \
(scanpos).buf = InvalidBuffer; \
(scanpos).currPage = InvalidBlockNumber; \
} while (0)
#define InvalidBlockNumber
Definition: block.h:33
#define InvalidBuffer
Definition: buf.h:25
static char * buf
Definition: pg_test_fsync.c:72

Definition at line 1027 of file nbtree.h.

◆ BTScanPosIsPinned

#define BTScanPosIsPinned (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BufferIsValid((scanpos).buf) \
)
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:358

Definition at line 1004 of file nbtree.h.

◆ BTScanPosIsValid

#define BTScanPosIsValid (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BlockNumberIsValid((scanpos).currPage) \
)

Definition at line 1021 of file nbtree.h.

◆ BTScanPosUnpin

#define BTScanPosUnpin (   scanpos)
Value:
do { \
ReleaseBuffer((scanpos).buf); \
(scanpos).buf = InvalidBuffer; \
} while (0)

Definition at line 1010 of file nbtree.h.

◆ BTScanPosUnpinIfPinned

#define BTScanPosUnpinIfPinned (   scanpos)
Value:
do { \
if (BTScanPosIsPinned(scanpos)) \
BTScanPosUnpin(scanpos); \
} while (0)
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:1004

Definition at line 1015 of file nbtree.h.
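Taken together, the BTScanPos macros manage the lifetime of a scan position. A simplified outline of the way nbtsearch.c uses them (illustrative only, not the literal code):

/*
 *   BTScanPosInvalidate(so->currPos);          -- scan starts with no position
 *   ... _bt_first()/_bt_readpage() fill so->currPos, possibly leaving buf pinned ...
 *   if (BTScanPosIsValid(so->currPos))
 *   {
 *       if (!BTScanPosIsPinned(so->currPos))
 *           ... page may have been modified; re-check before trusting saved items ...
 *       BTScanPosUnpinIfPinned(so->currPos);   -- drop the pin when leaving the page
 *   }
 *   BTScanPosInvalidate(so->currPos);          -- forget the position at btendscan()
 */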

◆ BTSKIPSUPPORT_PROC

#define BTSKIPSUPPORT_PROC   6

Definition at line 722 of file nbtree.h.

◆ BTSORTSUPPORT_PROC

#define BTSORTSUPPORT_PROC   2

Definition at line 718 of file nbtree.h.

◆ INDEX_ALT_TID_MASK

#define INDEX_ALT_TID_MASK   INDEX_AM_RESERVED_BIT

Definition at line 460 of file nbtree.h.

◆ MAX_BT_CYCLE_ID

#define MAX_BT_CYCLE_ID   0xFF7F

Definition at line 94 of file nbtree.h.

◆ MaxTIDsPerBTreePage

#define MaxTIDsPerBTreePage
Value:
(int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \
sizeof(ItemPointerData))

Definition at line 186 of file nbtree.h.
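Assuming the default 8192-byte BLCKSZ, the bound evaluates to:

/*
 * (8192 - SizeOfPageHeaderData (24) - sizeof(BTPageOpaqueData) (16)) / sizeof(ItemPointerData) (6)
 *   = 8152 / 6 = 1358 heap TIDs per leaf page
 */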

◆ P_FIRSTDATAKEY

#define P_FIRSTDATAKEY (   opaque)    (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)

Definition at line 370 of file nbtree.h.

◆ P_FIRSTKEY

#define P_FIRSTKEY   ((OffsetNumber) 2)

Definition at line 369 of file nbtree.h.

◆ P_HAS_FULLXID

#define P_HAS_FULLXID (   opaque)    (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)

Definition at line 229 of file nbtree.h.

◆ P_HAS_GARBAGE

#define P_HAS_GARBAGE (   opaque)    (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)

Definition at line 227 of file nbtree.h.

◆ P_HIKEY

#define P_HIKEY   ((OffsetNumber) 1)

Definition at line 368 of file nbtree.h.

◆ P_IGNORE

#define P_IGNORE (   opaque)    (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)

Definition at line 226 of file nbtree.h.

◆ P_INCOMPLETE_SPLIT

#define P_INCOMPLETE_SPLIT (   opaque)    (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)

Definition at line 228 of file nbtree.h.

◆ P_ISDELETED

#define P_ISDELETED (   opaque)    (((opaque)->btpo_flags & BTP_DELETED) != 0)

Definition at line 223 of file nbtree.h.

◆ P_ISHALFDEAD

#define P_ISHALFDEAD (   opaque)    (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)

Definition at line 225 of file nbtree.h.

◆ P_ISLEAF

#define P_ISLEAF (   opaque)    (((opaque)->btpo_flags & BTP_LEAF) != 0)

Definition at line 221 of file nbtree.h.

◆ P_ISMETA

#define P_ISMETA (   opaque)    (((opaque)->btpo_flags & BTP_META) != 0)

Definition at line 224 of file nbtree.h.

◆ P_ISROOT

#define P_ISROOT (   opaque)    (((opaque)->btpo_flags & BTP_ROOT) != 0)

Definition at line 222 of file nbtree.h.

◆ P_LEFTMOST

#define P_LEFTMOST (   opaque)    ((opaque)->btpo_prev == P_NONE)

Definition at line 219 of file nbtree.h.

◆ P_NONE

#define P_NONE   0

Definition at line 213 of file nbtree.h.

◆ P_RIGHTMOST

#define P_RIGHTMOST (   opaque)    ((opaque)->btpo_next == P_NONE)

Definition at line 220 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN

#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN   2

Definition at line 1176 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_LEAF_LOAD

#define PROGRESS_BTREE_PHASE_LEAF_LOAD   5

Definition at line 1179 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_1

#define PROGRESS_BTREE_PHASE_PERFORMSORT_1   3

Definition at line 1177 of file nbtree.h.

◆ PROGRESS_BTREE_PHASE_PERFORMSORT_2

#define PROGRESS_BTREE_PHASE_PERFORMSORT_2   4

Definition at line 1178 of file nbtree.h.

◆ SK_BT_DESC

#define SK_BT_DESC   (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)

Definition at line 1146 of file nbtree.h.

◆ SK_BT_INDOPTION_SHIFT

#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */

Definition at line 1145 of file nbtree.h.

◆ SK_BT_MAXVAL

#define SK_BT_MAXVAL   0x00100000 /* invalid sk_argument, use high_compare */

Definition at line 1140 of file nbtree.h.

◆ SK_BT_MINVAL

#define SK_BT_MINVAL   0x00080000 /* invalid sk_argument, use low_compare */

Definition at line 1139 of file nbtree.h.

◆ SK_BT_NEXT

#define SK_BT_NEXT   0x00200000 /* positions the scan > sk_argument */

Definition at line 1141 of file nbtree.h.

◆ SK_BT_NULLS_FIRST

#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)

Definition at line 1147 of file nbtree.h.

◆ SK_BT_PRIOR

#define SK_BT_PRIOR   0x00400000 /* positions the scan < sk_argument */

Definition at line 1142 of file nbtree.h.

◆ SK_BT_REQBKWD

#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */

Definition at line 1135 of file nbtree.h.

◆ SK_BT_REQFWD

#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */

Definition at line 1134 of file nbtree.h.

◆ SK_BT_SKIP

#define SK_BT_SKIP   0x00040000 /* skip array on column without input = */

Definition at line 1136 of file nbtree.h.

Typedef Documentation

◆ BTArrayKeyInfo

◆ BTCycleId

typedef uint16 BTCycleId

Definition at line 30 of file nbtree.h.

◆ BTDedupInterval

◆ BTDedupState

Definition at line 904 of file nbtree.h.

◆ BTDedupStateData

◆ BTDeletedPageData

◆ BTInsertState

Definition at line 846 of file nbtree.h.

◆ BTInsertStateData

◆ BTMetaPageData

◆ BTOptions

typedef struct BTOptions BTOptions

◆ BTPageOpaque

Definition at line 72 of file nbtree.h.

◆ BTPageOpaqueData

◆ BTPendingFSM

typedef struct BTPendingFSM BTPendingFSM

◆ BTReadPageState

◆ BTScanInsert

Definition at line 807 of file nbtree.h.

◆ BTScanInsertData

◆ BTScanOpaque

Definition at line 1096 of file nbtree.h.

◆ BTScanOpaqueData

◆ BTScanPos

Definition at line 1002 of file nbtree.h.

◆ BTScanPosData

typedef struct BTScanPosData BTScanPosData

◆ BTScanPosItem

typedef struct BTScanPosItem BTScanPosItem

◆ BTStack

typedef BTStackData* BTStack

Definition at line 750 of file nbtree.h.

◆ BTStackData

typedef struct BTStackData BTStackData

◆ BTVacState

typedef struct BTVacState BTVacState

◆ BTVacuumPosting

Definition at line 925 of file nbtree.h.

◆ BTVacuumPostingData

Function Documentation

◆ _bt_allequalimage()

bool _bt_allequalimage ( Relation  rel,
bool  debugmessage 
)

Definition at line 4273 of file nbtutils.c.

4274{
4275 bool allequalimage = true;
4276
4277 /* INCLUDE indexes can never support deduplication */
4278 if (IndexRelationGetNumberOfAttributes(rel) !=
4279 IndexRelationGetNumberOfKeyAttributes(rel))
4280 return false;
4281
4282 for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++)
4283 {
4284 Oid opfamily = rel->rd_opfamily[i];
4285 Oid opcintype = rel->rd_opcintype[i];
4286 Oid collation = rel->rd_indcollation[i];
4287 Oid equalimageproc;
4288
4289 equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype,
4290 BTEQUALIMAGE_PROC);
4291
4292 /*
4293 * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to
4294 * be unsafe. Otherwise, actually call proc and see what it says.
4295 */
4296 if (!OidIsValid(equalimageproc) ||
4297 !DatumGetBool(OidFunctionCall1Coll(equalimageproc, collation,
4298 ObjectIdGetDatum(opcintype))))
4299 {
4300 allequalimage = false;
4301 break;
4302 }
4303 }
4304
4305 if (debugmessage)
4306 {
4307 if (allequalimage)
4308 elog(DEBUG1, "index \"%s\" can safely use deduplication",
4309 RelationGetRelationName(rel));
4310 else
4311 elog(DEBUG1, "index \"%s\" cannot use deduplication",
4312 RelationGetRelationName(rel));
4313 }
4314
4315 return allequalimage;
4316}
#define OidIsValid(objectId)
Definition: c.h:746
#define DEBUG1
Definition: elog.h:30
#define elog(elevel,...)
Definition: elog.h:226
Datum OidFunctionCall1Coll(Oid functionId, Oid collation, Datum arg1)
Definition: fmgr.c:1411
int i
Definition: isn.c:77
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:888
#define BTEQUALIMAGE_PROC
Definition: nbtree.h:720
static bool DatumGetBool(Datum X)
Definition: postgres.h:95
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:257
unsigned int Oid
Definition: postgres_ext.h:30
#define RelationGetRelationName(relation)
Definition: rel.h:550
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:535
Oid * rd_opcintype
Definition: rel.h:208
Oid * rd_opfamily
Definition: rel.h:207
Oid * rd_indcollation
Definition: rel.h:217

References BTEQUALIMAGE_PROC, DatumGetBool(), DEBUG1, elog, get_opfamily_proc(), i, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, ObjectIdGetDatum(), OidFunctionCall1Coll(), OidIsValid, RelationData::rd_indcollation, RelationData::rd_opcintype, RelationData::rd_opfamily, and RelationGetRelationName.

Referenced by _bt_leafbuild(), bt_index_check_callback(), and btbuildempty().
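A hedged sketch of how callers use the result when initializing an empty index (compare btbuildempty(); the variable names here are illustrative):

bool allequalimage = _bt_allequalimage(index, true);   /* true -> emit the DEBUG1 message */

/* ... later, when writing the metapage of the new, empty index ... */
_bt_initmetapage(metapage, P_NONE, 0, allequalimage);  /* no root page yet, level 0 */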

◆ _bt_allocbuf()

Buffer _bt_allocbuf ( Relation  rel,
Relation  heaprel 
)

Definition at line 869 of file nbtpage.c.

870{
871 Buffer buf;
872 BlockNumber blkno;
873 Page page;
874
875 Assert(heaprel != NULL);
876
877 /*
878 * First see if the FSM knows of any free pages.
879 *
880 * We can't trust the FSM's report unreservedly; we have to check that the
881 * page is still free. (For example, an already-free page could have been
882 * re-used between the time the last VACUUM scanned it and the time the
883 * VACUUM made its FSM updates.)
884 *
885 * In fact, it's worse than that: we can't even assume that it's safe to
886 * take a lock on the reported page. If somebody else has a lock on it,
887 * or even worse our own caller does, we could deadlock. (The own-caller
888 * scenario is actually not improbable. Consider an index on a serial or
889 * timestamp column. Nearly all splits will be at the rightmost page, so
890 * it's entirely likely that _bt_split will call us while holding a lock
891 * on the page most recently acquired from FSM. A VACUUM running
892 * concurrently with the previous split could well have placed that page
893 * back in FSM.)
894 *
895 * To get around that, we ask for only a conditional lock on the reported
896 * page. If we fail, then someone else is using the page, and we may
897 * reasonably assume it's not free. (If we happen to be wrong, the worst
898 * consequence is the page will be lost to use till the next VACUUM, which
899 * is no big problem.)
900 */
901 for (;;)
902 {
903 blkno = GetFreeIndexPage(rel);
904 if (blkno == InvalidBlockNumber)
905 break;
906 buf = ReadBuffer(rel, blkno);
907 if (_bt_conditionallockbuf(rel, buf))
908 {
909 page = BufferGetPage(buf);
910
911 /*
912 * It's possible to find an all-zeroes page in an index. For
913 * example, a backend might successfully extend the relation one
914 * page and then crash before it is able to make a WAL entry for
915 * adding the page. If we find a zeroed page then reclaim it
916 * immediately.
917 */
918 if (PageIsNew(page))
919 {
920 /* Okay to use page. Initialize and return it. */
 921 _bt_pageinit(page, BufferGetPageSize(buf));
 922 return buf;
923 }
924
925 if (BTPageIsRecyclable(page, heaprel))
926 {
927 /*
928 * If we are generating WAL for Hot Standby then create a WAL
929 * record that will allow us to conflict with queries running
930 * on standby, in case they have snapshots older than safexid
931 * value
932 */
 933 if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
 934 {
935 xl_btree_reuse_page xlrec_reuse;
936
937 /*
938 * Note that we don't register the buffer with the record,
939 * because this operation doesn't modify the page (that
940 * already happened, back when VACUUM deleted the page).
941 * This record only exists to provide a conflict point for
942 * Hot Standby. See record REDO routine comments.
943 */
944 xlrec_reuse.locator = rel->rd_locator;
945 xlrec_reuse.block = blkno;
 946 xlrec_reuse.snapshotConflictHorizon = BTPageGetDeleteXid(page);
 947 xlrec_reuse.isCatalogRel =
 948 RelationIsAccessibleInLogicalDecoding(heaprel);
 949
 950 XLogBeginInsert();
 951 XLogRegisterData(&xlrec_reuse, SizeOfBtreeReusePage);
 952
953 XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
954 }
955
956 /* Okay to use page. Re-initialize and return it. */
 957 _bt_pageinit(page, BufferGetPageSize(buf));
 958 return buf;
959 }
960 elog(DEBUG2, "FSM returned nonrecyclable page");
961 _bt_relbuf(rel, buf);
962 }
963 else
964 {
965 elog(DEBUG2, "FSM returned nonlockable page");
966 /* couldn't get lock, so just drop pin */
 967 ReleaseBuffer(buf);
 968 }
969 }
970
971 /*
972 * Extend the relation by one page. Need to use RBM_ZERO_AND_LOCK or we
973 * risk a race condition against btvacuumscan --- see comments therein.
974 * This forces us to repeat the valgrind request that _bt_lockbuf()
975 * otherwise would make, as we can't use _bt_lockbuf() without introducing
976 * a race.
977 */
 978 buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);
 979 if (!RelationUsesLocalBuffers(rel))
 980 VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
 981
982 /* Initialize the new page before returning it */
983 page = BufferGetPage(buf);
984 Assert(PageIsNew(page));
 985 _bt_pageinit(page, BufferGetPageSize(buf));
 986
987 return buf;
988}
uint32 BlockNumber
Definition: block.h:31
int Buffer
Definition: buf.h:23
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition: bufmgr.c:851
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5303
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:751
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:407
static Size BufferGetPageSize(Buffer buffer)
Definition: bufmgr.h:396
@ EB_LOCK_FIRST
Definition: bufmgr.h:87
#define BMR_REL(p_rel)
Definition: bufmgr.h:108
static bool PageIsNew(const PageData *page)
Definition: bufpage.h:234
PageData * Page
Definition: bufpage.h:82
#define DEBUG2
Definition: elog.h:29
Assert(PointerIsAligned(start, uint64))
BlockNumber GetFreeIndexPage(Relation rel)
Definition: indexfsm.c:38
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1023
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:1129
bool _bt_conditionallockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1093
static FullTransactionId BTPageGetDeleteXid(Page page)
Definition: nbtree.h:261
static bool BTPageIsRecyclable(Page page, Relation heaprel)
Definition: nbtree.h:292
#define XLOG_BTREE_REUSE_PAGE
Definition: nbtxlog.h:40
#define SizeOfBtreeReusePage
Definition: nbtxlog.h:192
#define RelationIsAccessibleInLogicalDecoding(relation)
Definition: rel.h:695
#define RelationNeedsWAL(relation)
Definition: rel.h:639
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:648
@ MAIN_FORKNUM
Definition: relpath.h:58
RelFileLocator rd_locator
Definition: rel.h:57
FullTransactionId snapshotConflictHorizon
Definition: nbtxlog.h:187
RelFileLocator locator
Definition: nbtxlog.h:185
BlockNumber block
Definition: nbtxlog.h:186
#define XLogStandbyInfoActive()
Definition: xlog.h:123
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:474
void XLogRegisterData(const void *data, uint32 len)
Definition: xloginsert.c:364
void XLogBeginInsert(void)
Definition: xloginsert.c:149

References _bt_conditionallockbuf(), _bt_pageinit(), _bt_relbuf(), Assert(), xl_btree_reuse_page::block, BMR_REL, BTPageGetDeleteXid(), BTPageIsRecyclable(), buf, BufferGetPage(), BufferGetPageSize(), DEBUG2, EB_LOCK_FIRST, elog, ExtendBufferedRel(), GetFreeIndexPage(), InvalidBlockNumber, xl_btree_reuse_page::isCatalogRel, xl_btree_reuse_page::locator, MAIN_FORKNUM, PageIsNew(), RelationData::rd_locator, ReadBuffer(), RelationIsAccessibleInLogicalDecoding, RelationNeedsWAL, RelationUsesLocalBuffers, ReleaseBuffer(), SizeOfBtreeReusePage, xl_btree_reuse_page::snapshotConflictHorizon, VALGRIND_MAKE_MEM_DEFINED, XLOG_BTREE_REUSE_PAGE, XLogBeginInsert(), XLogInsert(), XLogRegisterData(), and XLogStandbyInfoActive.

Referenced by _bt_getroot(), _bt_newlevel(), and _bt_split().

◆ _bt_binsrch_array_skey()

int _bt_binsrch_array_skey ( FmgrInfo orderproc,
bool  cur_elem_trig,
ScanDirection  dir,
Datum  tupdatum,
bool  tupnull,
BTArrayKeyInfo array,
ScanKey  cur,
int32 set_elem_result 
)

Definition at line 287 of file nbtutils.c.

292{
293 int low_elem = 0,
294 mid_elem = -1,
295 high_elem = array->num_elems - 1,
296 result = 0;
297 Datum arrdatum;
298
299 Assert(cur->sk_flags & SK_SEARCHARRAY);
300 Assert(!(cur->sk_flags & SK_BT_SKIP));
301 Assert(!(cur->sk_flags & SK_ISNULL)); /* SAOP arrays never have NULLs */
302 Assert(cur->sk_strategy == BTEqualStrategyNumber);
303
304 if (cur_elem_trig)
305 {
 306 Assert(!ScanDirectionIsNoMovement(dir));
 307 Assert(cur->sk_flags & SK_BT_REQFWD);
308
309 /*
310 * When the scan key that triggered array advancement is a required
311 * array scan key, it is now certain that the current array element
312 * (plus all prior elements relative to the current scan direction)
313 * cannot possibly be at or ahead of the corresponding tuple value.
314 * (_bt_checkkeys must have called _bt_tuple_before_array_skeys, which
315 * makes sure this is true as a condition of advancing the arrays.)
316 *
317 * This makes it safe to exclude array elements up to and including
318 * the former-current array element from our search.
319 *
320 * Separately, when array advancement was triggered by a required scan
321 * key, the array element immediately after the former-current element
322 * is often either an exact tupdatum match, or a "close by" near-match
323 * (a near-match tupdatum is one whose key space falls _between_ the
324 * former-current and new-current array elements). We'll detect both
325 * cases via an optimistic comparison of the new search lower bound
326 * (or new search upper bound in the case of backwards scans).
327 */
328 if (ScanDirectionIsForward(dir))
329 {
330 low_elem = array->cur_elem + 1; /* old cur_elem exhausted */
331
332 /* Compare prospective new cur_elem (also the new lower bound) */
333 if (high_elem >= low_elem)
334 {
335 arrdatum = array->elem_values[low_elem];
336 result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
337 arrdatum, cur);
338
339 if (result <= 0)
340 {
341 /* Optimistic comparison optimization worked out */
342 *set_elem_result = result;
343 return low_elem;
344 }
345 mid_elem = low_elem;
346 low_elem++; /* this cur_elem exhausted, too */
347 }
348
349 if (high_elem < low_elem)
350 {
351 /* Caller needs to perform "beyond end" array advancement */
352 *set_elem_result = 1;
353 return high_elem;
354 }
355 }
356 else
357 {
358 high_elem = array->cur_elem - 1; /* old cur_elem exhausted */
359
360 /* Compare prospective new cur_elem (also the new upper bound) */
361 if (high_elem >= low_elem)
362 {
363 arrdatum = array->elem_values[high_elem];
364 result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
365 arrdatum, cur);
366
367 if (result >= 0)
368 {
369 /* Optimistic comparison optimization worked out */
370 *set_elem_result = result;
371 return high_elem;
372 }
373 mid_elem = high_elem;
374 high_elem--; /* this cur_elem exhausted, too */
375 }
376
377 if (high_elem < low_elem)
378 {
379 /* Caller needs to perform "beyond end" array advancement */
380 *set_elem_result = -1;
381 return low_elem;
382 }
383 }
384 }
385
386 while (high_elem > low_elem)
387 {
388 mid_elem = low_elem + ((high_elem - low_elem) / 2);
389 arrdatum = array->elem_values[mid_elem];
390
391 result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
392 arrdatum, cur);
393
394 if (result == 0)
395 {
396 /*
397 * It's safe to quit as soon as we see an equal array element.
398 * This often saves an extra comparison or two...
399 */
400 low_elem = mid_elem;
401 break;
402 }
403
404 if (result > 0)
405 low_elem = mid_elem + 1;
406 else
407 high_elem = mid_elem;
408 }
409
410 /*
411 * ...but our caller also cares about how its searched-for tuple datum
412 * compares to the low_elem datum. Must always set *set_elem_result with
413 * the result of that comparison specifically.
414 */
415 if (low_elem != mid_elem)
416 result = _bt_compare_array_skey(orderproc, tupdatum, tupnull,
417 array->elem_values[low_elem], cur);
418
419 *set_elem_result = result;
420
421 return low_elem;
422}
struct cursor * cur
Definition: ecpg.c:29
#define SK_BT_SKIP
Definition: nbtree.h:1136
#define SK_BT_REQFWD
Definition: nbtree.h:1134
static int32 _bt_compare_array_skey(FmgrInfo *orderproc, Datum tupdatum, bool tupnull, Datum arrdatum, ScanKey cur)
Definition: nbtutils.c:216
uintptr_t Datum
Definition: postgres.h:69
#define ScanDirectionIsForward(direction)
Definition: sdir.h:64
#define ScanDirectionIsNoMovement(direction)
Definition: sdir.h:57
#define SK_SEARCHARRAY
Definition: skey.h:120
#define SK_ISNULL
Definition: skey.h:115
#define BTEqualStrategyNumber
Definition: stratnum.h:31
Datum * elem_values
Definition: nbtree.h:1041

References _bt_compare_array_skey(), Assert(), BTEqualStrategyNumber, cur, BTArrayKeyInfo::cur_elem, BTArrayKeyInfo::elem_values, BTArrayKeyInfo::num_elems, ScanDirectionIsForward, ScanDirectionIsNoMovement, SK_BT_REQFWD, SK_BT_SKIP, SK_ISNULL, and SK_SEARCHARRAY.

Referenced by _bt_advance_array_keys(), _bt_saoparray_shrink(), and _bt_set_startikey().

◆ _bt_binsrch_insert()

OffsetNumber _bt_binsrch_insert ( Relation  rel,
BTInsertState  insertstate 
)

Definition at line 474 of file nbtsearch.c.

475{
476 BTScanInsert key = insertstate->itup_key;
477 Page page;
478 BTPageOpaque opaque;
479 OffsetNumber low,
480 high,
481 stricthigh;
482 int32 result,
483 cmpval;
484
485 page = BufferGetPage(insertstate->buf);
486 opaque = BTPageGetOpaque(page);
487
488 Assert(P_ISLEAF(opaque));
489 Assert(!key->nextkey);
490 Assert(insertstate->postingoff == 0);
491
492 if (!insertstate->bounds_valid)
493 {
494 /* Start new binary search */
495 low = P_FIRSTDATAKEY(opaque);
496 high = PageGetMaxOffsetNumber(page);
497 }
498 else
499 {
500 /* Restore result of previous binary search against same page */
501 low = insertstate->low;
502 high = insertstate->stricthigh;
503 }
504
505 /* If there are no keys on the page, return the first available slot */
506 if (unlikely(high < low))
507 {
508 /* Caller can't reuse bounds */
509 insertstate->low = InvalidOffsetNumber;
510 insertstate->stricthigh = InvalidOffsetNumber;
511 insertstate->bounds_valid = false;
512 return low;
513 }
514
515 /*
516 * Binary search to find the first key on the page >= scan key. (nextkey
517 * is always false when inserting).
518 *
519 * The loop invariant is: all slots before 'low' are < scan key, all slots
520 * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
521 * maintained to save additional search effort for caller.
522 *
523 * We can fall out when high == low.
524 */
525 if (!insertstate->bounds_valid)
526 high++; /* establish the loop invariant for high */
527 stricthigh = high; /* high initially strictly higher */
528
529 cmpval = 1; /* !nextkey comparison value */
530
531 while (high > low)
532 {
533 OffsetNumber mid = low + ((high - low) / 2);
534
535 /* We have low <= mid < high, so mid points at a real slot */
536
537 result = _bt_compare(rel, key, page, mid);
538
539 if (result >= cmpval)
540 low = mid + 1;
541 else
542 {
543 high = mid;
544 if (result != 0)
545 stricthigh = high;
546 }
547
548 /*
549 * If tuple at offset located by binary search is a posting list whose
550 * TID range overlaps with caller's scantid, perform posting list
551 * binary search to set postingoff for caller. Caller must split the
552 * posting list when postingoff is set. This should happen
553 * infrequently.
554 */
555 if (unlikely(result == 0 && key->scantid != NULL))
556 {
557 /*
558 * postingoff should never be set more than once per leaf page
559 * binary search. That would mean that there are duplicate table
560 * TIDs in the index, which is never okay. Check for that here.
561 */
562 if (insertstate->postingoff != 0)
 563 ereport(ERROR,
 564 (errcode(ERRCODE_INDEX_CORRUPTED),
565 errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
 566 ItemPointerGetBlockNumber(key->scantid),
 567 ItemPointerGetOffsetNumber(key->scantid),
 568 low, stricthigh,
569 BufferGetBlockNumber(insertstate->buf),
 570 RelationGetRelationName(rel))));
 571
572 insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
573 }
574 }
575
576 /*
577 * On a leaf page, a binary search always returns the first key >= scan
578 * key (at least in !nextkey case), which could be the last slot + 1. This
579 * is also the lower bound of cached search.
580 *
581 * stricthigh may also be the last slot + 1, which prevents caller from
582 * using bounds directly, but is still useful to us if we're called a
583 * second time with cached bounds (cached low will be < stricthigh when
584 * that happens).
585 */
586 insertstate->low = low;
587 insertstate->stricthigh = stricthigh;
588 insertstate->bounds_valid = true;
589
590 return low;
591}
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:4161
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition: bufpage.h:372
int32_t int32
Definition: c.h:498
#define unlikely(x)
Definition: c.h:347
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1158
int errcode(int sqlerrcode)
Definition: elog.c:854
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
#define P_ISLEAF(opaque)
Definition: nbtree.h:221
#define BTPageGetOpaque(page)
Definition: nbtree.h:74
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:370
static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:602
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:688
#define InvalidOffsetNumber
Definition: off.h:26
uint16 OffsetNumber
Definition: off.h:24
OffsetNumber stricthigh
Definition: nbtree.h:836
bool bounds_valid
Definition: nbtree.h:834
OffsetNumber low
Definition: nbtree.h:835
BTScanInsert itup_key
Definition: nbtree.h:824

References _bt_binsrch_posting(), _bt_compare(), Assert(), BTInsertStateData::bounds_valid, BTPageGetOpaque, BTInsertStateData::buf, BufferGetBlockNumber(), BufferGetPage(), ereport, errcode(), errmsg_internal(), ERROR, InvalidOffsetNumber, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), BTInsertStateData::itup_key, sort-test::key, BTInsertStateData::low, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber(), BTInsertStateData::postingoff, RelationGetRelationName, BTInsertStateData::stricthigh, and unlikely.

Referenced by _bt_check_unique(), _bt_findinsertloc(), and bt_rootdescend().

◆ _bt_bottomupdel_pass()

bool _bt_bottomupdel_pass ( Relation  rel,
Buffer  buf,
Relation  heapRel,
Size  newitemsz 
)

Definition at line 307 of file nbtdedup.c.

309{
310 OffsetNumber offnum,
311 minoff,
312 maxoff;
313 Page page = BufferGetPage(buf);
314 BTPageOpaque opaque = BTPageGetOpaque(page);
 315 BTDedupState state;
 316 TM_IndexDeleteOp delstate;
317 bool neverdedup;
318 int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
319
320 /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
321 newitemsz += sizeof(ItemIdData);
322
323 /* Initialize deduplication state */
 324 state = (BTDedupState) palloc(sizeof(BTDedupStateData));
 325 state->deduplicate = true;
326 state->nmaxitems = 0;
327 state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
328 state->base = NULL;
329 state->baseoff = InvalidOffsetNumber;
330 state->basetupsize = 0;
331 state->htids = palloc(state->maxpostingsize);
332 state->nhtids = 0;
333 state->nitems = 0;
334 state->phystupsize = 0;
335 state->nintervals = 0;
336
337 /*
338 * Initialize tableam state that describes bottom-up index deletion
339 * operation.
340 *
341 * We'll go on to ask the tableam to search for TIDs whose index tuples we
342 * can safely delete. The tableam will search until our leaf page space
343 * target is satisfied, or until the cost of continuing with the tableam
344 * operation seems too high. It focuses its efforts on TIDs associated
345 * with duplicate index tuples that we mark "promising".
346 *
347 * This space target is a little arbitrary. The tableam must be able to
348 * keep the costs and benefits in balance. We provide the tableam with
349 * exhaustive information about what might work, without directly
350 * concerning ourselves with avoiding work during the tableam call. Our
351 * role in costing the bottom-up deletion process is strictly advisory.
352 */
353 delstate.irel = rel;
354 delstate.iblknum = BufferGetBlockNumber(buf);
355 delstate.bottomup = true;
356 delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
357 delstate.ndeltids = 0;
358 delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
359 delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
360
361 minoff = P_FIRSTDATAKEY(opaque);
362 maxoff = PageGetMaxOffsetNumber(page);
363 for (offnum = minoff;
364 offnum <= maxoff;
365 offnum = OffsetNumberNext(offnum))
366 {
367 ItemId itemid = PageGetItemId(page, offnum);
368 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
369
370 Assert(!ItemIdIsDead(itemid));
371
372 if (offnum == minoff)
373 {
374 /* itup starts first pending interval */
375 _bt_dedup_start_pending(state, itup, offnum);
376 }
377 else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
 378 _bt_dedup_save_htid(state, itup))
 379 {
380 /* Tuple is equal; just added its TIDs to pending interval */
381 }
382 else
383 {
384 /* Finalize interval -- move its TIDs to delete state */
385 _bt_bottomupdel_finish_pending(page, state, &delstate);
386
387 /* itup starts new pending interval */
388 _bt_dedup_start_pending(state, itup, offnum);
389 }
390 }
391 /* Finalize final interval -- move its TIDs to delete state */
392 _bt_bottomupdel_finish_pending(page, state, &delstate);
393
394 /*
395 * We don't give up now in the event of having few (or even zero)
396 * promising tuples for the tableam because it's not up to us as the index
397 * AM to manage costs (note that the tableam might have heuristics of its
398 * own that work out what to do). We should at least avoid having our
399 * caller do a useless deduplication pass after we return in the event of
400 * zero promising tuples, though.
401 */
402 neverdedup = false;
403 if (state->nintervals == 0)
404 neverdedup = true;
405
406 pfree(state->htids);
407 pfree(state);
408
409 /* Ask tableam which TIDs are deletable, then physically delete them */
410 _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
411
412 pfree(delstate.deltids);
413 pfree(delstate.status);
414
415 /* Report "success" to caller unconditionally to avoid deduplication */
416 if (neverdedup)
417 return true;
418
419 /* Don't dedup when we won't end up back here any time soon anyway */
420 return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
421}
Size PageGetExactFreeSpace(const PageData *page)
Definition: bufpage.c:957
static Item PageGetItem(const PageData *page, const ItemIdData *itemId)
Definition: bufpage.h:354
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:244
#define Max(x, y)
Definition: c.h:969
struct ItemIdData ItemIdData
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
IndexTupleData * IndexTuple
Definition: itup.h:53
void pfree(void *pointer)
Definition: mcxt.c:1524
void * palloc(Size size)
Definition: mcxt.c:1317
bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
Definition: nbtdedup.c:484
void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, OffsetNumber baseoff)
Definition: nbtdedup.c:433
static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state, TM_IndexDeleteOp *delstate)
Definition: nbtdedup.c:648
void _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
Definition: nbtpage.c:1513
#define MaxTIDsPerBTreePage
Definition: nbtree.h:186
BTDedupStateData * BTDedupState
Definition: nbtree.h:904
int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
Definition: nbtutils.c:4009
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
TM_IndexStatus * status
Definition: tableam.h:248
int bottomupfreespace
Definition: tableam.h:243
Relation irel
Definition: tableam.h:240
TM_IndexDelete * deltids
Definition: tableam.h:247
BlockNumber iblknum
Definition: tableam.h:241
Definition: regguts.h:323

References _bt_bottomupdel_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_delitems_delete_check(), _bt_keep_natts_fast(), Assert(), TM_IndexDeleteOp::bottomup, TM_IndexDeleteOp::bottomupfreespace, BTPageGetOpaque, buf, BufferGetBlockNumber(), BufferGetPage(), TM_IndexDeleteOp::deltids, TM_IndexDeleteOp::iblknum, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, TM_IndexDeleteOp::irel, ItemIdIsDead, Max, MaxTIDsPerBTreePage, TM_IndexDeleteOp::ndeltids, OffsetNumberNext, P_FIRSTDATAKEY, PageGetExactFreeSpace(), PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), palloc(), pfree(), and TM_IndexDeleteOp::status.

Referenced by _bt_delete_or_dedup_one_page().

◆ _bt_check_natts()

bool _bt_check_natts ( Relation  rel,
bool  heapkeyspace,
Page  page,
OffsetNumber  offnum 
)

Definition at line 4056 of file nbtutils.c.

4057{
4058 int16 natts = IndexRelationGetNumberOfAttributes(rel);
4059 int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
4060 BTPageOpaque opaque = BTPageGetOpaque(page);
4061 IndexTuple itup;
4062 int tupnatts;
4063
4064 /*
4065 * We cannot reliably test a deleted or half-dead page, since they have
4066 * dummy high keys
4067 */
4068 if (P_IGNORE(opaque))
4069 return true;
4070
4071 Assert(offnum >= FirstOffsetNumber &&
4072 offnum <= PageGetMaxOffsetNumber(page));
4073
4074 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
4075 tupnatts = BTreeTupleGetNAtts(itup, rel);
4076
4077 /* !heapkeyspace indexes do not support deduplication */
4078 if (!heapkeyspace && BTreeTupleIsPosting(itup))
4079 return false;
4080
4081 /* Posting list tuples should never have "pivot heap TID" bit set */
4082 if (BTreeTupleIsPosting(itup) &&
4083 (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
4084 BT_PIVOT_HEAP_TID_ATTR) != 0)
4085 return false;
4086
4087 /* INCLUDE indexes do not support deduplication */
4088 if (natts != nkeyatts && BTreeTupleIsPosting(itup))
4089 return false;
4090
4091 if (P_ISLEAF(opaque))
4092 {
4093 if (offnum >= P_FIRSTDATAKEY(opaque))
4094 {
4095 /*
4096 * Non-pivot tuple should never be explicitly marked as a pivot
4097 * tuple
4098 */
4099 if (BTreeTupleIsPivot(itup))
4100 return false;
4101
4102 /*
4103 * Leaf tuples that are not the page high key (non-pivot tuples)
4104 * should never be truncated. (Note that tupnatts must have been
4105 * inferred, even with a posting list tuple, because only pivot
4106 * tuples store tupnatts directly.)
4107 */
4108 return tupnatts == natts;
4109 }
4110 else
4111 {
4112 /*
4113 * Rightmost page doesn't contain a page high key, so tuple was
4114 * checked above as ordinary leaf tuple
4115 */
4116 Assert(!P_RIGHTMOST(opaque));
4117
4118 /*
4119 * !heapkeyspace high key tuple contains only key attributes. Note
4120 * that tupnatts will only have been explicitly represented in
4121 * !heapkeyspace indexes that happen to have non-key attributes.
4122 */
4123 if (!heapkeyspace)
4124 return tupnatts == nkeyatts;
4125
4126 /* Use generic heapkeyspace pivot tuple handling */
4127 }
4128 }
4129 else /* !P_ISLEAF(opaque) */
4130 {
4131 if (offnum == P_FIRSTDATAKEY(opaque))
4132 {
4133 /*
4134 * The first tuple on any internal page (possibly the first after
4135 * its high key) is its negative infinity tuple. Negative
4136 * infinity tuples are always truncated to zero attributes. They
4137 * are a particular kind of pivot tuple.
4138 */
4139 if (heapkeyspace)
4140 return tupnatts == 0;
4141
4142 /*
4143 * The number of attributes won't be explicitly represented if the
4144 * negative infinity tuple was generated during a page split that
4145 * occurred with a version of Postgres before v11. There must be
4146 * a problem when there is an explicit representation that is
4147 * non-zero, or when there is no explicit representation and the
4148 * tuple is evidently not a pre-pg_upgrade tuple.
4149 *
4150 * Prior to v11, downlinks always had P_HIKEY as their offset.
4151 * Accept that as an alternative indication of a valid
4152 * !heapkeyspace negative infinity tuple.
4153 */
4154 return tupnatts == 0 ||
4155 ItemPointerGetOffsetNumber(&itup->t_tid) == P_HIKEY;
4156 }
4157 else
4158 {
4159 /*
4160 * !heapkeyspace downlink tuple with separator key contains only
4161 * key attributes. Note that tupnatts will only have been
4162 * explicitly represented in !heapkeyspace indexes that happen to
4163 * have non-key attributes.
4164 */
4165 if (!heapkeyspace)
4166 return tupnatts == nkeyatts;
4167
4168 /* Use generic heapkeyspace pivot tuple handling */
4169 }
4170 }
4171
4172 /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
4173 Assert(heapkeyspace);
4174
4175 /*
4176 * Explicit representation of the number of attributes is mandatory with
4177 * heapkeyspace index pivot tuples, regardless of whether or not there are
4178 * non-key attributes.
4179 */
4180 if (!BTreeTupleIsPivot(itup))
4181 return false;
4182
4183 /* Pivot tuple should not use posting list representation (redundant) */
4184 if (BTreeTupleIsPosting(itup))
4185 return false;
4186
4187 /*
4188 * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
4189 * when any other key attribute is truncated
4190 */
4191 if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
4192 return false;
4193
4194 /*
4195 * Pivot tuple must have at least one untruncated key attribute (minus
4196 * infinity pivot tuples are the only exception). Pivot tuples can never
4197 * represent that there is a value present for a key attribute that
4198 * exceeds pg_index.indnkeyatts for the index.
4199 */
4200 return tupnatts > 0 && tupnatts <= nkeyatts;
4201}
int16_t int16
Definition: c.h:497
#define BT_PIVOT_HEAP_TID_ATTR
Definition: nbtree.h:466
#define P_HIKEY
Definition: nbtree.h:368
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:220
#define P_IGNORE(opaque)
Definition: nbtree.h:226
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:493
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:639
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:578
#define FirstOffsetNumber
Definition: off.h:27
ItemPointerData t_tid
Definition: itup.h:37

References Assert(), BT_PIVOT_HEAP_TID_ATTR, BTPageGetOpaque, BTreeTupleGetHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPivot(), BTreeTupleIsPosting(), FirstOffsetNumber, IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, ItemPointerGetOffsetNumber(), ItemPointerGetOffsetNumberNoCheck(), P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_ISLEAF, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and IndexTupleData::t_tid.

Referenced by _bt_compare(), and bt_target_page_check().
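The following is an illustrative sketch only, not part of the PostgreSQL sources: a hypothetical helper that runs _bt_check_natts() against every item on an already pinned and locked page, the way amcheck-style validation might. The helper name and the assumption that rel, heapkeyspace, and page were set up by the caller are invented for the example.

#include "postgres.h"
#include "access/nbtree.h"

/* Hypothetical helper: assert a plausible attribute count for every item */
static void
check_page_natts(Relation rel, bool heapkeyspace, Page page)
{
    OffsetNumber maxoff = PageGetMaxOffsetNumber(page);

    for (OffsetNumber off = FirstOffsetNumber;
         off <= maxoff;
         off = OffsetNumberNext(off))
        Assert(_bt_check_natts(rel, heapkeyspace, page, off));
}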

◆ _bt_check_third_page()

void _bt_check_third_page ( Relation  rel,
Relation  heap,
bool  needheaptidspace,
Page  page,
IndexTuple  newtup 
)

Definition at line 4216 of file nbtutils.c.

4218{
4219 Size itemsz;
4220 BTPageOpaque opaque;
4221
4222 itemsz = MAXALIGN(IndexTupleSize(newtup));
4223
4224 /* Double check item size against limit */
4225 if (itemsz <= BTMaxItemSize)
4226 return;
4227
4228 /*
4229 * Tuple is probably too large to fit on page, but it's possible that the
4230 * index uses version 2 or version 3, or that page is an internal page, in
4231 * which case a slightly higher limit applies.
4232 */
4233 if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid)
4234 return;
4235
4236 /*
4237 * Internal page insertions cannot fail here, because that would mean that
4238 * an earlier leaf level insertion that should have failed didn't
4239 */
4240 opaque = BTPageGetOpaque(page);
4241 if (!P_ISLEAF(opaque))
4242 elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
4243 itemsz, RelationGetRelationName(rel));
4244
4245 ereport(ERROR,
4246 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
4247 errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
4248 itemsz,
4249 needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
4250 needheaptidspace ? BTMaxItemSize : BTMaxItemSizeNoHeapTid,
4251 RelationGetRelationName(rel)),
4252 errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
4253 ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)),
4254 ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)),
4255 RelationGetRelationName(heap)),
4256 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
4257 "Consider a function index of an MD5 hash of the value, "
4258 "or use full text indexing."),
4259 errtableconstraint(heap, RelationGetRelationName(rel))));
4260}
size_t Size
Definition: c.h:576
int errdetail(const char *fmt,...)
Definition: elog.c:1204
int errhint(const char *fmt,...)
Definition: elog.c:1318
int errmsg(const char *fmt,...)
Definition: elog.c:1071
static Size IndexTupleSize(const IndexTupleData *itup)
Definition: itup.h:71
#define BTREE_VERSION
Definition: nbtree.h:151
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:153
#define BTMaxItemSizeNoHeapTid
Definition: nbtree.h:170
#define BTMaxItemSize
Definition: nbtree.h:165
int errtableconstraint(Relation rel, const char *conname)
Definition: relcache.c:6086

References BTMaxItemSize, BTMaxItemSizeNoHeapTid, BTPageGetOpaque, BTREE_NOVAC_VERSION, BTREE_VERSION, BTreeTupleGetHeapTID(), elog, ereport, errcode(), errdetail(), errhint(), errmsg(), ERROR, errtableconstraint(), IndexTupleSize(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), MAXALIGN, P_ISLEAF, and RelationGetRelationName.

Referenced by _bt_buildadd(), and _bt_findinsertloc().
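A hedged usage sketch (assumptions: rel, heap, page, and itup come from an insertion or build path, and the index is a current-version index so heap TID space must be reserved; the wrapper name is hypothetical):

#include "postgres.h"
#include "access/nbtree.h"

/* Hypothetical wrapper: fail cleanly before attempting an oversized insertion */
static void
reject_oversized_tuple(Relation rel, Relation heap, Page page, IndexTuple itup)
{
    bool    needheaptidspace = true;    /* assume a BTREE_VERSION (v4) index */

    /* ereport(ERROR)s when itup cannot fit on a leaf page; otherwise returns */
    _bt_check_third_page(rel, heap, needheaptidspace, page, itup);
}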

◆ _bt_checkkeys()

bool _bt_checkkeys ( IndexScanDesc  scan,
BTReadPageState pstate,
bool  arrayKeys,
IndexTuple  tuple,
int  tupnatts 
)

Definition at line 2261 of file nbtutils.c.

2263{
2264 TupleDesc tupdesc = RelationGetDescr(scan->indexRelation);
2265 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2266 ScanDirection dir = so->currPos.dir;
2267 int ikey = pstate->startikey;
2268 bool res;
2269
2270 Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
2271 Assert(!so->needPrimScan && !so->scanBehind && !so->oppositeDirCheck);
2272 Assert(arrayKeys || so->numArrayKeys == 0);
2273
2274 res = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, arrayKeys,
2275 pstate->forcenonrequired, &pstate->continuescan,
2276 &ikey);
2277
2278 /*
2279 * If _bt_check_compare relied on the pstate.startikey optimization, call
2280 * again (in assert-enabled builds) to verify it didn't affect our answer.
2281 *
2282 * Note: we can't do this when !pstate.forcenonrequired, since any arrays
2283 * before pstate.startikey won't have advanced on this page at all.
2284 */
2285 Assert(!pstate->forcenonrequired || arrayKeys);
2286#ifdef USE_ASSERT_CHECKING
2287 if (pstate->startikey > 0 && !pstate->forcenonrequired)
2288 {
2289 bool dres,
2290 dcontinuescan;
2291 int dikey = 0;
2292
2293 /* Pass arrayKeys=false to avoid array side-effects */
2294 dres = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, false,
2295 pstate->forcenonrequired, &dcontinuescan,
2296 &dikey);
2297 Assert(res == dres);
2298 Assert(pstate->continuescan == dcontinuescan);
2299
2300 /*
2301 * Should also get the same ikey result. We need a slightly weaker
2302 * assertion during arrayKeys calls, since they might be using an
2303 * array that couldn't be marked required during preprocessing.
2304 */
2305 Assert(arrayKeys || ikey == dikey);
2306 Assert(ikey <= dikey);
2307 }
2308#endif
2309
2310 /*
2311 * Only one _bt_check_compare call is required in the common case where
2312 * there are no equality strategy array scan keys. Otherwise we can only
2313 * accept _bt_check_compare's answer unreservedly when it didn't set
2314 * pstate.continuescan=false.
2315 */
2316 if (!arrayKeys || pstate->continuescan)
2317 return res;
2318
2319 /*
2320 * _bt_check_compare call set continuescan=false in the presence of
2321 * equality type array keys. This could mean that the tuple is just past
2322 * the end of matches for the current array keys.
2323 *
2324 * It's also possible that the scan is still _before_ the _start_ of
2325 * tuples matching the current set of array keys. Check for that first.
2326 */
2327 Assert(!pstate->forcenonrequired);
2328 if (_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, true,
2329 ikey, NULL))
2330 {
2331 /* Override _bt_check_compare, continue primitive scan */
2332 pstate->continuescan = true;
2333
2334 /*
2335 * We will end up here repeatedly given a group of tuples > the
2336 * previous array keys and < the now-current keys (for a backwards
2337 * scan it's just the same, though the operators swap positions).
2338 *
2339 * We must avoid allowing this linear search process to scan very many
2340 * tuples from well before the start of tuples matching the current
2341 * array keys (or from well before the point where we'll once again
2342 * have to advance the scan's array keys).
2343 *
2344 * We keep the overhead under control by speculatively "looking ahead"
2345 * to later still-unscanned items from this same leaf page. We'll
2346 * only attempt this once the number of tuples that the linear search
2347 * process has examined starts to get out of hand.
2348 */
2349 pstate->rechecks++;
2350 if (pstate->rechecks >= LOOK_AHEAD_REQUIRED_RECHECKS)
2351 {
2352 /* See if we should skip ahead within the current leaf page */
2353 _bt_checkkeys_look_ahead(scan, pstate, tupnatts, tupdesc);
2354
2355 /*
2356 * Might have set pstate.skip to a later page offset. When that
2357 * happens then _bt_readpage caller will inexpensively skip ahead
2358 * to a later tuple from the same page (the one just after the
2359 * tuple we successfully "looked ahead" to).
2360 */
2361 }
2362
2363 /* This indextuple doesn't match the current qual, in any case */
2364 return false;
2365 }
2366
2367 /*
2368 * Caller's tuple is >= the current set of array keys and other equality
2369 * constraint scan keys (or <= if this is a backwards scan). It's now
2370 * clear that we _must_ advance any required array keys in lockstep with
2371 * the scan.
2372 */
2373 return _bt_advance_array_keys(scan, pstate, tuple, tupnatts, tupdesc,
2374 ikey, true);
2375}
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:1096
static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, int sktrig, bool sktrig_required)
Definition: nbtutils.c:1460
#define LOOK_AHEAD_REQUIRED_RECHECKS
Definition: nbtutils.c:27
static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, bool advancenonrequired, bool forcenonrequired, bool *continuescan, int *ikey)
Definition: nbtutils.c:2795
static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, TupleDesc tupdesc, int tupnatts, bool readpagetup, int sktrig, bool *scanBehind)
Definition: nbtutils.c:1147
static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, int tupnatts, TupleDesc tupdesc)
Definition: nbtutils.c:3208
#define RelationGetDescr(relation)
Definition: rel.h:542
ScanDirection
Definition: sdir.h:25
bool forcenonrequired
Definition: nbtree.h:1109
bool continuescan
Definition: nbtree.h:1117
int16 rechecks
Definition: nbtree.h:1123
bool needPrimScan
Definition: nbtree.h:1063
BTScanPosData currPos
Definition: nbtree.h:1092
bool oppositeDirCheck
Definition: nbtree.h:1065
ScanDirection dir
Definition: nbtree.h:973
Relation indexRelation
Definition: relscan.h:137

References _bt_advance_array_keys(), _bt_check_compare(), _bt_checkkeys_look_ahead(), _bt_tuple_before_array_skeys(), Assert(), BTreeTupleGetNAtts, BTReadPageState::continuescan, BTScanOpaqueData::currPos, BTScanPosData::dir, BTReadPageState::forcenonrequired, IndexScanDescData::indexRelation, LOOK_AHEAD_REQUIRED_RECHECKS, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, BTReadPageState::rechecks, RelationGetDescr, BTScanOpaqueData::scanBehind, and BTReadPageState::startikey.

Referenced by _bt_readpage().

◆ _bt_checkpage()

void _bt_checkpage ( Relation  rel,
Buffer  buf 
)

Definition at line 797 of file nbtpage.c.

798{
799 Page page = BufferGetPage(buf);
800
801 /*
802 * ReadBuffer verifies that every newly-read page passes
803 * PageHeaderIsValid, which means it either contains a reasonably sane
804 * page header or is all-zero. We have to defend against the all-zero
805 * case, however.
806 */
807 if (PageIsNew(page))
808 ereport(ERROR,
809 (errcode(ERRCODE_INDEX_CORRUPTED),
810 errmsg("index \"%s\" contains unexpected zero page at block %u",
811 RelationGetRelationName(rel),
812 BufferGetBlockNumber(buf)),
813 errhint("Please REINDEX it.")));
814
815 /*
816 * Additionally check that the special area looks sane.
817 */
818 if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
819 ereport(ERROR,
820 (errcode(ERRCODE_INDEX_CORRUPTED),
821 errmsg("index \"%s\" contains corrupted page at block %u",
822 RelationGetRelationName(rel),
823 BufferGetBlockNumber(buf)),
824 errhint("Please REINDEX it.")));
825}
static uint16 PageGetSpecialSize(const PageData *page)
Definition: bufpage.h:317

References buf, BufferGetBlockNumber(), BufferGetPage(), ereport, errcode(), errhint(), errmsg(), ERROR, MAXALIGN, PageGetSpecialSize(), PageIsNew(), and RelationGetRelationName.

Referenced by _bt_getbuf(), _bt_relandgetbuf(), _bt_search_insert(), bt_recheck_sibling_links(), btvacuumpage(), and palloc_btree_page().
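A sketch of the usual read-then-verify pattern, similar in spirit to what _bt_getbuf() does; the helper name is hypothetical and all error handling beyond _bt_checkpage() itself is omitted:

#include "postgres.h"
#include "access/nbtree.h"
#include "storage/bufmgr.h"

/* Hypothetical helper: pin, share-lock, and sanity-check one index page */
static Buffer
read_checked_page(Relation rel, BlockNumber blkno)
{
    Buffer  buf = ReadBuffer(rel, blkno);

    LockBuffer(buf, BT_READ);
    _bt_checkpage(rel, buf);    /* ereport(ERROR)s on zeroed or corrupt pages */
    return buf;
}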

◆ _bt_compare()

int32 _bt_compare ( Relation  rel,
BTScanInsert  key,
Page  page,
OffsetNumber  offnum 
)

Definition at line 688 of file nbtsearch.c.

692{
693 TupleDesc itupdesc = RelationGetDescr(rel);
694 BTPageOpaque opaque = BTPageGetOpaque(page);
695 IndexTuple itup;
696 ItemPointer heapTid;
697 ScanKey scankey;
698 int ncmpkey;
699 int ntupatts;
700 int32 result;
701
702 Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
703 Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
704 Assert(key->heapkeyspace || key->scantid == NULL);
705
706 /*
707 * Force result ">" if target item is first data item on an internal page
708 * --- see NOTE above.
709 */
710 if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
711 return 1;
712
713 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
714 ntupatts = BTreeTupleGetNAtts(itup, rel);
715
716 /*
717 * The scan key is set up with the attribute number associated with each
718 * term in the key. It is important that, if the index is multi-key, the
719 * scan contain the first k key attributes, and that they be in order. If
720 * you think about how multi-key ordering works, you'll understand why
721 * this is.
722 *
723 * We don't test for violation of this condition here, however. The
724 * initial setup for the index scan had better have gotten it right (see
725 * _bt_first).
726 */
727
728 ncmpkey = Min(ntupatts, key->keysz);
729 Assert(key->heapkeyspace || ncmpkey == key->keysz);
730 Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
731 scankey = key->scankeys;
732 for (int i = 1; i <= ncmpkey; i++)
733 {
734 Datum datum;
735 bool isNull;
736
737 datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
738
739 if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
740 {
741 if (isNull)
742 result = 0; /* NULL "=" NULL */
743 else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
744 result = -1; /* NULL "<" NOT_NULL */
745 else
746 result = 1; /* NULL ">" NOT_NULL */
747 }
748 else if (isNull) /* key is NOT_NULL and item is NULL */
749 {
750 if (scankey->sk_flags & SK_BT_NULLS_FIRST)
751 result = 1; /* NOT_NULL ">" NULL */
752 else
753 result = -1; /* NOT_NULL "<" NULL */
754 }
755 else
756 {
757 /*
758 * The sk_func needs to be passed the index value as left arg and
759 * the sk_argument as right arg (they might be of different
760 * types). Since it is convenient for callers to think of
761 * _bt_compare as comparing the scankey to the index item, we have
762 * to flip the sign of the comparison result. (Unless it's a DESC
763 * column, in which case we *don't* flip the sign.)
764 */
765 result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
766 scankey->sk_collation,
767 datum,
768 scankey->sk_argument));
769
770 if (!(scankey->sk_flags & SK_BT_DESC))
771 INVERT_COMPARE_RESULT(result);
772 }
773
774 /* if the keys are unequal, return the difference */
775 if (result != 0)
776 return result;
777
778 scankey++;
779 }
780
781 /*
782 * All non-truncated attributes (other than heap TID) were found to be
783 * equal. Treat truncated attributes as minus infinity when scankey has a
784 * key attribute value that would otherwise be compared directly.
785 *
786 * Note: it doesn't matter if ntupatts includes non-key attributes;
787 * scankey won't, so explicitly excluding non-key attributes isn't
788 * necessary.
789 */
790 if (key->keysz > ntupatts)
791 return 1;
792
793 /*
794 * Use the heap TID attribute and scantid to try to break the tie. The
795 * rules are the same as any other key attribute -- only the
796 * representation differs.
797 */
798 heapTid = BTreeTupleGetHeapTID(itup);
799 if (key->scantid == NULL)
800 {
801 /*
802 * Forward scans have a scankey that is considered greater than a
803 * truncated pivot tuple if and when the scankey has equal values for
804 * attributes up to and including the least significant untruncated
805 * attribute in tuple. Even attributes that were omitted from the
806 * scan key are considered greater than -inf truncated attributes.
807 * (See _bt_binsrch for an explanation of our backward scan behavior.)
808 *
809 * For example, if an index has the minimum two attributes (single
810 * user key attribute, plus heap TID attribute), and a page's high key
811 * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
812 * will not descend to the page to the left. The search will descend
813 * right instead. The truncated attribute in pivot tuple means that
814 * all non-pivot tuples on the page to the left are strictly < 'foo',
815 * so it isn't necessary to descend left. In other words, search
816 * doesn't have to descend left because it isn't interested in a match
817 * that has a heap TID value of -inf.
818 *
819 * Note: the heap TID part of the test ensures that scankey is being
820 * compared to a pivot tuple with one or more truncated -inf key
821 * attributes. The heap TID attribute is the last key attribute in
822 * every index, of course, but other than that it isn't special.
823 */
824 if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
825 key->heapkeyspace)
826 return 1;
827
828 /* All provided scankey arguments found to be equal */
829 return 0;
830 }
831
832 /*
833 * Treat truncated heap TID as minus infinity, since scankey has a key
834 * attribute value (scantid) that would otherwise be compared directly
835 */
836 Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
837 if (heapTid == NULL)
838 return 1;
839
840 /*
841 * Scankey must be treated as equal to a posting list tuple if its scantid
842 * value falls within the range of the posting list. In all other cases
843 * there can only be a single heap TID value, which is compared directly
844 * with scantid.
845 */
846 Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
847 result = ItemPointerCompare(key->scantid, heapTid);
848 if (result <= 0 || !BTreeTupleIsPosting(itup))
849 return result;
850 else
851 {
852 result = ItemPointerCompare(key->scantid,
853 BTreeTupleGetMaxHeapTID(itup));
854 if (result > 0)
855 return 1;
856 }
857
858 return 0;
859}
#define Min(x, y)
Definition: c.h:975
#define INVERT_COMPARE_RESULT(var)
Definition: c.h:1077
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1149
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:51
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: itup.h:131
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:1147
#define SK_BT_DESC
Definition: nbtree.h:1146
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:665
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:4056
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:207
int sk_flags
Definition: skey.h:66
Datum sk_argument
Definition: skey.h:72
FmgrInfo sk_func
Definition: skey.h:71
Oid sk_collation
Definition: skey.h:70
AttrNumber sk_attno
Definition: skey.h:67

References _bt_check_natts(), Assert(), BTPageGetOpaque, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNAtts, BTreeTupleIsPosting(), DatumGetInt32(), FunctionCall2Coll(), i, index_getattr(), IndexRelationGetNumberOfKeyAttributes, INVERT_COMPARE_RESULT, ItemPointerCompare(), sort-test::key, Min, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem(), PageGetItemId(), RelationGetDescr, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_binsrch_insert(), _bt_check_unique(), _bt_findinsertloc(), _bt_moveright(), _bt_search_insert(), bt_rootdescend(), bt_target_page_check(), invariant_g_offset(), invariant_l_nontarget_offset(), invariant_l_offset(), and invariant_leq_offset().
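A simplified sketch of how a caller can interpret the three-way result; the real search code uses binary search (_bt_binsrch()), so this linear helper and its name are purely illustrative:

#include "postgres.h"
#include "access/nbtree.h"

/* Hypothetical helper: linear scan for the first item >= key on a leaf page */
static OffsetNumber
first_offnum_geq(Relation rel, BTScanInsert key, Page page)
{
    BTPageOpaque opaque = BTPageGetOpaque(page);
    OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
    OffsetNumber off;

    for (off = P_FIRSTDATAKEY(opaque); off <= maxoff; off = OffsetNumberNext(off))
    {
        /* result is <0, 0, >0 as the scankey is <, =, > the indexed item */
        if (_bt_compare(rel, key, page, off) <= 0)
            break;
    }
    return off;             /* maxoff + 1 when key is greater than every item */
}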

◆ _bt_conditionallockbuf()

bool _bt_conditionallockbuf ( Relation  rel,
Buffer  buf 
)

Definition at line 1093 of file nbtpage.c.

1094{
1095 /* ConditionalLockBuffer() asserts that pin is held by this backend */
1096 if (!ConditionalLockBuffer(buf))
1097 return false;
1098
1099 if (!RelationUsesLocalBuffers(rel))
1100 VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1101
1102 return true;
1103}
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:5563

References buf, BufferGetPage(), ConditionalLockBuffer(), RelationUsesLocalBuffers, and VALGRIND_MAKE_MEM_DEFINED.

Referenced by _bt_allocbuf(), and _bt_search_insert().
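A small sketch of the opportunistic locking pattern used by the _bt_search_insert() fastpath; the helper name is hypothetical and the caller is assumed to already hold a pin on buf:

#include "postgres.h"
#include "access/nbtree.h"

/* Hypothetical helper: try for the exclusive lock without blocking */
static bool
try_exclusive_lock(Relation rel, Buffer buf)
{
    if (!_bt_conditionallockbuf(rel, buf))
        return false;           /* someone else holds a conflicting lock */

    /* caller now holds buf exclusively locked and may modify the page */
    return true;
}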

◆ _bt_dedup_finish_pending()

Size _bt_dedup_finish_pending ( Page  newpage,
BTDedupState  state 
)

Definition at line 555 of file nbtdedup.c.

556{
557 OffsetNumber tupoff;
558 Size tuplesz;
559 Size spacesaving;
560
561 Assert(state->nitems > 0);
562 Assert(state->nitems <= state->nhtids);
563 Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
564
565 tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
566 if (state->nitems == 1)
567 {
568 /* Use original, unchanged base tuple */
569 tuplesz = IndexTupleSize(state->base);
570 Assert(tuplesz == MAXALIGN(IndexTupleSize(state->base)));
571 Assert(tuplesz <= BTMaxItemSize);
572 if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
573 false, false) == InvalidOffsetNumber)
574 elog(ERROR, "deduplication failed to add tuple to page");
575
576 spacesaving = 0;
577 }
578 else
579 {
580 IndexTuple final;
581
582 /* Form a tuple with a posting list */
583 final = _bt_form_posting(state->base, state->htids, state->nhtids);
584 tuplesz = IndexTupleSize(final);
585 Assert(tuplesz <= state->maxpostingsize);
586
587 /* Save final number of items for posting list */
588 state->intervals[state->nintervals].nitems = state->nitems;
589
590 Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
591 Assert(tuplesz <= BTMaxItemSize);
592 if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
593 false) == InvalidOffsetNumber)
594 elog(ERROR, "deduplication failed to add tuple to page");
595
596 pfree(final);
597 spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
598 /* Increment nintervals, since we wrote a new posting list tuple */
599 state->nintervals++;
600 Assert(spacesaving > 0 && spacesaving < BLCKSZ);
601 }
602
603 /* Reset state for next pending posting list */
604 state->nhtids = 0;
605 state->nitems = 0;
606 state->phystupsize = 0;
607
608 return spacesaving;
609}
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:472
Pointer Item
Definition: item.h:17
IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
Definition: nbtdedup.c:864

References _bt_form_posting(), Assert(), BTMaxItemSize, elog, ERROR, IndexTupleSize(), InvalidOffsetNumber, MAXALIGN, OffsetNumberNext, PageAddItem, PageGetMaxOffsetNumber(), and pfree().

Referenced by _bt_dedup_pass(), and btree_xlog_dedup().

◆ _bt_dedup_pass()

void _bt_dedup_pass ( Relation  rel,
Buffer  buf,
IndexTuple  newitem,
Size  newitemsz,
bool  bottomupdedup 
)

Definition at line 58 of file nbtdedup.c.

60{
61 OffsetNumber offnum,
62 minoff,
63 maxoff;
64 Page page = BufferGetPage(buf);
65 BTPageOpaque opaque = BTPageGetOpaque(page);
66 Page newpage;
67 BTDedupState state;
68 Size pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
69 bool singlevalstrat = false;
70 int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
71
72 /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
73 newitemsz += sizeof(ItemIdData);
74
75 /*
76 * Initialize deduplication state.
77 *
78 * It would be possible for maxpostingsize (limit on posting list tuple
79 * size) to be set to one third of the page. However, it seems like a
80 * good idea to limit the size of posting lists to one sixth of a page.
81 * That ought to leave us with a good split point when pages full of
82 * duplicates can be split several times.
83 */
84 state = (BTDedupState) palloc(sizeof(BTDedupStateData));
85 state->deduplicate = true;
86 state->nmaxitems = 0;
87 state->maxpostingsize = Min(BTMaxItemSize / 2, INDEX_SIZE_MASK);
88 /* Metadata about base tuple of current pending posting list */
89 state->base = NULL;
90 state->baseoff = InvalidOffsetNumber;
91 state->basetupsize = 0;
92 /* Metadata about current pending posting list TIDs */
93 state->htids = palloc(state->maxpostingsize);
94 state->nhtids = 0;
95 state->nitems = 0;
96 /* Size of all physical tuples to be replaced by pending posting list */
97 state->phystupsize = 0;
98 /* nintervals should be initialized to zero */
99 state->nintervals = 0;
100
101 minoff = P_FIRSTDATAKEY(opaque);
102 maxoff = PageGetMaxOffsetNumber(page);
103
104 /*
105 * Consider applying "single value" strategy, though only if the page
106 * seems likely to be split in the near future
107 */
108 if (!bottomupdedup)
109 singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
110
111 /*
112 * Deduplicate items from page, and write them to newpage.
113 *
114 * Copy the original page's LSN into newpage copy. This will become the
115 * updated version of the page. We need this because XLogInsert will
116 * examine the LSN and possibly dump it in a page image.
117 */
118 newpage = PageGetTempPageCopySpecial(page);
119 PageSetLSN(newpage, PageGetLSN(page));
120
121 /* Copy high key, if any */
122 if (!P_RIGHTMOST(opaque))
123 {
124 ItemId hitemid = PageGetItemId(page, P_HIKEY);
125 Size hitemsz = ItemIdGetLength(hitemid);
126 IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid);
127
128 if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
129 false, false) == InvalidOffsetNumber)
130 elog(ERROR, "deduplication failed to add highkey");
131 }
132
133 for (offnum = minoff;
134 offnum <= maxoff;
135 offnum = OffsetNumberNext(offnum))
136 {
137 ItemId itemid = PageGetItemId(page, offnum);
138 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
139
140 Assert(!ItemIdIsDead(itemid));
141
142 if (offnum == minoff)
143 {
144 /*
145 * No previous/base tuple for the data item -- use the data item
146 * as base tuple of pending posting list
147 */
148 _bt_dedup_start_pending(state, itup, offnum);
149 }
150 else if (state->deduplicate &&
151 _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
152 _bt_dedup_save_htid(state, itup))
153 {
154 /*
155 * Tuple is equal to base tuple of pending posting list. Heap
156 * TID(s) for itup have been saved in state.
157 */
158 }
159 else
160 {
161 /*
162 * Tuple is not equal to pending posting list tuple, or
163 * _bt_dedup_save_htid() opted to not merge current item into
164 * pending posting list for some other reason (e.g., adding more
165 * TIDs would have caused posting list to exceed current
166 * maxpostingsize).
167 *
168 * If state contains pending posting list with more than one item,
169 * form new posting tuple and add it to our temp page (newpage).
170 * Else add pending interval's base tuple to the temp page as-is.
171 */
172 pagesaving += _bt_dedup_finish_pending(newpage, state);
173
174 if (singlevalstrat)
175 {
176 /*
177 * Single value strategy's extra steps.
178 *
179 * Lower maxpostingsize for sixth and final large posting list
180 * tuple at the point where 5 maxpostingsize-capped tuples
181 * have either been formed or observed.
182 *
183 * When a sixth maxpostingsize-capped item is formed/observed,
184 * stop merging together tuples altogether. The few tuples
185 * that remain at the end of the page won't be merged together
186 * at all (at least not until after a future page split takes
187 * place, when this page's newly allocated right sibling page
188 * gets its first deduplication pass).
189 */
190 if (state->nmaxitems == 5)
191 _bt_singleval_fillfactor(page, state, newitemsz);
192 else if (state->nmaxitems == 6)
193 {
194 state->deduplicate = false;
195 singlevalstrat = false; /* won't be back here */
196 }
197 }
198
199 /* itup starts new pending posting list */
200 _bt_dedup_start_pending(state, itup, offnum);
201 }
202 }
203
204 /* Handle the last item */
205 pagesaving += _bt_dedup_finish_pending(newpage, state);
206
207 /*
208 * If no items suitable for deduplication were found, newpage must be
209 * exactly the same as the original page, so just return from function.
210 *
211 * We could determine whether or not to proceed on the basis the space
212 * savings being sufficient to avoid an immediate page split instead. We
213 * don't do that because there is some small value in nbtsplitloc.c always
214 * operating against a page that is fully deduplicated (apart from
215 * newitem). Besides, most of the cost has already been paid.
216 */
217 if (state->nintervals == 0)
218 {
219 /* cannot leak memory here */
220 pfree(newpage);
221 pfree(state->htids);
222 pfree(state);
223 return;
224 }
225
226 /*
227 * By here, it's clear that deduplication will definitely go ahead.
228 *
229 * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace
230 * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
231 * But keep things tidy.
232 */
233 if (P_HAS_GARBAGE(opaque))
234 {
235 BTPageOpaque nopaque = BTPageGetOpaque(newpage);
236
237 nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
238 }
239
240 START_CRIT_SECTION();
241
242 PageRestoreTempPage(newpage, page);
243 MarkBufferDirty(buf);
244
245 /* XLOG stuff */
246 if (RelationNeedsWAL(rel))
247 {
248 XLogRecPtr recptr;
249 xl_btree_dedup xlrec_dedup;
250
251 xlrec_dedup.nintervals = state->nintervals;
252
253 XLogBeginInsert();
254 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
255 XLogRegisterData(&xlrec_dedup, SizeOfBtreeDedup);
256
257 /*
258 * The intervals array is not in the buffer, but pretend that it is.
259 * When XLogInsert stores the whole buffer, the array need not be
260 * stored too.
261 */
262 XLogRegisterBufData(0, state->intervals,
263 state->nintervals * sizeof(BTDedupInterval));
264
265 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
266
267 PageSetLSN(page, recptr);
268 }
269
270 END_CRIT_SECTION();
271
272 /* Local space accounting should agree with page accounting */
273 Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
274
275 /* cannot leak memory here */
276 pfree(state->htids);
277 pfree(state);
278}
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2945
void PageRestoreTempPage(Page tempPage, Page oldPage)
Definition: bufpage.c:423
Page PageGetTempPageCopySpecial(const PageData *page)
Definition: bufpage.c:401
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:391
static XLogRecPtr PageGetLSN(const PageData *page)
Definition: bufpage.h:386
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:224
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
#define INDEX_SIZE_MASK
Definition: itup.h:65
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state, OffsetNumber minoff, IndexTuple newitem)
Definition: nbtdedup.c:782
Size _bt_dedup_finish_pending(Page newpage, BTDedupState state)
Definition: nbtdedup.c:555
static void _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
Definition: nbtdedup.c:822
#define P_HAS_GARBAGE(opaque)
Definition: nbtree.h:227
#define XLOG_BTREE_DEDUP
Definition: nbtxlog.h:33
#define SizeOfBtreeDedup
Definition: nbtxlog.h:174
uint16 btpo_flags
Definition: nbtree.h:68
uint16 nintervals
Definition: nbtxlog.h:169
uint64 XLogRecPtr
Definition: xlogdefs.h:21
void XLogRegisterBufData(uint8 block_id, const void *data, uint32 len)
Definition: xloginsert.c:405
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:242
#define REGBUF_STANDARD
Definition: xloginsert.h:35

References _bt_dedup_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_do_singleval(), _bt_keep_natts_fast(), _bt_singleval_fillfactor(), Assert(), BTMaxItemSize, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, buf, BufferGetPage(), elog, END_CRIT_SECTION, ERROR, INDEX_SIZE_MASK, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, ItemIdGetLength, ItemIdIsDead, MarkBufferDirty(), Min, xl_btree_dedup::nintervals, OffsetNumberNext, P_FIRSTDATAKEY, P_HAS_GARBAGE, P_HIKEY, P_RIGHTMOST, PageAddItem, PageGetExactFreeSpace(), PageGetItem(), PageGetItemId(), PageGetLSN(), PageGetMaxOffsetNumber(), PageGetTempPageCopySpecial(), PageRestoreTempPage(), PageSetLSN(), palloc(), pfree(), PG_USED_FOR_ASSERTS_ONLY, REGBUF_STANDARD, RelationNeedsWAL, SizeOfBtreeDedup, START_CRIT_SECTION, XLOG_BTREE_DEDUP, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_delete_or_dedup_one_page().
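An illustrative call site, modelled loosely on _bt_delete_or_dedup_one_page(); the wrapper name is hypothetical, and the caller is assumed to hold an exclusive lock on buf:

#include "postgres.h"
#include "access/nbtree.h"

/* Hypothetical wrapper: attempt deduplication before resorting to a page split */
static void
dedup_before_split(Relation rel, Buffer buf, IndexTuple newitem)
{
    /* MAXALIGNed size without line pointer overhead, as _bt_dedup_pass() expects */
    Size    newitemsz = MAXALIGN(IndexTupleSize(newitem));

    /* final argument false: this is not a bottom-up deletion pass */
    _bt_dedup_pass(rel, buf, newitem, newitemsz, false);
}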

◆ _bt_dedup_save_htid()

bool _bt_dedup_save_htid ( BTDedupState  state,
IndexTuple  itup 
)

Definition at line 484 of file nbtdedup.c.

485{
486 int nhtids;
487 ItemPointer htids;
488 Size mergedtupsz;
489
490 Assert(!BTreeTupleIsPivot(itup));
491
492 if (!BTreeTupleIsPosting(itup))
493 {
494 nhtids = 1;
495 htids = &itup->t_tid;
496 }
497 else
498 {
499 nhtids = BTreeTupleGetNPosting(itup);
500 htids = BTreeTupleGetPosting(itup);
501 }
502
503 /*
504 * Don't append (have caller finish pending posting list as-is) if
505 * appending heap TID(s) from itup would put us over maxpostingsize limit.
506 *
507 * This calculation needs to match the code used within _bt_form_posting()
508 * for new posting list tuples.
509 */
510 mergedtupsz = MAXALIGN(state->basetupsize +
511 (state->nhtids + nhtids) * sizeof(ItemPointerData));
512
513 if (mergedtupsz > state->maxpostingsize)
514 {
515 /*
516 * Count this as an oversized item for single value strategy, though
517 * only when there are 50 TIDs in the final posting list tuple. This
518 * limit (which is fairly arbitrary) avoids confusion about how many
519 * 1/6 of a page tuples have been encountered/created by the current
520 * deduplication pass.
521 *
522 * Note: We deliberately don't consider which deduplication pass
523 * merged together tuples to create this item (could be a previous
524 * deduplication pass, or current pass). See _bt_do_singleval()
525 * comments.
526 */
527 if (state->nhtids > 50)
528 state->nmaxitems++;
529
530 return false;
531 }
532
533 /*
534 * Save heap TIDs to pending posting list tuple -- itup can be merged into
535 * pending posting list
536 */
537 state->nitems++;
538 memcpy(state->htids + state->nhtids, htids,
539 sizeof(ItemPointerData) * nhtids);
540 state->nhtids += nhtids;
541 state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
542
543 return true;
544}
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:519
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition: nbtree.h:538

References Assert(), BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), IndexTupleSize(), MAXALIGN, and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_load(), and btree_xlog_dedup().

◆ _bt_dedup_start_pending()

void _bt_dedup_start_pending ( BTDedupState  state,
IndexTuple  base,
OffsetNumber  baseoff 
)

Definition at line 433 of file nbtdedup.c.

435{
436 Assert(state->nhtids == 0);
437 Assert(state->nitems == 0);
438 Assert(!BTreeTupleIsPivot(base));
439
440 /*
441 * Copy heap TID(s) from new base tuple for new candidate posting list
442 * into working state's array
443 */
444 if (!BTreeTupleIsPosting(base))
445 {
446 memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
447 state->nhtids = 1;
448 state->basetupsize = IndexTupleSize(base);
449 }
450 else
451 {
452 int nposting;
453
454 nposting = BTreeTupleGetNPosting(base);
455 memcpy(state->htids, BTreeTupleGetPosting(base),
456 sizeof(ItemPointerData) * nposting);
457 state->nhtids = nposting;
458 /* basetupsize should not include existing posting list */
459 state->basetupsize = BTreeTupleGetPostingOffset(base);
460 }
461
462 /*
463 * Save new base tuple itself -- it'll be needed if we actually create a
464 * new posting list from new pending posting list.
465 *
466 * Must maintain physical size of all existing tuples (including line
467 * pointer overhead) so that we can calculate space savings on page.
468 */
469 state->nitems = 1;
470 state->base = base;
471 state->baseoff = baseoff;
472 state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
473 /* Also save baseoff in pending state for interval */
474 state->intervals[state->nintervals].baseoff = state->baseoff;
475}
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:530

References Assert(), BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), IndexTupleSize(), MAXALIGN, and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_load(), and btree_xlog_dedup().
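The three deduplication-state routines are normally used together: start a pending posting list from a base tuple, merge later equal-keyed tuples into it while they fit, and flush it to the output page when one doesn't. A condensed sketch, assuming the caller has already verified key equality (for example via _bt_keep_natts_fast()) and has set up state and newpage; the function name and the items/offsets arrays are hypothetical:

#include "postgres.h"
#include "access/nbtree.h"

/* Hypothetical sketch: fold a run of equal-keyed tuples into posting lists */
static void
merge_equal_run(Page newpage, BTDedupState state,
                IndexTuple *items, OffsetNumber *offsets, int nitems)
{
    _bt_dedup_start_pending(state, items[0], offsets[0]);

    for (int i = 1; i < nitems; i++)
    {
        /* returns false when items[i] would push the list past maxpostingsize */
        if (!_bt_dedup_save_htid(state, items[i]))
        {
            (void) _bt_dedup_finish_pending(newpage, state);
            _bt_dedup_start_pending(state, items[i], offsets[i]);
        }
    }

    /* flush whatever is still pending */
    (void) _bt_dedup_finish_pending(newpage, state);
}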

◆ _bt_delitems_delete_check()

void _bt_delitems_delete_check ( Relation  rel,
Buffer  buf,
Relation  heapRel,
TM_IndexDeleteOp delstate 
)

Definition at line 1513 of file nbtpage.c.

1515{
1516 Page page = BufferGetPage(buf);
1517 TransactionId snapshotConflictHorizon;
1518 bool isCatalogRel;
1519 OffsetNumber postingidxoffnum = InvalidOffsetNumber;
1520 int ndeletable = 0,
1521 nupdatable = 0;
1522 OffsetNumber deletable[MaxIndexTuplesPerPage];
1523 BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1524
1525 /* Use tableam interface to determine which tuples to delete first */
1526 snapshotConflictHorizon = table_index_delete_tuples(heapRel, delstate);
1527 isCatalogRel = RelationIsAccessibleInLogicalDecoding(heapRel);
1528
1529 /* Should not WAL-log snapshotConflictHorizon unless it's required */
1530 if (!XLogStandbyInfoActive())
1531 snapshotConflictHorizon = InvalidTransactionId;
1532
1533 /*
1534 * Construct a leaf-page-wise description of what _bt_delitems_delete()
1535 * needs to do to physically delete index tuples from the page.
1536 *
1537 * Must sort deltids array to restore leaf-page-wise order (original order
1538 * before call to tableam). This is the order that the loop expects.
1539 *
1540 * Note that deltids array might be a lot smaller now. It might even have
1541 * no entries at all (with bottom-up deletion caller), in which case there
1542 * is nothing left to do.
1543 */
1544 qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
1545 _bt_delitems_cmp);
1546 if (delstate->ndeltids == 0)
1547 {
1548 Assert(delstate->bottomup);
1549 return;
1550 }
1551
1552 /* We definitely have to delete at least one index tuple (or one TID) */
1553 for (int i = 0; i < delstate->ndeltids; i++)
1554 {
1555 TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
1556 OffsetNumber idxoffnum = dstatus->idxoffnum;
1557 ItemId itemid = PageGetItemId(page, idxoffnum);
1558 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
1559 int nestedi,
1560 nitem;
1561 BTVacuumPosting vacposting;
1562
1563 Assert(OffsetNumberIsValid(idxoffnum));
1564
1565 if (idxoffnum == postingidxoffnum)
1566 {
1567 /*
1568 * This deltid entry is a TID from a posting list tuple that has
1569 * already been completely processed
1570 */
1571 Assert(BTreeTupleIsPosting(itup));
1572 Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
1573 &delstate->deltids[i].tid) < 0);
1574 Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),
1575 &delstate->deltids[i].tid) >= 0);
1576 continue;
1577 }
1578
1579 if (!BTreeTupleIsPosting(itup))
1580 {
1581 /* Plain non-pivot tuple */
1582 Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
1583 if (dstatus->knowndeletable)
1584 deletable[ndeletable++] = idxoffnum;
1585 continue;
1586 }
1587
1588 /*
1589 * itup is a posting list tuple whose lowest deltids entry (which may
1590 * or may not be for the first TID from itup) is considered here now.
1591 * We should process all of the deltids entries for the posting list
1592 * together now, though (not just the lowest). Remember to skip over
1593 * later itup-related entries during later iterations of outermost
1594 * loop.
1595 */
1596 postingidxoffnum = idxoffnum; /* Remember work in outermost loop */
1597 nestedi = i; /* Initialize for first itup deltids entry */
1598 vacposting = NULL; /* Describes final action for itup */
1599 nitem = BTreeTupleGetNPosting(itup);
1600 for (int p = 0; p < nitem; p++)
1601 {
1602 ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
1603 int ptidcmp = -1;
1604
1605 /*
1606 * This nested loop reuses work across ptid TIDs taken from itup.
1607 * We take advantage of the fact that both itup's TIDs and deltids
1608 * entries (within a single itup/posting list grouping) must both
1609 * be in ascending TID order.
1610 */
1611 for (; nestedi < delstate->ndeltids; nestedi++)
1612 {
1613 TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
1614 TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
1615
1616 /* Stop once we get past all itup related deltids entries */
1617 Assert(tdstatus->idxoffnum >= idxoffnum);
1618 if (tdstatus->idxoffnum != idxoffnum)
1619 break;
1620
1621 /* Skip past non-deletable itup related entries up front */
1622 if (!tdstatus->knowndeletable)
1623 continue;
1624
1625 /* Entry is first partial ptid match (or an exact match)? */
1626 ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
1627 if (ptidcmp >= 0)
1628 {
1629 /* Greater than or equal (partial or exact) match... */
1630 break;
1631 }
1632 }
1633
1634 /* ...exact ptid match to a deletable deltids entry? */
1635 if (ptidcmp != 0)
1636 continue;
1637
1638 /* Exact match for deletable deltids entry -- ptid gets deleted */
1639 if (vacposting == NULL)
1640 {
1641 vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1642 nitem * sizeof(uint16));
1643 vacposting->itup = itup;
1644 vacposting->updatedoffset = idxoffnum;
1645 vacposting->ndeletedtids = 0;
1646 }
1647 vacposting->deletetids[vacposting->ndeletedtids++] = p;
1648 }
1649
1650 /* Final decision on itup, a posting list tuple */
1651
1652 if (vacposting == NULL)
1653 {
1654 /* No TIDs to delete from itup -- do nothing */
1655 }
1656 else if (vacposting->ndeletedtids == nitem)
1657 {
1658 /* Straight delete of itup (to delete all TIDs) */
1659 deletable[ndeletable++] = idxoffnum;
1660 /* Turns out we won't need granular information */
1661 pfree(vacposting);
1662 }
1663 else
1664 {
1665 /* Delete some (but not all) TIDs from itup */
1666 Assert(vacposting->ndeletedtids > 0 &&
1667 vacposting->ndeletedtids < nitem);
1668 updatable[nupdatable++] = vacposting;
1669 }
1670 }
1671
1672 /* Physically delete tuples (or TIDs) using deletable (or updatable) */
1673 _bt_delitems_delete(rel, buf, snapshotConflictHorizon, isCatalogRel,
1674 deletable, ndeletable, updatable, nupdatable);
1675
1676 /* be tidy */
1677 for (int i = 0; i < nupdatable; i++)
1678 pfree(updatable[i]);
1679}
uint16_t uint16
Definition: c.h:501
uint32 TransactionId
Definition: c.h:623
bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2)
Definition: itemptr.c:35
#define MaxIndexTuplesPerPage
Definition: itup.h:181
static void _bt_delitems_delete(Relation rel, Buffer buf, TransactionId snapshotConflictHorizon, bool isCatalogRel, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
Definition: nbtpage.c:1284
static int _bt_delitems_cmp(const void *a, const void *b)
Definition: nbtpage.c:1464
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:545
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
#define qsort(a, b, c, d)
Definition: port.h:479
uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.h:922
uint16 ndeletedtids
Definition: nbtree.h:921
IndexTuple itup
Definition: nbtree.h:917
OffsetNumber updatedoffset
Definition: nbtree.h:918
ItemPointerData tid
Definition: tableam.h:206
bool knowndeletable
Definition: tableam.h:213
OffsetNumber idxoffnum
Definition: tableam.h:212
static TransactionId table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
Definition: tableam.h:1316
#define InvalidTransactionId
Definition: transam.h:31

References _bt_delitems_cmp(), _bt_delitems_delete(), Assert(), TM_IndexDeleteOp::bottomup, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), buf, BufferGetPage(), BTVacuumPostingData::deletetids, TM_IndexDeleteOp::deltids, i, TM_IndexDelete::id, TM_IndexStatus::idxoffnum, InvalidOffsetNumber, InvalidTransactionId, ItemPointerCompare(), ItemPointerEquals(), BTVacuumPostingData::itup, TM_IndexStatus::knowndeletable, MaxIndexTuplesPerPage, BTVacuumPostingData::ndeletedtids, TM_IndexDeleteOp::ndeltids, OffsetNumberIsValid, PageGetItem(), PageGetItemId(), palloc(), pfree(), qsort, RelationIsAccessibleInLogicalDecoding, TM_IndexDeleteOp::status, IndexTupleData::t_tid, table_index_delete_tuples(), TM_IndexDelete::tid, BTVacuumPostingData::updatedoffset, and XLogStandbyInfoActive.

Referenced by _bt_bottomupdel_pass(), and _bt_simpledel_pass().

◆ _bt_delitems_vacuum()

void _bt_delitems_vacuum ( Relation  rel,
Buffer  buf,
OffsetNumber deletable,
int  ndeletable,
BTVacuumPosting updatable,
int  nupdatable 
)

Definition at line 1154 of file nbtpage.c.

1157{
1158 Page page = BufferGetPage(buf);
1159 BTPageOpaque opaque;
1160 bool needswal = RelationNeedsWAL(rel);
1161 char *updatedbuf = NULL;
1162 Size updatedbuflen = 0;
1163 OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1164
1165 /* Shouldn't be called unless there's something to do */
1166 Assert(ndeletable > 0 || nupdatable > 0);
1167
1168 /* Generate new version of posting lists without deleted TIDs */
1169 if (nupdatable > 0)
1170 updatedbuf = _bt_delitems_update(updatable, nupdatable,
1171 updatedoffsets, &updatedbuflen,
1172 needswal);
1173
1174 /* No ereport(ERROR) until changes are logged */
1175 START_CRIT_SECTION();
1176
1177 /*
1178 * Handle posting tuple updates.
1179 *
1180 * Deliberately do this before handling simple deletes. If we did it the
1181 * other way around (i.e. WAL record order -- simple deletes before
1182 * updates) then we'd have to make compensating changes to the 'updatable'
1183 * array of offset numbers.
1184 *
1185 * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
1186 * happens to already be set. It's important that we not interfere with
1187 * any future simple index tuple deletion operations.
1188 */
1189 for (int i = 0; i < nupdatable; i++)
1190 {
1191 OffsetNumber updatedoffset = updatedoffsets[i];
1192 IndexTuple itup;
1193 Size itemsz;
1194
1195 itup = updatable[i]->itup;
1196 itemsz = MAXALIGN(IndexTupleSize(itup));
1197 if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
1198 itemsz))
1199 elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1200 BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1201 }
1202
1203 /* Now handle simple deletes of entire tuples */
1204 if (ndeletable > 0)
1205 PageIndexMultiDelete(page, deletable, ndeletable);
1206
1207 /*
1208 * We can clear the vacuum cycle ID since this page has certainly been
1209 * processed by the current vacuum scan.
1210 */
1211 opaque = BTPageGetOpaque(page);
1212 opaque->btpo_cycleid = 0;
1213
1214 /*
1215 * Clear the BTP_HAS_GARBAGE page flag.
1216 *
1217 * This flag indicates the presence of LP_DEAD items on the page (though
1218 * not reliably). Note that we only rely on it with pg_upgrade'd
1219 * !heapkeyspace indexes. That's why clearing it here won't usually
1220 * interfere with simple index tuple deletion.
1221 */
1222 opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1223
1223
1224 MarkBufferDirty(buf);
1225
1226 /* XLOG stuff */
1227 if (needswal)
1228 {
1229 XLogRecPtr recptr;
1230 xl_btree_vacuum xlrec_vacuum;
1231
1232 xlrec_vacuum.ndeleted = ndeletable;
1233 xlrec_vacuum.nupdated = nupdatable;
1234
1235 XLogBeginInsert();
1236 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1237 XLogRegisterData(&xlrec_vacuum, SizeOfBtreeVacuum);
1238
1239 if (ndeletable > 0)
1240 XLogRegisterBufData(0, deletable,
1241 ndeletable * sizeof(OffsetNumber));
1242
1243 if (nupdatable > 0)
1244 {
1245 XLogRegisterBufData(0, updatedoffsets,
1246 nupdatable * sizeof(OffsetNumber));
1247 XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1248 }
1249
1250 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
1251
1252 PageSetLSN(page, recptr);
1253 }
1254
1255 END_CRIT_SECTION();
1256
1257 /* can't leak memory here */
1258 if (updatedbuf != NULL)
1259 pfree(updatedbuf);
1260 /* free tuples allocated within _bt_delitems_update() */
1261 for (int i = 0; i < nupdatable; i++)
1262 pfree(updatable[i]->itup);
1263}
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:1160
bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, Item newtup, Size newsize)
Definition: bufpage.c:1404
#define PANIC
Definition: elog.h:42
static char * _bt_delitems_update(BTVacuumPosting *updatable, int nupdatable, OffsetNumber *updatedoffsets, Size *updatedbuflen, bool needswal)
Definition: nbtpage.c:1405
#define SizeOfBtreeVacuum
Definition: nbtxlog.h:234
#define XLOG_BTREE_VACUUM
Definition: nbtxlog.h:39
BTCycleId btpo_cycleid
Definition: nbtree.h:69
uint16 ndeleted
Definition: nbtxlog.h:222
uint16 nupdated
Definition: nbtxlog.h:223

References _bt_delitems_update(), Assert(), BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, buf, BufferGetBlockNumber(), BufferGetPage(), elog, END_CRIT_SECTION, i, IndexTupleSize(), BTVacuumPostingData::itup, MarkBufferDirty(), MAXALIGN, MaxIndexTuplesPerPage, xl_btree_vacuum::ndeleted, xl_btree_vacuum::nupdated, PageIndexMultiDelete(), PageIndexTupleOverwrite(), PageSetLSN(), PANIC, pfree(), REGBUF_STANDARD, RelationGetRelationName, RelationNeedsWAL, SizeOfBtreeVacuum, START_CRIT_SECTION, XLOG_BTREE_VACUUM, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by btvacuumpage().
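A sketch of the btvacuumpage()-style call for the simple case where whole tuples are removed and no posting lists need updating (the deletable offsets are assumed to have been collected by the caller; passing NULL and 0 for the updatable arguments is legitimate as long as ndeletable > 0):

#include "postgres.h"
#include "access/nbtree.h"

/* Hypothetical helper: physically remove previously collected dead tuples */
static void
vacuum_leaf_page(Relation rel, Buffer buf,
                 OffsetNumber *deletable, int ndeletable)
{
    if (ndeletable > 0)
        _bt_delitems_vacuum(rel, buf, deletable, ndeletable, NULL, 0);
}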

◆ _bt_doinsert()

bool _bt_doinsert ( Relation  rel,
IndexTuple  itup,
IndexUniqueCheck  checkUnique,
bool  indexUnchanged,
Relation  heapRel 
)

Definition at line 102 of file nbtinsert.c.

105{
106 bool is_unique = false;
107 BTInsertStateData insertstate;
108 BTScanInsert itup_key;
109 BTStack stack;
110 bool checkingunique = (checkUnique != UNIQUE_CHECK_NO);
111
112 /* we need an insertion scan key to do our search, so build one */
113 itup_key = _bt_mkscankey(rel, itup);
114
115 if (checkingunique)
116 {
117 if (!itup_key->anynullkeys)
118 {
119 /* No (heapkeyspace) scantid until uniqueness established */
120 itup_key->scantid = NULL;
121 }
122 else
123 {
124 /*
125 * Scan key for new tuple contains NULL key values. Bypass
126 * checkingunique steps. They are unnecessary because core code
127 * considers NULL unequal to every value, including NULL.
128 *
129 * This optimization avoids O(N^2) behavior within the
130 * _bt_findinsertloc() heapkeyspace path when a unique index has a
131 * large number of "duplicates" with NULL key values.
132 */
133 checkingunique = false;
134 /* Tuple is unique in the sense that core code cares about */
135 Assert(checkUnique != UNIQUE_CHECK_EXISTING);
136 is_unique = true;
137 }
138 }
139
140 /*
141 * Fill in the BTInsertState working area, to track the current page and
142 * position within the page to insert on.
143 *
144 * Note that itemsz is passed down to lower level code that deals with
145 * inserting the item. It must be MAXALIGN()'d. This ensures that space
146 * accounting code consistently considers the alignment overhead that we
147 * expect PageAddItem() will add later. (Actually, index_form_tuple() is
148 * already conservative about alignment, but we don't rely on that from
149 * this distance. Besides, preserving the "true" tuple size in index
150 * tuple headers for the benefit of nbtsplitloc.c might happen someday.
151 * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
152 */
153 insertstate.itup = itup;
154 insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
155 insertstate.itup_key = itup_key;
156 insertstate.bounds_valid = false;
157 insertstate.buf = InvalidBuffer;
158 insertstate.postingoff = 0;
159
160search:
161
162 /*
163 * Find and lock the leaf page that the tuple should be added to by
164 * searching from the root page. insertstate.buf will hold a buffer that
165 * is locked in exclusive mode afterwards.
166 */
167 stack = _bt_search_insert(rel, heapRel, &insertstate);
168
169 /*
170 * checkingunique inserts are not allowed to go ahead when two tuples with
171 * equal key attribute values would be visible to new MVCC snapshots once
172 * the xact commits. Check for conflicts in the locked page/buffer (if
173 * needed) here.
174 *
175 * It might be necessary to check a page to the right in _bt_check_unique,
176 * though that should be very rare. In practice the first page the value
177 * could be on (with scantid omitted) is almost always also the only page
178 * that a matching tuple might be found on. This is due to the behavior
179 * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
180 * only be allowed to cross a page boundary when there is no candidate
181 * leaf page split point that avoids it. Also, _bt_check_unique can use
182 * the leaf page high key to determine that there will be no duplicates on
183 * the right sibling without actually visiting it (it uses the high key in
184 * cases where the new item happens to belong at the far right of the leaf
185 * page).
186 *
187 * NOTE: obviously, _bt_check_unique can only detect keys that are already
188 * in the index; so it cannot defend against concurrent insertions of the
189 * same key. We protect against that by means of holding a write lock on
190 * the first page the value could be on, with omitted/-inf value for the
191 * implicit heap TID tiebreaker attribute. Any other would-be inserter of
192 * the same key must acquire a write lock on the same page, so only one
193 * would-be inserter can be making the check at one time. Furthermore,
194 * once we are past the check we hold write locks continuously until we
195 * have performed our insertion, so no later inserter can fail to see our
196 * insertion. (This requires some care in _bt_findinsertloc.)
197 *
198 * If we must wait for another xact, we release the lock while waiting,
199 * and then must perform a new search.
200 *
201 * For a partial uniqueness check, we don't wait for the other xact. Just
202 * let the tuple in and return false for possibly non-unique, or true for
203 * definitely unique.
204 */
205 if (checkingunique)
206 {
207 TransactionId xwait;
208 uint32 speculativeToken;
209
210 xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
211 &is_unique, &speculativeToken);
212
213 if (unlikely(TransactionIdIsValid(xwait)))
214 {
215 /* Have to wait for the other guy ... */
216 _bt_relbuf(rel, insertstate.buf);
217 insertstate.buf = InvalidBuffer;
218
219 /*
220 * If it's a speculative insertion, wait for it to finish (ie. to
221 * go ahead with the insertion, or kill the tuple). Otherwise
222 * wait for the transaction to finish as usual.
223 */
224 if (speculativeToken)
225 SpeculativeInsertionWait(xwait, speculativeToken);
226 else
227 XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
228
229 /* start over... */
230 if (stack)
231 _bt_freestack(stack);
232 goto search;
233 }
234
235 /* Uniqueness is established -- restore heap tid as scantid */
236 if (itup_key->heapkeyspace)
237 itup_key->scantid = &itup->t_tid;
238 }
239
240 if (checkUnique != UNIQUE_CHECK_EXISTING)
241 {
242 OffsetNumber newitemoff;
243
244 /*
245 * The only conflict predicate locking cares about for indexes is when
246 * an index tuple insert conflicts with an existing lock. We don't
247 * know the actual page we're going to insert on for sure just yet in
248 * checkingunique and !heapkeyspace cases, but it's okay to use the
249 * first page the value could be on (with scantid omitted) instead.
250 */
251 CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
252
253 /*
254 * Do the insertion. Note that insertstate contains cached binary
255 * search bounds established within _bt_check_unique when insertion is
256 * checkingunique.
257 */
258 newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
259 indexUnchanged, stack, heapRel);
260 _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
261 stack, itup, insertstate.itemsz, newitemoff,
262 insertstate.postingoff, false);
263 }
264 else
265 {
266 /* just release the buffer */
267 _bt_relbuf(rel, insertstate.buf);
268 }
269
270 /* be tidy */
271 if (stack)
272 _bt_freestack(stack);
273 pfree(itup_key);
274
275 return is_unique;
276}
uint32_t uint32
Definition: c.h:502
@ UNIQUE_CHECK_NO
Definition: genam.h:140
@ UNIQUE_CHECK_EXISTING
Definition: genam.h:143
void XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, XLTW_Oper oper)
Definition: lmgr.c:663
void SpeculativeInsertionWait(TransactionId xid, uint32 token)
Definition: lmgr.c:822
@ XLTW_InsertIndex
Definition: lmgr.h:31
static BTStack _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
Definition: nbtinsert.c:317
static OffsetNumber _bt_findinsertloc(Relation rel, BTInsertState insertstate, bool checkingunique, bool indexUnchanged, BTStack stack, Relation heapRel)
Definition: nbtinsert.c:815
static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, Size itemsz, OffsetNumber newitemoff, int postingoff, bool split_only_page)
Definition: nbtinsert.c:1105
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition: nbtinsert.c:408
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:187
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:95
void CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno)
Definition: predicate.c:4336
IndexTuple itup
Definition: nbtree.h:822
ItemPointer scantid
Definition: nbtree.h:802
bool heapkeyspace
Definition: nbtree.h:797
bool anynullkeys
Definition: nbtree.h:799
#define TransactionIdIsValid(xid)
Definition: transam.h:41

References _bt_check_unique(), _bt_findinsertloc(), _bt_freestack(), _bt_insertonpg(), _bt_mkscankey(), _bt_relbuf(), _bt_search_insert(), BTScanInsertData::anynullkeys, Assert(), BTInsertStateData::bounds_valid, BTInsertStateData::buf, BufferGetBlockNumber(), CheckForSerializableConflictIn(), BTScanInsertData::heapkeyspace, IndexTupleSize(), InvalidBuffer, BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, MAXALIGN, pfree(), BTInsertStateData::postingoff, BTScanInsertData::scantid, SpeculativeInsertionWait(), IndexTupleData::t_tid, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_NO, unlikely, XactLockTableWait(), and XLTW_InsertIndex.

Referenced by btinsert().
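For orientation, the caller's side of this interface is small: btinsert() forms an index tuple, stamps its heap TID, and hands it straight to _bt_doinsert(). The sketch below illustrates that calling pattern, assuming a backend/extension context; the helper name insert_one_btree_tuple is hypothetical and the tuple is assumed to be already formed.

#include "postgres.h"

#include "access/itup.h"
#include "access/nbtree.h"
#include "utils/rel.h"

/* Hypothetical helper: insert one already-formed index tuple. */
static bool
insert_one_btree_tuple(Relation rel, Relation heapRel,
                       IndexTuple itup, ItemPointer ht_ctid,
                       IndexUniqueCheck checkUnique)
{
    /* point the index tuple at its heap row before descending the tree */
    itup->t_tid = *ht_ctid;

    /* indexUnchanged = false: no "logically unchanged" hint available here */
    return _bt_doinsert(rel, itup, checkUnique, false, heapRel);
}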

◆ _bt_end_vacuum()

void _bt_end_vacuum ( Relation  rel)

Definition at line 3618 of file nbtutils.c.

3619{
3620 int i;
3621
3622 LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
3623
3624 /* Find the array entry */
3625 for (i = 0; i < btvacinfo->num_vacuums; i++)
3626 {
3627 BTOneVacInfo *vac = &btvacinfo->vacuums[i];
3628
3629 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
3630 vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
3631 {
3632 /* Remove it by shifting down the last entry */
3633 *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
3634 btvacinfo->num_vacuums--;
3635 break;
3636 }
3637 }
3638
3639 LWLockRelease(BtreeVacuumLock);
3640}
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1180
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1900
@ LW_EXCLUSIVE
Definition: lwlock.h:114
static BTVacInfo * btvacinfo
Definition: nbtutils.c:3514
LockRelId relid
Definition: nbtutils.c:3502
int num_vacuums
Definition: nbtutils.c:3509
BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtutils.c:3511
LockRelId lockRelId
Definition: rel.h:46
Oid relId
Definition: rel.h:40
Oid dbId
Definition: rel.h:41
LockInfoData rd_lockInfo
Definition: rel.h:114

References btvacinfo, LockRelId::dbId, i, LockInfoData::lockRelId, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTVacInfo::num_vacuums, RelationData::rd_lockInfo, BTOneVacInfo::relid, LockRelId::relId, and BTVacInfo::vacuums.

Referenced by _bt_end_vacuum_callback(), and btbulkdelete().

◆ _bt_end_vacuum_callback()

void _bt_end_vacuum_callback ( int  code,
Datum  arg 
)

Definition at line 3646 of file nbtutils.c.

3647{
3648 _bt_end_vacuum((Relation) DatumGetPointer(arg));
3649}
void _bt_end_vacuum(Relation rel)
Definition: nbtutils.c:3618
void * arg
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:317

References _bt_end_vacuum(), arg, and DatumGetPointer().

Referenced by btbulkdelete().
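These two routines bracket a btree VACUUM cycle: _bt_start_vacuum() registers the cycle ID in shared memory, _bt_end_vacuum() removes the entry, and _bt_end_vacuum_callback() is registered as an error-cleanup callback so the entry is removed even if the scan errors out. A rough sketch of that bracket (modelled on btbulkdelete(), with the actual index scan elided) follows.

#include "postgres.h"

#include "access/nbtree.h"
#include "storage/ipc.h"
#include "utils/rel.h"

/* Sketch of the vacuum-cycle bracket used around a btree VACUUM scan. */
static void
vacuum_cycle_bracket(Relation rel)
{
    BTCycleId   cycleid;

    /* Ensure the shared-memory entry is cleared even if we error out */
    PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    {
        cycleid = _bt_start_vacuum(rel);    /* register cycle, get its ID */

        /* ... scan the index here, stamping split pages with cycleid ... */
        (void) cycleid;
    }
    PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    _bt_end_vacuum(rel);                    /* normal-path cleanup */
}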

◆ _bt_findsplitloc()

OffsetNumber _bt_findsplitloc ( Relation  rel,
Page  origpage,
OffsetNumber  newitemoff,
Size  newitemsz,
IndexTuple  newitem,
bool *  newitemonleft 
)

Definition at line 129 of file nbtsplitloc.c.

135{
136 BTPageOpaque opaque;
137 int leftspace,
138 rightspace,
139 olddataitemstotal,
140 olddataitemstoleft,
141 perfectpenalty,
142 leaffillfactor;
143 FindSplitData state;
144 FindSplitStrat strategy;
145 ItemId itemid;
146 OffsetNumber offnum,
147 maxoff,
148 firstrightoff;
149 double fillfactormult;
150 bool usemult;
151 SplitPoint leftpage,
152 rightpage;
153
154 opaque = BTPageGetOpaque(origpage);
155 maxoff = PageGetMaxOffsetNumber(origpage);
156
157 /* Total free space available on a btree page, after fixed overhead */
158 leftspace = rightspace =
159 PageGetPageSize(origpage) - SizeOfPageHeaderData -
160 MAXALIGN(sizeof(BTPageOpaqueData));
161
162 /* The right page will have the same high key as the old page */
163 if (!P_RIGHTMOST(opaque))
164 {
165 itemid = PageGetItemId(origpage, P_HIKEY);
166 rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
167 sizeof(ItemIdData));
168 }
169
170 /* Count up total space in data items before actually scanning 'em */
171 olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(origpage);
172 leaffillfactor = BTGetFillFactor(rel);
173
174 /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
175 newitemsz += sizeof(ItemIdData);
176 state.rel = rel;
177 state.origpage = origpage;
178 state.newitem = newitem;
179 state.newitemsz = newitemsz;
180 state.is_leaf = P_ISLEAF(opaque);
181 state.is_rightmost = P_RIGHTMOST(opaque);
182 state.leftspace = leftspace;
183 state.rightspace = rightspace;
184 state.olddataitemstotal = olddataitemstotal;
185 state.minfirstrightsz = SIZE_MAX;
186 state.newitemoff = newitemoff;
187
188 /* newitem cannot be a posting list item */
189 Assert(!BTreeTupleIsPosting(newitem));
190
191 /*
192 * nsplits should never exceed maxoff because there will be at most as
193 * many candidate split points as there are points _between_ tuples, once
194 * you imagine that the new item is already on the original page (the
195 * final number of splits may be slightly lower because not all points
196 * between tuples will be legal).
197 */
198 state.maxsplits = maxoff;
199 state.splits = palloc(sizeof(SplitPoint) * state.maxsplits);
200 state.nsplits = 0;
201
202 /*
203 * Scan through the data items and calculate space usage for a split at
204 * each possible position
205 */
206 olddataitemstoleft = 0;
207
208 for (offnum = P_FIRSTDATAKEY(opaque);
209 offnum <= maxoff;
210 offnum = OffsetNumberNext(offnum))
211 {
212 Size itemsz;
213
214 itemid = PageGetItemId(origpage, offnum);
215 itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
216
217 /*
218 * When item offset number is not newitemoff, neither side of the
219 * split can be newitem. Record a split after the previous data item
220 * from original page, but before the current data item from original
221 * page. (_bt_recsplitloc() will reject the split when there are no
222 * previous items, which we rely on.)
223 */
224 if (offnum < newitemoff)
225 _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
226 else if (offnum > newitemoff)
227 _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
228 else
229 {
230 /*
231 * Record a split after all "offnum < newitemoff" original page
232 * data items, but before newitem
233 */
234 _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
235
236 /*
237 * Record a split after newitem, but before data item from
238 * original page at offset newitemoff/current offset
239 */
240 _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
241 }
242
243 olddataitemstoleft += itemsz;
244 }
245
246 /*
247 * Record a split after all original page data items, but before newitem.
248 * (Though only when it's possible that newitem will end up alone on new
249 * right page.)
250 */
251 Assert(olddataitemstoleft == olddataitemstotal);
252 if (newitemoff > maxoff)
253 _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0);
254
255 /*
256 * I believe it is not possible to fail to find a feasible split, but just
257 * in case ...
258 */
259 if (state.nsplits == 0)
260 elog(ERROR, "could not find a feasible split point for index \"%s\"",
261 RelationGetRelationName(rel));
262
263 /*
264 * Start search for a split point among list of legal split points. Give
265 * primary consideration to equalizing available free space in each half
266 * of the split initially (start with default strategy), while applying
267 * rightmost and split-after-new-item optimizations where appropriate.
268 * Either of the two other fallback strategies may be required for cases
269 * with a large number of duplicates around the original/space-optimal
270 * split point.
271 *
272 * Default strategy gives some weight to suffix truncation in deciding a
273 * split point on leaf pages. It attempts to select a split point where a
274 * distinguishing attribute appears earlier in the new high key for the
275 * left side of the split, in order to maximize the number of trailing
276 * attributes that can be truncated away. Only candidate split points
277 * that imply an acceptable balance of free space on each side are
278 * considered. See _bt_defaultinterval().
279 */
280 if (!state.is_leaf)
281 {
282 /* fillfactormult only used on rightmost page */
283 usemult = state.is_rightmost;
284 fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0;
285 }
286 else if (state.is_rightmost)
287 {
288 /* Rightmost leaf page -- fillfactormult always used */
289 usemult = true;
290 fillfactormult = leaffillfactor / 100.0;
291 }
292 else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
293 {
294 /*
295 * New item inserted at rightmost point among a localized grouping on
296 * a leaf page -- apply "split after new item" optimization, either by
297 * applying leaf fillfactor multiplier, or by choosing the exact split
298 * point that leaves newitem as lastleft. (usemult is set for us.)
299 */
300 if (usemult)
301 {
302 /* fillfactormult should be set based on leaf fillfactor */
303 fillfactormult = leaffillfactor / 100.0;
304 }
305 else
306 {
307 /* find precise split point after newitemoff */
308 for (int i = 0; i < state.nsplits; i++)
309 {
310 SplitPoint *split = state.splits + i;
311
312 if (split->newitemonleft &&
313 newitemoff == split->firstrightoff)
314 {
315 pfree(state.splits);
316 *newitemonleft = true;
317 return newitemoff;
318 }
319 }
320
321 /*
322 * Cannot legally split after newitemoff; proceed with split
323 * without using fillfactor multiplier. This is defensive, and
324 * should never be needed in practice.
325 */
326 fillfactormult = 0.50;
327 }
328 }
329 else
330 {
331 /* Other leaf page. 50:50 page split. */
332 usemult = false;
333 /* fillfactormult not used, but be tidy */
334 fillfactormult = 0.50;
335 }
336
337 /*
338 * Save leftmost and rightmost splits for page before original ordinal
339 * sort order is lost by delta/fillfactormult sort
340 */
341 leftpage = state.splits[0];
342 rightpage = state.splits[state.nsplits - 1];
343
344 /* Give split points a fillfactormult-wise delta, and sort on deltas */
345 _bt_deltasortsplits(&state, fillfactormult, usemult);
346
347 /* Determine split interval for default strategy */
348 state.interval = _bt_defaultinterval(&state);
349
350 /*
351 * Determine if default strategy/split interval will produce a
352 * sufficiently distinguishing split, or if we should change strategies.
353 * Alternative strategies change the range of split points that are
354 * considered acceptable (split interval), and possibly change
355 * fillfactormult, in order to deal with pages with a large number of
356 * duplicates gracefully.
357 *
358 * Pass low and high splits for the entire page (actually, they're for an
359 * imaginary version of the page that includes newitem). These are used
360 * when the initial split interval encloses split points that are full of
361 * duplicates, and we need to consider if it's even possible to avoid
362 * appending a heap TID.
363 */
364 perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);
365
366 if (strategy == SPLIT_DEFAULT)
367 {
368 /*
369 * Default strategy worked out (always works out with internal page).
370 * Original split interval still stands.
371 */
372 }
373
374 /*
375 * Many duplicates strategy is used when a heap TID would otherwise be
376 * appended, but the page isn't completely full of logical duplicates.
377 *
378 * The split interval is widened to include all legal candidate split
379 * points. There might be as few as two distinct values in the whole-page
380 * split interval, though it's also possible that most of the values on
381 * the page are unique. The final split point will either be to the
382 * immediate left or to the immediate right of the group of duplicate
383 * tuples that enclose the first/delta-optimal split point (perfect
384 * penalty was set so that the lowest delta split point that avoids
385 * appending a heap TID will be chosen). Maximizing the number of
386 * attributes that can be truncated away is not a goal of the many
387 * duplicates strategy.
388 *
389 * Single value strategy is used when it is impossible to avoid appending
390 * a heap TID. It arranges to leave the left page very full. This
391 * maximizes space utilization in cases where tuples with the same
392 * attribute values span many pages. Newly inserted duplicates will tend
393 * to have higher heap TID values, so we'll end up splitting to the right
394 * consistently. (Single value strategy is harmless though not
395 * particularly useful with !heapkeyspace indexes.)
396 */
397 else if (strategy == SPLIT_MANY_DUPLICATES)
398 {
399 Assert(state.is_leaf);
400 /* Shouldn't try to truncate away extra user attributes */
401 Assert(perfectpenalty ==
402 IndexRelationGetNumberOfKeyAttributes(state.rel));
403 /* No need to resort splits -- no change in fillfactormult/deltas */
404 state.interval = state.nsplits;
405 }
406 else if (strategy == SPLIT_SINGLE_VALUE)
407 {
408 Assert(state.is_leaf);
409 /* Split near the end of the page */
410 usemult = true;
411 fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
412 /* Resort split points with new delta */
413 _bt_deltasortsplits(&state, fillfactormult, usemult);
414 /* Appending a heap TID is unavoidable, so interval of 1 is fine */
415 state.interval = 1;
416 }
417
418 /*
419 * Search among acceptable split points (using final split interval) for
420 * the entry that has the lowest penalty, and is therefore expected to
421 * maximize fan-out. Sets *newitemonleft for us.
422 */
423 firstrightoff = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
424 strategy);
425 pfree(state.splits);
426
427 return firstrightoff;
428}
static Size PageGetPageSize(const PageData *page)
Definition: bufpage.h:277
#define BTREE_SINGLEVAL_FILLFACTOR
Definition: nbtree.h:203
#define BTGetFillFactor(relation)
Definition: nbtree.h:1157
#define BTREE_NONLEAF_FILLFACTOR
Definition: nbtree.h:202
static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult, bool usemult)
Definition: nbtsplitloc.c:566
static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage, SplitPoint *rightpage, FindSplitStrat *strategy)
Definition: nbtsplitloc.c:934
static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, int leaffillfactor, bool *usemult)
Definition: nbtsplitloc.c:630
static void _bt_recsplitloc(FindSplitData *state, OffsetNumber firstrightoff, bool newitemonleft, int olddataitemstoleft, Size firstrightofforigpagetuplesz)
Definition: nbtsplitloc.c:449
FindSplitStrat
Definition: nbtsplitloc.c:21
@ SPLIT_DEFAULT
Definition: nbtsplitloc.c:23
@ SPLIT_MANY_DUPLICATES
Definition: nbtsplitloc.c:24
@ SPLIT_SINGLE_VALUE
Definition: nbtsplitloc.c:25
static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft, FindSplitStrat strategy)
Definition: nbtsplitloc.c:788
static int _bt_defaultinterval(FindSplitData *state)
Definition: nbtsplitloc.c:876
bool newitemonleft
Definition: nbtsplitloc.c:37
OffsetNumber firstrightoff
Definition: nbtsplitloc.c:36

References _bt_afternewitemoff(), _bt_bestsplitloc(), _bt_defaultinterval(), _bt_deltasortsplits(), _bt_recsplitloc(), _bt_strategy(), Assert(), BTGetFillFactor, BTPageGetOpaque, BTREE_NONLEAF_FILLFACTOR, BTREE_SINGLEVAL_FILLFACTOR, BTreeTupleIsPosting(), elog, ERROR, SplitPoint::firstrightoff, i, IndexRelationGetNumberOfKeyAttributes, ItemIdGetLength, MAXALIGN, SplitPoint::newitemonleft, OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_ISLEAF, P_RIGHTMOST, PageGetExactFreeSpace(), PageGetItemId(), PageGetMaxOffsetNumber(), PageGetPageSize(), palloc(), pfree(), RelationGetRelationName, SizeOfPageHeaderData, SPLIT_DEFAULT, SPLIT_MANY_DUPLICATES, and SPLIT_SINGLE_VALUE.

Referenced by _bt_split().
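Only _bt_split() calls this routine, but the contract is easy to show in isolation: the caller holds a write lock on origpage, newitemsz is already MAXALIGNed, and the return value is the offset of the first tuple that will go to the new right page. The sketch below (hypothetical helper name; arguments assumed to come from a split already in progress) just restates that contract.

#include "postgres.h"

#include "access/nbtree.h"
#include "utils/rel.h"

/*
 * Sketch: pick a split point for a full, write-locked page.  newitemsz is
 * assumed to be MAXALIGNed already, as in _bt_split().
 */
static OffsetNumber
choose_split_point(Relation rel, Page origpage,
                   OffsetNumber newitemoff, Size newitemsz,
                   IndexTuple newitem, bool *newitemonleft)
{
    OffsetNumber firstrightoff;

    firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
                                     newitem, newitemonleft);

    /*
     * Tuples at offsets >= firstrightoff will go to the new right page;
     * *newitemonleft says which half receives the incoming tuple.
     */
    return firstrightoff;
}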

◆ _bt_finish_split()

void _bt_finish_split ( Relation  rel,
Relation  heaprel,
Buffer  lbuf,
BTStack  stack 
)

Definition at line 2241 of file nbtinsert.c.

2242{
2243 Page lpage = BufferGetPage(lbuf);
2244 BTPageOpaque lpageop = BTPageGetOpaque(lpage);
2245 Buffer rbuf;
2246 Page rpage;
2247 BTPageOpaque rpageop;
2248 bool wasroot;
2249 bool wasonly;
2250
2251 Assert(P_INCOMPLETE_SPLIT(lpageop));
2252 Assert(heaprel != NULL);
2253
2254 /* Lock right sibling, the one missing the downlink */
2255 rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2256 rpage = BufferGetPage(rbuf);
2257 rpageop = BTPageGetOpaque(rpage);
2258
2259 /* Could this be a root split? */
2260 if (!stack)
2261 {
2262 Buffer metabuf;
2263 Page metapg;
2264 BTMetaPageData *metad;
2265
2266 /* acquire lock on the metapage */
2267 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2268 metapg = BufferGetPage(metabuf);
2269 metad = BTPageGetMeta(metapg);
2270
2271 wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
2272
2273 _bt_relbuf(rel, metabuf);
2274 }
2275 else
2276 wasroot = false;
2277
2278 /* Was this the only page on the level before split? */
2279 wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2280
2281 elog(DEBUG1, "finishing incomplete split of %u/%u",
2282 BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2283
2284 _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
2285}
static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf, Buffer rbuf, BTStack stack, bool isroot, bool isonly)
Definition: nbtinsert.c:2099
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:845
#define BTPageGetMeta(p)
Definition: nbtree.h:122
#define P_LEFTMOST(opaque)
Definition: nbtree.h:219
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:228
#define BTREE_METAPAGE
Definition: nbtree.h:149
#define BT_WRITE
Definition: nbtree.h:731
BlockNumber btm_root
Definition: nbtree.h:108
BlockNumber btpo_next
Definition: nbtree.h:66

References _bt_getbuf(), _bt_insert_parent(), _bt_relbuf(), Assert(), BT_WRITE, BTMetaPageData::btm_root, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTREE_METAPAGE, BufferGetBlockNumber(), BufferGetPage(), DEBUG1, elog, P_INCOMPLETE_SPLIT, P_LEFTMOST, and P_RIGHTMOST.

Referenced by _bt_getstackbuf(), _bt_moveright(), and _bt_stepright().
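Callers use this when they encounter a page whose BTP_INCOMPLETE_SPLIT flag is still set: the split must be finished before anything else touches the parent level. Below is a sketch of the detect-and-finish pattern (hypothetical helper; the real callers such as _bt_moveright() have additional retry logic). Note that _bt_finish_split() releases the write lock and pin on the passed buffer, so the caller re-acquires the page afterwards.

#include "postgres.h"

#include "access/nbtree.h"
#include "utils/rel.h"

/*
 * Sketch: if a write-locked page carries the incomplete-split flag, finish
 * the split before doing anything else.  _bt_finish_split() releases the
 * lock and pin, so the page is re-acquired afterwards.
 */
static Buffer
ensure_split_finished(Relation rel, Relation heaprel, Buffer buf, BTStack stack)
{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque = BTPageGetOpaque(page);

    if (P_INCOMPLETE_SPLIT(opaque))
    {
        BlockNumber blkno = BufferGetBlockNumber(buf);

        _bt_finish_split(rel, heaprel, buf, stack); /* releases buf */
        buf = _bt_getbuf(rel, blkno, BT_WRITE);     /* lock the page again */
    }

    return buf;
}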

◆ _bt_first()

bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 882 of file nbtsearch.c.

883{
884 Relation rel = scan->indexRelation;
885 BTScanOpaque so = (BTScanOpaque) scan->opaque;
886 BTStack stack;
887 OffsetNumber offnum;
888 BTScanInsertData inskey;
889 ScanKey startKeys[INDEX_MAX_KEYS];
890 ScanKeyData notnullkeys[INDEX_MAX_KEYS];
891 int keysz = 0;
892 StrategyNumber strat_total;
893 BlockNumber blkno = InvalidBlockNumber,
894 lastcurrblkno;
895
896 Assert(!BTScanPosIsValid(so->currPos));
897
898 /*
899 * Examine the scan keys and eliminate any redundant keys; also mark the
900 * keys that must be matched to continue the scan.
901 */
902 _bt_preprocess_keys(scan);
903
904 /*
905 * Quit now if _bt_preprocess_keys() discovered that the scan keys can
906 * never be satisfied (eg, x == 1 AND x > 2).
907 */
908 if (!so->qual_ok)
909 {
910 Assert(!so->needPrimScan);
911 _bt_parallel_done(scan);
912 return false;
913 }
914
915 /*
916 * If this is a parallel scan, we must seize the scan. _bt_readfirstpage
917 * will likely release the parallel scan later on.
918 */
919 if (scan->parallel_scan != NULL &&
920 !_bt_parallel_seize(scan, &blkno, &lastcurrblkno, true))
921 return false;
922
923 /*
924 * Initialize the scan's arrays (if any) for the current scan direction
925 * (except when they were already set to later values as part of
926 * scheduling the primitive index scan that is now underway)
927 */
928 if (so->numArrayKeys && !so->needPrimScan)
929 _bt_start_array_keys(scan, dir);
930
931 if (blkno != InvalidBlockNumber)
932 {
933 /*
934 * We anticipated calling _bt_search, but another worker beat us to it.
935 * _bt_readnextpage releases the scan for us (not _bt_readfirstpage).
936 */
937 Assert(scan->parallel_scan != NULL);
938 Assert(!so->needPrimScan);
939 Assert(blkno != P_NONE);
940
941 if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir, true))
942 return false;
943
944 _bt_returnitem(scan, so);
945 return true;
946 }
947
948 /*
949 * Count an indexscan for stats, now that we know that we'll call
950 * _bt_search/_bt_endpoint below
951 */
952 pgstat_count_index_scan(rel);
953 if (scan->instrument)
954 scan->instrument->nsearches++;
955
956 /*----------
957 * Examine the scan keys to discover where we need to start the scan.
958 *
959 * We want to identify the keys that can be used as starting boundaries;
960 * these are =, >, or >= keys for a forward scan or =, <, <= keys for
961 * a backwards scan. We can use keys for multiple attributes so long as
962 * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
963 * a > or < boundary or find an attribute with no boundary (which can be
964 * thought of as the same as "> -infinity"), we can't use keys for any
965 * attributes to its right, because it would break our simplistic notion
966 * of what initial positioning strategy to use.
967 *
968 * When the scan keys include cross-type operators, _bt_preprocess_keys
969 * may not be able to eliminate redundant keys; in such cases we will
970 * arbitrarily pick a usable one for each attribute. This is correct
971 * but possibly not optimal behavior. (For example, with keys like
972 * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
973 * x=5 would be more efficient.) Since the situation only arises given
974 * a poorly-worded query plus an incomplete opfamily, live with it.
975 *
976 * When both equality and inequality keys appear for a single attribute
977 * (again, only possible when cross-type operators appear), we *must*
978 * select one of the equality keys for the starting point, because
979 * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
980 * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
981 * start at x=4, we will fail and stop before reaching x=10. If multiple
982 * equality quals survive preprocessing, however, it doesn't matter which
983 * one we use --- by definition, they are either redundant or
984 * contradictory.
985 *
986 * In practice we rarely see any "attribute boundary key gaps" here.
987 * Preprocessing can usually backfill skip array keys for any attributes
988 * that were omitted from the original scan->keyData[] input keys. All
989 * array keys are always considered = keys, but we'll sometimes need to
990 * treat the current key value as if we were using an inequality strategy.
991 * This happens with range skip arrays, which store inequality keys in the
992 * array's low_compare/high_compare fields (used to find the first/last
993 * set of matches, when = key will lack a usable sk_argument value).
994 * These are always preferred over any redundant "standard" inequality
995 * keys on the same column (per the usual rule about preferring = keys).
996 * Note also that any column with an = skip array key can never have an
997 * additional, contradictory = key.
998 *
999 * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP
1000 * array keys whose array is "null_elem=true") imply a NOT NULL qualifier.
1001 * If the index stores nulls at the end of the index we'll be starting
1002 * from, and we have no boundary key for the column (which means the key
1003 * we deduced NOT NULL from is an inequality key that constrains the other
1004 * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
1005 * use as a boundary key. If we didn't do this, we might find ourselves
1006 * traversing a lot of null entries at the start of the scan.
1007 *
1008 * In this loop, row-comparison keys are treated the same as keys on their
1009 * first (leftmost) columns. We'll add on lower-order columns of the row
1010 * comparison below, if possible.
1011 *
1012 * The selected scan keys (at most one per index column) are remembered by
1013 * storing their addresses into the local startKeys[] array.
1014 *
1015 * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
1016 * the next primitive index scan (for scans with array keys) based in part
1017 * on an understanding of how it'll enable us to reposition the scan.
1018 * They're directly aware of how we'll sometimes cons up an explicit
1019 * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a
1020 * symmetric "deduce NOT NULL" rule of their own. This allows top-level
1021 * scans to skip large groups of NULLs through repeated deductions about
1022 * key strictness (for a required inequality key) and whether NULLs in the
1023 * key's index column are stored last or first (relative to non-NULLs).
1024 * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
1025 * need to be kept in sync.
1026 *----------
1027 */
1028 strat_total = BTEqualStrategyNumber;
1029 if (so->numberOfKeys > 0)
1030 {
1031 AttrNumber curattr;
1032 ScanKey chosen;
1033 ScanKey impliesNN;
1034 ScanKey cur;
1035
1036 /*
1037 * chosen is the so-far-chosen key for the current attribute, if any.
1038 * We don't cast the decision in stone until we reach keys for the
1039 * next attribute.
1040 */
1041 cur = so->keyData;
1042 curattr = 1;
1043 chosen = NULL;
1044 /* Also remember any scankey that implies a NOT NULL constraint */
1045 impliesNN = NULL;
1046
1047 /*
1048 * Loop iterates from 0 to numberOfKeys inclusive; we use the last
1049 * pass to handle after-last-key processing. Actual exit from the
1050 * loop is at one of the "break" statements below.
1051 */
1052 for (int i = 0;; cur++, i++)
1053 {
1054 if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1055 {
1056 /*
1057 * Done looking at keys for curattr.
1058 *
1059 * If this is a scan key for a skip array whose current
1060 * element is MINVAL, choose low_compare (when scanning
1061 * backwards it'll be MAXVAL, and we'll choose high_compare).
1062 *
1063 * Note: if the array's low_compare key makes 'chosen' NULL,
1064 * then we behave as if the array's first element is -inf,
1065 * except when !array->null_elem implies a usable NOT NULL
1066 * constraint.
1067 */
1068 if (chosen != NULL &&
1069 (chosen->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))
1070 {
1071 int ikey = chosen - so->keyData;
1072 ScanKey skipequalitykey = chosen;
1073 BTArrayKeyInfo *array = NULL;
1074
1075 for (int arridx = 0; arridx < so->numArrayKeys; arridx++)
1076 {
1077 array = &so->arrayKeys[arridx];
1078 if (array->scan_key == ikey)
1079 break;
1080 }
1081
1082 if (ScanDirectionIsForward(dir))
1083 {
1084 Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL));
1085 chosen = array->low_compare;
1086 }
1087 else
1088 {
1089 Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL));
1090 chosen = array->high_compare;
1091 }
1092
1093 Assert(chosen == NULL ||
1094 chosen->sk_attno == skipequalitykey->sk_attno);
1095
1096 if (!array->null_elem)
1097 impliesNN = skipequalitykey;
1098 else
1099 Assert(chosen == NULL && impliesNN == NULL);
1100 }
1101
1102 /*
1103 * If we didn't find a usable boundary key, see if we can
1104 * deduce a NOT NULL key
1105 */
1106 if (chosen == NULL && impliesNN != NULL &&
1107 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1108 ScanDirectionIsForward(dir) :
1109 ScanDirectionIsBackward(dir)))
1110 {
1111 /* Yes, so build the key in notnullkeys[keysz] */
1112 chosen = &notnullkeys[keysz];
1113 ScanKeyEntryInitialize(chosen,
1114 (SK_SEARCHNOTNULL | SK_ISNULL |
1115 (impliesNN->sk_flags &
1116 (SK_BT_DESC | SK_BT_NULLS_FIRST))),
1117 curattr,
1118 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1119 BTGreaterStrategyNumber :
1120 BTLessStrategyNumber),
1121 InvalidOid,
1122 InvalidOid,
1123 InvalidOid,
1124 (Datum) 0);
1125 }
1126
1127 /*
1128 * If we still didn't find a usable boundary key, quit; else
1129 * save the boundary key pointer in startKeys.
1130 */
1131 if (chosen == NULL)
1132 break;
1133 startKeys[keysz++] = chosen;
1134
1135 /*
1136 * We can only consider adding more boundary keys when the one
1137 * that we just chose to add uses either the = or >= strategy
1138 * (during backwards scans we can only do so when the key that
1139 * we just added to startKeys[] uses the = or <= strategy)
1140 */
1141 strat_total = chosen->sk_strategy;
1142 if (strat_total == BTGreaterStrategyNumber ||
1143 strat_total == BTLessStrategyNumber)
1144 break;
1145
1146 /*
1147 * If the key that we just added to startKeys[] is a skip
1148 * array = key whose current element is marked NEXT or PRIOR,
1149 * make strat_total > or < (and stop adding boundary keys).
1150 * This can only happen with opclasses that lack skip support.
1151 */
1152 if (chosen->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))
1153 {
1154 Assert(chosen->sk_flags & SK_BT_SKIP);
1155 Assert(strat_total == BTEqualStrategyNumber);
1156
1157 if (ScanDirectionIsForward(dir))
1158 {
1159 Assert(!(chosen->sk_flags & SK_BT_PRIOR));
1160 strat_total = BTGreaterStrategyNumber;
1161 }
1162 else
1163 {
1164 Assert(!(chosen->sk_flags & SK_BT_NEXT));
1165 strat_total = BTLessStrategyNumber;
1166 }
1167
1168 /*
1169 * We're done. We'll never find an exact = match for a
1170 * NEXT or PRIOR sentinel sk_argument value. There's no
1171 * sense in trying to add more keys to startKeys[].
1172 */
1173 break;
1174 }
1175
1176 /*
1177 * Done if that was the last scan key output by preprocessing.
1178 * Also done if there is a gap index attribute that lacks a
1179 * usable key (only possible when preprocessing was unable to
1180 * generate a skip array key to "fill in the gap").
1181 */
1182 if (i >= so->numberOfKeys ||
1183 cur->sk_attno != curattr + 1)
1184 break;
1185
1186 /*
1187 * Reset for next attr.
1188 */
1189 curattr = cur->sk_attno;
1190 chosen = NULL;
1191 impliesNN = NULL;
1192 }
1193
1194 /*
1195 * Can we use this key as a starting boundary for this attr?
1196 *
1197 * If not, does it imply a NOT NULL constraint? (Because
1198 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1199 * *any* inequality key works for that; we need not test.)
1200 */
1201 switch (cur->sk_strategy)
1202 {
1203 case BTLessStrategyNumber:
1204 case BTLessEqualStrategyNumber:
1205 if (chosen == NULL)
1206 {
1207 if (ScanDirectionIsBackward(dir))
1208 chosen = cur;
1209 else
1210 impliesNN = cur;
1211 }
1212 break;
1213 case BTEqualStrategyNumber:
1214 /* override any non-equality choice */
1215 chosen = cur;
1216 break;
1217 case BTGreaterEqualStrategyNumber:
1218 case BTGreaterStrategyNumber:
1219 if (chosen == NULL)
1220 {
1221 if (ScanDirectionIsForward(dir))
1222 chosen = cur;
1223 else
1224 impliesNN = cur;
1225 }
1226 break;
1227 }
1228 }
1229 }
1230
1231 /*
1232 * If we found no usable boundary keys, we have to start from one end of
1233 * the tree. Walk down that edge to the first or last key, and scan from
1234 * there.
1235 *
1236 * Note: calls _bt_readfirstpage for us, which releases the parallel scan.
1237 */
1238 if (keysz == 0)
1239 return _bt_endpoint(scan, dir);
1240
1241 /*
1242 * We want to start the scan somewhere within the index. Set up an
1243 * insertion scankey we can use to search for the boundary point we
1244 * identified above. The insertion scankey is built using the keys
1245 * identified by startKeys[]. (Remaining insertion scankey fields are
1246 * initialized after initial-positioning scan keys are finalized.)
1247 */
1248 Assert(keysz <= INDEX_MAX_KEYS);
1249 for (int i = 0; i < keysz; i++)
1250 {
1251 ScanKey cur = startKeys[i];
1252
1253 Assert(cur->sk_attno == i + 1);
1254
1255 if (cur->sk_flags & SK_ROW_HEADER)
1256 {
1257 /*
1258 * Row comparison header: look to the first row member instead
1259 */
1260 ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1261
1262 /*
1263 * Cannot be a NULL in the first row member: _bt_preprocess_keys
1264 * would've marked the qual as unsatisfiable, preventing us from
1265 * ever getting this far
1266 */
1267 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1268 Assert(subkey->sk_attno == cur->sk_attno);
1269 Assert(!(subkey->sk_flags & SK_ISNULL));
1270
1271 /*
1272 * The member scankeys are already in insertion format (ie, they
1273 * have sk_func = 3-way-comparison function)
1274 */
1275 memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1276
1277 /*
1278 * If the row comparison is the last positioning key we accepted,
1279 * try to add additional keys from the lower-order row members.
1280 * (If we accepted independent conditions on additional index
1281 * columns, we use those instead --- doesn't seem worth trying to
1282 * determine which is more restrictive.) Note that this is OK
1283 * even if the row comparison is of ">" or "<" type, because the
1284 * condition applied to all but the last row member is effectively
1285 * ">=" or "<=", and so the extra keys don't break the positioning
1286 * scheme. But, by the same token, if we aren't able to use all
1287 * the row members, then the part of the row comparison that we
1288 * did use has to be treated as just a ">=" or "<=" condition, and
1289 * so we'd better adjust strat_total accordingly.
1290 */
1291 if (i == keysz - 1)
1292 {
1293 bool used_all_subkeys = false;
1294
1295 Assert(!(subkey->sk_flags & SK_ROW_END));
1296 for (;;)
1297 {
1298 subkey++;
1299 Assert(subkey->sk_flags & SK_ROW_MEMBER);
1300 if (subkey->sk_attno != keysz + 1)
1301 break; /* out-of-sequence, can't use it */
1302 if (subkey->sk_strategy != cur->sk_strategy)
1303 break; /* wrong direction, can't use it */
1304 if (subkey->sk_flags & SK_ISNULL)
1305 break; /* can't use null keys */
1306 Assert(keysz < INDEX_MAX_KEYS);
1307 memcpy(inskey.scankeys + keysz, subkey,
1308 sizeof(ScanKeyData));
1309 keysz++;
1310 if (subkey->sk_flags & SK_ROW_END)
1311 {
1312 used_all_subkeys = true;
1313 break;
1314 }
1315 }
1316 if (!used_all_subkeys)
1317 {
1318 switch (strat_total)
1319 {
1320 case BTLessStrategyNumber:
1321 strat_total = BTLessEqualStrategyNumber;
1322 break;
1323 case BTGreaterStrategyNumber:
1324 strat_total = BTGreaterEqualStrategyNumber;
1325 break;
1326 }
1327 }
1328 break; /* done with outer loop */
1329 }
1330 }
1331 else
1332 {
1333 /*
1334 * Ordinary comparison key. Transform the search-style scan key
1335 * to an insertion scan key by replacing the sk_func with the
1336 * appropriate btree comparison function.
1337 *
1338 * If scankey operator is not a cross-type comparison, we can use
1339 * the cached comparison function; otherwise gotta look it up in
1340 * the catalogs. (That can't lead to infinite recursion, since no
1341 * indexscan initiated by syscache lookup will use cross-data-type
1342 * operators.)
1343 *
1344 * We support the convention that sk_subtype == InvalidOid means
1345 * the opclass input type; this is a hack to simplify life for
1346 * ScanKeyInit().
1347 */
1348 if (cur->sk_subtype == rel->rd_opcintype[i] ||
1349 cur->sk_subtype == InvalidOid)
1350 {
1351 FmgrInfo *procinfo;
1352
1353 procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1354 ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1355 cur->sk_flags,
1356 cur->sk_attno,
1357 InvalidStrategy,
1358 cur->sk_subtype,
1359 cur->sk_collation,
1360 procinfo,
1361 cur->sk_argument);
1362 }
1363 else
1364 {
1365 RegProcedure cmp_proc;
1366
1367 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1368 rel->rd_opcintype[i],
1369 cur->sk_subtype,
1370 BTORDER_PROC);
1371 if (!RegProcedureIsValid(cmp_proc))
1372 elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1373 BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1374 cur->sk_attno, RelationGetRelationName(rel));
1375 ScanKeyEntryInitialize(inskey.scankeys + i,
1376 cur->sk_flags,
1377 cur->sk_attno,
1378 InvalidStrategy,
1379 cur->sk_subtype,
1380 cur->sk_collation,
1381 cmp_proc,
1382 cur->sk_argument);
1383 }
1384 }
1385 }
1386
1387 /*----------
1388 * Examine the selected initial-positioning strategy to determine exactly
1389 * where we need to start the scan, and set flag variables to control the
1390 * initial descent by _bt_search (and our _bt_binsrch call for the leaf
1391 * page _bt_search returns).
1392 *----------
1393 */
1394 _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1395 inskey.anynullkeys = false; /* unused */
1396 inskey.scantid = NULL;
1397 inskey.keysz = keysz;
1398 switch (strat_total)
1399 {
1400 case BTLessStrategyNumber:
1401
1402 inskey.nextkey = false;
1403 inskey.backward = true;
1404 break;
1405
1406 case BTLessEqualStrategyNumber:
1407
1408 inskey.nextkey = true;
1409 inskey.backward = true;
1410 break;
1411
1412 case BTEqualStrategyNumber:
1413
1414 /*
1415 * If a backward scan was specified, need to start with last equal
1416 * item not first one.
1417 */
1418 if (ScanDirectionIsBackward(dir))
1419 {
1420 /*
1421 * This is the same as the <= strategy
1422 */
1423 inskey.nextkey = true;
1424 inskey.backward = true;
1425 }
1426 else
1427 {
1428 /*
1429 * This is the same as the >= strategy
1430 */
1431 inskey.nextkey = false;
1432 inskey.backward = false;
1433 }
1434 break;
1435
1436 case BTGreaterEqualStrategyNumber:
1437
1438 /*
1439 * Find first item >= scankey
1440 */
1441 inskey.nextkey = false;
1442 inskey.backward = false;
1443 break;
1444
1445 case BTGreaterStrategyNumber:
1446
1447 /*
1448 * Find first item > scankey
1449 */
1450 inskey.nextkey = true;
1451 inskey.backward = false;
1452 break;
1453
1454 default:
1455 /* can't get here, but keep compiler quiet */
1456 elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1457 return false;
1458 }
1459
1460 /*
1461 * Use the manufactured insertion scan key to descend the tree and
1462 * position ourselves on the target leaf page.
1463 */
1464 Assert(ScanDirectionIsBackward(dir) == inskey.backward);
1465 stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1466
1467 /* don't need to keep the stack around... */
1468 _bt_freestack(stack);
1469
1470 if (!BufferIsValid(so->currPos.buf))
1471 {
1472 /*
1473 * We only get here if the index is completely empty. Lock relation
1474 * because nothing finer to lock exists. Without a buffer lock, it's
1475 * possible for another transaction to insert data between
1476 * _bt_search() and PredicateLockRelation(). We have to try again
1477 * after taking the relation-level predicate lock, to close a narrow
1478 * window where we wouldn't scan concurrently inserted tuples, but the
1479 * writer wouldn't see our predicate lock.
1480 */
1481 if (IsolationIsSerializable())
1482 {
1483 PredicateLockRelation(rel, scan->xs_snapshot);
1484 stack = _bt_search(rel, NULL, &inskey, &so->currPos.buf, BT_READ);
1485 _bt_freestack(stack);
1486 }
1487
1488 if (!BufferIsValid(so->currPos.buf))
1489 {
1490 Assert(!so->needPrimScan);
1491 _bt_parallel_done(scan);
1492 return false;
1493 }
1494 }
1495
1496 /* position to the precise item on the page */
1497 offnum = _bt_binsrch(rel, &inskey, so->currPos.buf);
1498
1499 /*
1500 * Now load data from the first page of the scan (usually the page
1501 * currently in so->currPos.buf).
1502 *
1503 * If inskey.nextkey = false and inskey.backward = false, offnum is
1504 * positioned at the first non-pivot tuple >= inskey.scankeys.
1505 *
1506 * If inskey.nextkey = false and inskey.backward = true, offnum is
1507 * positioned at the last non-pivot tuple < inskey.scankeys.
1508 *
1509 * If inskey.nextkey = true and inskey.backward = false, offnum is
1510 * positioned at the first non-pivot tuple > inskey.scankeys.
1511 *
1512 * If inskey.nextkey = true and inskey.backward = true, offnum is
1513 * positioned at the last non-pivot tuple <= inskey.scankeys.
1514 *
1515 * It's possible that _bt_binsrch returned an offnum that is out of bounds
1516 * for the page. For example, when inskey is both < the leaf page's high
1517 * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
1518 */
1519 if (!_bt_readfirstpage(scan, offnum, dir))
1520 return false;
1521
1522 _bt_returnitem(scan, so);
1523 return true;
1524}
int16 AttrNumber
Definition: attnum.h:21
#define RegProcedureIsValid(p)
Definition: c.h:748
regproc RegProcedure
Definition: c.h:621
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:907
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition: nbtpage.c:739
void _bt_preprocess_keys(IndexScanDesc scan)
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
Definition: nbtree.c:784
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:949
#define BTORDER_PROC
Definition: nbtree.h:717
#define SK_BT_PRIOR
Definition: nbtree.h:1142
#define SK_BT_NEXT
Definition: nbtree.h:1141
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:1021
#define P_NONE
Definition: nbtree.h:213
#define SK_BT_MAXVAL
Definition: nbtree.h:1140
#define BT_READ
Definition: nbtree.h:730
#define SK_BT_MINVAL
Definition: nbtree.h:1139
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, BlockNumber lastcurrblkno, ScanDirection dir, bool seized)
Definition: nbtsearch.c:2301
static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf)
Definition: nbtsearch.c:343
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2636
static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
Definition: nbtsearch.c:2213
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition: nbtsearch.c:102
static void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
Definition: nbtsearch.c:2084
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
Definition: nbtutils.c:611
#define INDEX_MAX_KEYS
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:694
#define InvalidOid
Definition: postgres_ext.h:35
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2576
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:50
#define SK_ROW_HEADER
Definition: skey.h:117
#define SK_ROW_MEMBER
Definition: skey.h:118
#define SK_SEARCHNOTNULL
Definition: skey.h:122
#define SK_ROW_END
Definition: skey.h:119
ScanKeyData * ScanKey
Definition: skey.h:75
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
#define InvalidStrategy
Definition: stratnum.h:24
#define BTLessStrategyNumber
Definition: stratnum.h:29
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
ScanKey high_compare
Definition: nbtree.h:1050
ScanKey low_compare
Definition: nbtree.h:1049
bool null_elem
Definition: nbtree.h:1047
BTArrayKeyInfo * arrayKeys
Definition: nbtree.h:1066
ScanKey keyData
Definition: nbtree.h:1058
Buffer buf
Definition: nbtree.h:964
Definition: fmgr.h:57
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:191
struct IndexScanInstrumentation * instrument
Definition: relscan.h:159
struct SnapshotData * xs_snapshot
Definition: relscan.h:138
StrategyNumber sk_strategy
Definition: skey.h:68
#define IsolationIsSerializable()
Definition: xact.h:52

References _bt_binsrch(), _bt_endpoint(), _bt_freestack(), _bt_metaversion(), _bt_parallel_done(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readfirstpage(), _bt_readnextpage(), _bt_returnitem(), _bt_search(), _bt_start_array_keys(), BTScanOpaqueData::arrayKeys, Assert(), BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosIsValid, BTScanPosData::buf, BufferIsValid(), cur, BTScanOpaqueData::currPos, DatumGetPointer(), elog, ERROR, get_opfamily_proc(), BTArrayKeyInfo::high_compare, i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, IndexScanDescData::instrument, InvalidBlockNumber, InvalidOid, InvalidStrategy, IsolationIsSerializable, BTScanOpaqueData::keyData, BTArrayKeyInfo::low_compare, BTScanOpaqueData::needPrimScan, IndexScanInstrumentation::nsearches, BTArrayKeyInfo::null_elem, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numberOfKeys, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, BTArrayKeyInfo::scan_key, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_MAXVAL, SK_BT_MINVAL, SK_BT_NEXT, SK_BT_NULLS_FIRST, SK_BT_PRIOR, SK_BT_SKIP, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, and IndexScanDescData::xs_snapshot.

Referenced by btgetbitmap(), and btgettuple().
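The driving loop in btgettuple()/btgetbitmap() is essentially _bt_first() once, then _bt_next() until it returns false. The sketch below shows that loop in its simplest form, assuming an already set-up IndexScanDesc and ignoring the outer loop that btgettuple() adds for scans with array keys.

#include "postgres.h"

#include "access/nbtree.h"
#include "access/relscan.h"

/* Sketch: count the matches a btree scan returns in the given direction. */
static uint64
count_matches(IndexScanDesc scan, ScanDirection dir)
{
    uint64      ntuples = 0;
    bool        res;

    /* _bt_first() positions the scan and returns the first match, if any */
    res = _bt_first(scan, dir);
    while (res)
    {
        ntuples++;                  /* current match's TID is scan->xs_heaptid */
        res = _bt_next(scan, dir);  /* advance to the next match */
    }

    return ntuples;
}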

◆ _bt_form_posting()

IndexTuple _bt_form_posting ( IndexTuple  base,
ItemPointer  htids,
int  nhtids 
)

Definition at line 864 of file nbtdedup.c.

865{
866 uint32 keysize,
867 newsize;
868 IndexTuple itup;
869
870 if (BTreeTupleIsPosting(base))
871 keysize = BTreeTupleGetPostingOffset(base);
872 else
873 keysize = IndexTupleSize(base);
874
875 Assert(!BTreeTupleIsPivot(base));
876 Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
877 Assert(keysize == MAXALIGN(keysize));
878
879 /* Determine final size of new tuple */
880 if (nhtids > 1)
881 newsize = MAXALIGN(keysize +
882 nhtids * sizeof(ItemPointerData));
883 else
884 newsize = keysize;
885
886 Assert(newsize <= INDEX_SIZE_MASK);
887 Assert(newsize == MAXALIGN(newsize));
888
889 /* Allocate memory using palloc0() (matches index_form_tuple()) */
890 itup = palloc0(newsize);
891 memcpy(itup, base, keysize);
892 itup->t_info &= ~INDEX_SIZE_MASK;
893 itup->t_info |= newsize;
894 if (nhtids > 1)
895 {
896 /* Form posting list tuple */
897 BTreeTupleSetPosting(itup, nhtids, keysize);
898 memcpy(BTreeTupleGetPosting(itup), htids,
899 sizeof(ItemPointerData) * nhtids);
900 Assert(_bt_posting_valid(itup));
901 }
902 else
903 {
904 /* Form standard non-pivot tuple */
905 itup->t_info &= ~INDEX_ALT_TID_MASK;
906 ItemPointerCopy(htids, &itup->t_tid);
907 Assert(ItemPointerIsValid(&itup->t_tid));
908 }
909
910 return itup;
911}
#define PG_UINT16_MAX
Definition: c.h:558
static void ItemPointerCopy(const ItemPointerData *fromPointer, ItemPointerData *toPointer)
Definition: itemptr.h:172
static bool ItemPointerIsValid(const ItemPointerData *pointer)
Definition: itemptr.h:83
void * palloc0(Size size)
Definition: mcxt.c:1347
static void BTreeTupleSetPosting(IndexTuple itup, uint16 nhtids, int postingoffset)
Definition: nbtree.h:505
unsigned short t_info
Definition: itup.h:49

References Assert(), BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTreeTupleSetPosting(), INDEX_SIZE_MASK, IndexTupleSize(), ItemPointerCopy(), ItemPointerIsValid(), MAXALIGN, palloc0(), PG_UINT16_MAX, IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_dedup_finish_pending(), _bt_sort_dedup_finish_pending(), and bt_posting_plain_tuple().
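A small illustration of the interface: given a base tuple and a sorted array of heap TIDs that all share its key value, the function returns a new palloc'd posting list tuple. The helper below is hypothetical and merges just two TIDs.

#include "postgres.h"

#include "access/nbtree.h"
#include "storage/itemptr.h"

/*
 * Sketch: merge two heap TIDs that share the same key value into a single
 * posting list tuple.  TIDs must be supplied in ascending order.
 */
static IndexTuple
make_two_tid_posting(IndexTuple base, ItemPointer tid1, ItemPointer tid2)
{
    ItemPointerData htids[2];

    htids[0] = *tid1;
    htids[1] = *tid2;

    /* result is palloc'd; key comes from "base", heap TIDs from htids[] */
    return _bt_form_posting(base, htids, 2);
}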

◆ _bt_freestack()

void _bt_freestack ( BTStack  stack)

Definition at line 187 of file nbtutils.c.

188{
189 BTStack ostack;
190
191 while (stack != NULL)
192 {
193 ostack = stack;
194 stack = stack->bts_parent;
195 pfree(ostack);
196 }
197}
struct BTStackData * bts_parent
Definition: nbtree.h:747

References BTStackData::bts_parent, and pfree().

Referenced by _bt_doinsert(), _bt_first(), and bt_rootdescend().
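The usual lifecycle is: _bt_search() builds the stack during the descent, the caller uses it (for example to walk back up during a page split), and _bt_freestack() releases it. A minimal read-only sketch, assuming an index tuple whose key we want to locate:

#include "postgres.h"

#include "access/nbtree.h"
#include "utils/rel.h"

/*
 * Sketch: descend to the leaf page that an index tuple's key belongs on,
 * then release the leaf buffer, the descent stack, and the scan key.
 */
static void
descend_and_release(Relation rel, IndexTuple itup)
{
    BTScanInsert key = _bt_mkscankey(rel, itup);
    BTStack     stack;
    Buffer      lbuf;

    stack = _bt_search(rel, NULL, key, &lbuf, BT_READ);

    /* ... inspect the read-locked leaf page here, if it exists ... */

    if (BufferIsValid(lbuf))
        _bt_relbuf(rel, lbuf);
    _bt_freestack(stack);       /* safe even when stack is NULL (one level) */
    pfree(key);
}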

◆ _bt_get_endpoint()

Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost 
)

Definition at line 2553 of file nbtsearch.c.

2554{
2555 Buffer buf;
2556 Page page;
2557 BTPageOpaque opaque;
2558 OffsetNumber offnum;
2559 BlockNumber blkno;
2560 IndexTuple itup;
2561
2562 /*
2563 * If we are looking for a leaf page, okay to descend from fast root;
2564 * otherwise better descend from true root. (There is no point in being
2565 * smarter about intermediate levels.)
2566 */
2567 if (level == 0)
2568 buf = _bt_getroot(rel, NULL, BT_READ);
2569 else
2570 buf = _bt_gettrueroot(rel);
2571
2572 if (!BufferIsValid(buf))
2573 return InvalidBuffer;
2574
2575 page = BufferGetPage(buf);
2576 opaque = BTPageGetOpaque(page);
2577
2578 for (;;)
2579 {
2580 /*
2581 * If we landed on a deleted page, step right to find a live page
2582 * (there must be one). Also, if we want the rightmost page, step
2583 * right if needed to get to it (this could happen if the page split
2584 * since we obtained a pointer to it).
2585 */
2586 while (P_IGNORE(opaque) ||
2587 (rightmost && !P_RIGHTMOST(opaque)))
2588 {
2589 blkno = opaque->btpo_next;
2590 if (blkno == P_NONE)
2591 elog(ERROR, "fell off the end of index \"%s\"",
2592 RelationGetRelationName(rel));
2593 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2594 page = BufferGetPage(buf);
2595 opaque = BTPageGetOpaque(page);
2596 }
2597
2598 /* Done? */
2599 if (opaque->btpo_level == level)
2600 break;
2601 if (opaque->btpo_level < level)
2602 ereport(ERROR,
2603 (errcode(ERRCODE_INDEX_CORRUPTED),
2604 errmsg_internal("btree level %u not found in index \"%s\"",
2605 level, RelationGetRelationName(rel))));
2606
2607 /* Descend to leftmost or rightmost child page */
2608 if (rightmost)
2609 offnum = PageGetMaxOffsetNumber(page);
2610 else
2611 offnum = P_FIRSTDATAKEY(opaque);
2612
2613 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2614 blkno = BTreeTupleGetDownLink(itup);
2615
2616 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2617 page = BufferGetPage(buf);
2618 opaque = BTPageGetOpaque(page);
2619 }
2620
2621 return buf;
2622}
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:1003
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:580
Buffer _bt_getroot(Relation rel, Relation heaprel, int access)
Definition: nbtpage.c:344
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:557
uint32 btpo_level
Definition: nbtree.h:67

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), buf, BufferGetPage(), BufferIsValid(), elog, ereport, errcode(), errmsg_internal(), ERROR, InvalidBuffer, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and RelationGetRelationName.

Referenced by _bt_endpoint(), and _bt_insert_parent().
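A minimal usage sketch (hypothetical helper): fetch the leftmost leaf page, do something with it, and release it. Passing rightmost = true instead returns the rightmost page of the requested level.

#include "postgres.h"

#include "access/nbtree.h"
#include "utils/rel.h"

/* Sketch: pin and read-lock the leftmost leaf page (level 0), if any. */
static void
visit_leftmost_leaf(Relation rel)
{
    Buffer      buf = _bt_get_endpoint(rel, 0, false);

    if (!BufferIsValid(buf))
        return;                 /* index is completely empty */

    /* ... examine BufferGetPage(buf) here; it is read-locked ... */

    _bt_relbuf(rel, buf);
}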

◆ _bt_getbuf()

Buffer _bt_getbuf ( Relation  rel,
BlockNumber  blkno,
int  access 
)

Definition at line 845 of file nbtpage.c.

846{
847 Buffer buf;
848
849 Assert(BlockNumberIsValid(blkno));
850
851 /* Read an existing block of the relation */
852 buf = ReadBuffer(rel, blkno);
853 _bt_lockbuf(rel, buf, access);
854 _bt_checkpage(rel, buf);
855
856 return buf;
857}
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:797
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1039
short access
Definition: preproc-type.c:36

References _bt_checkpage(), _bt_lockbuf(), Assert(), BlockNumberIsValid(), buf, and ReadBuffer().

Referenced by _bt_finish_split(), _bt_getroot(), _bt_getrootheight(), _bt_getstackbuf(), _bt_gettrueroot(), _bt_insertonpg(), _bt_killitems(), _bt_leftsib_splitflag(), _bt_lock_and_validate_left(), _bt_metaversion(), _bt_moveright(), _bt_newlevel(), _bt_pagedel(), _bt_readnextpage(), _bt_rightsib_halfdeadflag(), _bt_set_cleanup_info(), _bt_split(), _bt_unlink_halfdead_page(), and _bt_vacuum_needs_cleanup().
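Every _bt_getbuf() must be paired with _bt_relbuf() (or _bt_relandgetbuf()) once the caller is done with the page. The sketch below reads one field out of the metapage and releases it again, which is essentially the pattern _bt_getrootheight() (shown further down) uses.

#include "postgres.h"

#include "access/nbtree.h"
#include "utils/rel.h"

/* Sketch: read one metapage field and release the buffer again. */
static uint32
read_fastlevel(Relation rel)
{
    Buffer      metabuf;
    BTMetaPageData *metad;
    uint32      fastlevel;

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metad = BTPageGetMeta(BufferGetPage(metabuf));
    fastlevel = metad->btm_fastlevel;
    _bt_relbuf(rel, metabuf);   /* always pair _bt_getbuf with _bt_relbuf */

    return fastlevel;
}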

◆ _bt_getroot()

Buffer _bt_getroot ( Relation  rel,
Relation  heaprel,
int  access 
)

Definition at line 344 of file nbtpage.c.

345{
346 Buffer metabuf;
347 Buffer rootbuf;
348 Page rootpage;
349 BTPageOpaque rootopaque;
350 BlockNumber rootblkno;
351 uint32 rootlevel;
352 BTMetaPageData *metad;
353
354 Assert(access == BT_READ || heaprel != NULL);
355
356 /*
357 * Try to use previously-cached metapage data to find the root. This
358 * normally saves one buffer access per index search, which is a very
359 * helpful savings in bufmgr traffic and hence contention.
360 */
361 if (rel->rd_amcache != NULL)
362 {
363 metad = (BTMetaPageData *) rel->rd_amcache;
364 /* We shouldn't have cached it if any of these fail */
365 Assert(metad->btm_magic == BTREE_MAGIC);
366 Assert(metad->btm_version >= BTREE_MIN_VERSION);
367 Assert(metad->btm_version <= BTREE_VERSION);
368 Assert(!metad->btm_allequalimage ||
369 metad->btm_version > BTREE_NOVAC_VERSION);
370 Assert(metad->btm_root != P_NONE);
371
372 rootblkno = metad->btm_fastroot;
373 Assert(rootblkno != P_NONE);
374 rootlevel = metad->btm_fastlevel;
375
376 rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
377 rootpage = BufferGetPage(rootbuf);
378 rootopaque = BTPageGetOpaque(rootpage);
379
380 /*
381 * Since the cache might be stale, we check the page more carefully
382 * here than normal. We *must* check that it's not deleted. If it's
383 * not alone on its level, then we reject too --- this may be overly
384 * paranoid but better safe than sorry. Note we don't check P_ISROOT,
385 * because that's not set in a "fast root".
386 */
387 if (!P_IGNORE(rootopaque) &&
388 rootopaque->btpo_level == rootlevel &&
389 P_LEFTMOST(rootopaque) &&
390 P_RIGHTMOST(rootopaque))
391 {
392 /* OK, accept cached page as the root */
393 return rootbuf;
394 }
395 _bt_relbuf(rel, rootbuf);
396 /* Cache is stale, throw it away */
397 if (rel->rd_amcache)
398 pfree(rel->rd_amcache);
399 rel->rd_amcache = NULL;
400 }
401
402 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
403 metad = _bt_getmeta(rel, metabuf);
404
405 /* if no root page initialized yet, do it */
406 if (metad->btm_root == P_NONE)
407 {
408 Page metapg;
409
410 /* If access = BT_READ, caller doesn't want us to create root yet */
411 if (access == BT_READ)
412 {
413 _bt_relbuf(rel, metabuf);
414 return InvalidBuffer;
415 }
416
417 /* trade in our read lock for a write lock */
418 _bt_unlockbuf(rel, metabuf);
419 _bt_lockbuf(rel, metabuf, BT_WRITE);
420
421 /*
422 * Race condition: if someone else initialized the metadata between
423 * the time we released the read lock and acquired the write lock, we
424 * must avoid doing it again.
425 */
426 if (metad->btm_root != P_NONE)
427 {
428 /*
429 * Metadata initialized by someone else. In order to guarantee no
430 * deadlocks, we have to release the metadata page and start all
431 * over again. (Is that really true? But it's hardly worth trying
432 * to optimize this case.)
433 */
434 _bt_relbuf(rel, metabuf);
435 return _bt_getroot(rel, heaprel, access);
436 }
437
438 /*
439 * Get, initialize, write, and leave a lock of the appropriate type on
440 * the new root page. Since this is the first page in the tree, it's
441 * a leaf as well as the root.
442 */
443 rootbuf = _bt_allocbuf(rel, heaprel);
444 rootblkno = BufferGetBlockNumber(rootbuf);
445 rootpage = BufferGetPage(rootbuf);
446 rootopaque = BTPageGetOpaque(rootpage);
447 rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
448 rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
449 rootopaque->btpo_level = 0;
450 rootopaque->btpo_cycleid = 0;
451 /* Get raw page pointer for metapage */
452 metapg = BufferGetPage(metabuf);
453
454 /* NO ELOG(ERROR) till meta is updated */
455 START_CRIT_SECTION();
456
457 /* upgrade metapage if needed */
458 if (metad->btm_version < BTREE_NOVAC_VERSION)
459 _bt_upgrademetapage(metapg);
460
461 metad->btm_root = rootblkno;
462 metad->btm_level = 0;
463 metad->btm_fastroot = rootblkno;
464 metad->btm_fastlevel = 0;
465 metad->btm_last_cleanup_num_delpages = 0;
466 metad->btm_last_cleanup_num_heap_tuples = -1.0;
467
468 MarkBufferDirty(rootbuf);
469 MarkBufferDirty(metabuf);
470
471 /* XLOG stuff */
472 if (RelationNeedsWAL(rel))
473 {
474 xl_btree_newroot xlrec;
475 XLogRecPtr recptr;
476 xl_btree_metadata md;
477
478 XLogBeginInsert();
479 XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
480 XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
481
482 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
483 md.version = metad->btm_version;
484 md.root = rootblkno;
485 md.level = 0;
486 md.fastroot = rootblkno;
487 md.fastlevel = 0;
488 md.last_cleanup_num_delpages = 0;
489 md.allequalimage = metad->btm_allequalimage;
490
491 XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));
492
493 xlrec.rootblk = rootblkno;
494 xlrec.level = 0;
495
496 XLogRegisterData(&xlrec, SizeOfBtreeNewroot);
497
498 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
499
500 PageSetLSN(rootpage, recptr);
501 PageSetLSN(metapg, recptr);
502 }
503
504 END_CRIT_SECTION();
505
506 /*
507 * swap root write lock for read lock. There is no danger of anyone
508 * else accessing the new root page while it's unlocked, since no one
509 * else knows where it is yet.
510 */
511 _bt_unlockbuf(rel, rootbuf);
512 _bt_lockbuf(rel, rootbuf, BT_READ);
513
514 /* okay, metadata is correct, release lock on it without caching */
515 _bt_relbuf(rel, metabuf);
516 }
517 else
518 {
519 rootblkno = metad->btm_fastroot;
520 Assert(rootblkno != P_NONE);
521 rootlevel = metad->btm_fastlevel;
522
523 /*
524 * Cache the metapage data for next time
525 */
526 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
527 sizeof(BTMetaPageData));
528 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
529
530 /*
531 * We are done with the metapage; arrange to release it via first
532 * _bt_relandgetbuf call
533 */
534 rootbuf = metabuf;
535
536 for (;;)
537 {
538 rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
539 rootpage = BufferGetPage(rootbuf);
540 rootopaque = BTPageGetOpaque(rootpage);
541
542 if (!P_IGNORE(rootopaque))
543 break;
544
545 /* it's dead, Jim. step right one page */
546 if (P_RIGHTMOST(rootopaque))
547 elog(ERROR, "no live root page found in index \"%s\"",
549 rootblkno = rootopaque->btpo_next;
550 }
551
552 if (rootopaque->btpo_level != rootlevel)
553 elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
554 rootblkno, RelationGetRelationName(rel),
555 rootopaque->btpo_level, rootlevel);
556 }
557
558 /*
559 * By here, we have a pin and read lock on the root page, and no lock set
560 * on the metadata page. Return the root page's buffer.
561 */
562 return rootbuf;
563}
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1181
void _bt_upgrademetapage(Page page)
Definition: nbtpage.c:107
Buffer _bt_allocbuf(Relation rel, Relation heaprel)
Definition: nbtpage.c:869
static BTMetaPageData * _bt_getmeta(Relation rel, Buffer metabuf)
Definition: nbtpage.c:142
void _bt_unlockbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1070
#define BTREE_MIN_VERSION
Definition: nbtree.h:152
#define BTP_LEAF
Definition: nbtree.h:77
#define BTREE_MAGIC
Definition: nbtree.h:150
#define BTP_ROOT
Definition: nbtree.h:78
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:347
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:37
uint32 btm_last_cleanup_num_delpages
Definition: nbtree.h:115
uint32 btm_level
Definition: nbtree.h:109
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:117
BlockNumber btm_fastroot
Definition: nbtree.h:110
uint32 btm_version
Definition: nbtree.h:107
uint32 btm_magic
Definition: nbtree.h:106
bool btm_allequalimage
Definition: nbtree.h:119
uint32 btm_fastlevel
Definition: nbtree.h:111
BlockNumber btpo_prev
Definition: nbtree.h:65
void * rd_amcache
Definition: rel.h:229
MemoryContext rd_indexcxt
Definition: rel.h:204
uint32 level
Definition: nbtxlog.h:50
uint32 version
Definition: nbtxlog.h:48
bool allequalimage
Definition: nbtxlog.h:54
BlockNumber fastroot
Definition: nbtxlog.h:51
uint32 fastlevel
Definition: nbtxlog.h:52
BlockNumber root
Definition: nbtxlog.h:49
uint32 last_cleanup_num_delpages
Definition: nbtxlog.h:53
uint32 level
Definition: nbtxlog.h:344
BlockNumber rootblk
Definition: nbtxlog.h:343
#define REGBUF_WILL_INIT
Definition: xloginsert.h:34

References _bt_allocbuf(), _bt_getbuf(), _bt_getmeta(), _bt_getroot(), _bt_lockbuf(), _bt_relandgetbuf(), _bt_relbuf(), _bt_unlockbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert(), BT_READ, BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_LEAF, BTP_ROOT, BTPageGetOpaque, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, BufferGetBlockNumber(), BufferGetPage(), elog, END_CRIT_SECTION, ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, InvalidBuffer, xl_btree_metadata::last_cleanup_num_delpages, xl_btree_metadata::level, xl_btree_newroot::level, MarkBufferDirty(), MemoryContextAlloc(), P_IGNORE, P_LEFTMOST, P_NONE, P_RIGHTMOST, PageSetLSN(), pfree(), RelationData::rd_amcache, RelationData::rd_indexcxt, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, xl_btree_metadata::root, xl_btree_newroot::rootblk, SizeOfBtreeNewroot, START_CRIT_SECTION, xl_btree_metadata::version, XLOG_BTREE_NEWROOT, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_get_endpoint(), _bt_getroot(), and _bt_search().
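
A minimal caller-side sketch of how the buffer returned here is typically consumed: take the pinned, read-locked root, inspect it, and hand it back with _bt_relbuf(). The helper name example_root_level is hypothetical and not part of nbtree.

#include "access/nbtree.h"

/* Hypothetical helper: report the level of the current (fast) root page. */
static uint32
example_root_level(Relation rel, Relation heaprel)
{
	Buffer		rootbuf;
	BTPageOpaque rootopaque;
	uint32		level;

	/* with BT_READ access this returns InvalidBuffer for an empty index */
	rootbuf = _bt_getroot(rel, heaprel, BT_READ);
	if (!BufferIsValid(rootbuf))
		return 0;

	rootopaque = BTPageGetOpaque(BufferGetPage(rootbuf));
	level = rootopaque->btpo_level;

	_bt_relbuf(rel, rootbuf);	/* drop lock and pin together */
	return level;
}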

◆ _bt_getrootheight()

int _bt_getrootheight ( Relation  rel)

Definition at line 675 of file nbtpage.c.

676{
677 BTMetaPageData *metad;
678
679 if (rel->rd_amcache == NULL)
680 {
681 Buffer metabuf;
682
683 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
684 metad = _bt_getmeta(rel, metabuf);
685
686 /*
687 * If there's no root page yet, _bt_getroot() doesn't expect a cache
688 * to be made, so just stop here and report the index height is zero.
689 * (XXX perhaps _bt_getroot() should be changed to allow this case.)
690 */
691 if (metad->btm_root == P_NONE)
692 {
693 _bt_relbuf(rel, metabuf);
694 return 0;
695 }
696
697 /*
698 * Cache the metapage data for next time
699 */
700 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
701 sizeof(BTMetaPageData));
702 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
703 _bt_relbuf(rel, metabuf);
704 }
705
706 /* Get cached page */
707 metad = (BTMetaPageData *) rel->rd_amcache;
708 /* We shouldn't have cached it if any of these fail */
709 Assert(metad->btm_magic == BTREE_MAGIC);
710 Assert(metad->btm_version >= BTREE_MIN_VERSION);
711 Assert(metad->btm_version <= BTREE_VERSION);
712 Assert(!metad->btm_allequalimage ||
713 metad->btm_version > BTREE_NOVAC_VERSION);
714 Assert(metad->btm_fastroot != P_NONE);
715
716 return metad->btm_fastlevel;
717}

References _bt_getbuf(), _bt_getmeta(), _bt_relbuf(), Assert(), BT_READ, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, MemoryContextAlloc(), P_NONE, RelationData::rd_amcache, and RelationData::rd_indexcxt.

Referenced by _bt_insertonpg(), and btgettreeheight().
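
Because the metapage data is cached in rel->rd_amcache, this call is cheap enough to use in planning-style checks. A hedged sketch follows; the helper name and the threshold are illustrative only.

#include "access/nbtree.h"

/* Hypothetical helper: treat indexes with at least two internal levels as "tall". */
static bool
example_index_is_tall(Relation rel)
{
	int			height = _bt_getrootheight(rel);	/* 0 for an empty index */

	return height >= 2;
}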

◆ _bt_getstackbuf()

Buffer _bt_getstackbuf ( Relation  rel,
Relation  heaprel,
BTStack  stack,
BlockNumber  child 
)

Definition at line 2319 of file nbtinsert.c.

2320{
2321 BlockNumber blkno;
2322 OffsetNumber start;
2323
2324 blkno = stack->bts_blkno;
2325 start = stack->bts_offset;
2326
2327 for (;;)
2328 {
2329 Buffer buf;
2330 Page page;
2331 BTPageOpaque opaque;
2332
2333 buf = _bt_getbuf(rel, blkno, BT_WRITE);
2334 page = BufferGetPage(buf);
2335 opaque = BTPageGetOpaque(page);
2336
2337 Assert(heaprel != NULL);
2338 if (P_INCOMPLETE_SPLIT(opaque))
2339 {
2340 _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
2341 continue;
2342 }
2343
2344 if (!P_IGNORE(opaque))
2345 {
2346 OffsetNumber offnum,
2347 minoff,
2348 maxoff;
2349 ItemId itemid;
2350 IndexTuple item;
2351
2352 minoff = P_FIRSTDATAKEY(opaque);
2353 maxoff = PageGetMaxOffsetNumber(page);
2354
2355 /*
2356 * start = InvalidOffsetNumber means "search the whole page". We
2357 * need this test anyway due to possibility that page has a high
2358 * key now when it didn't before.
2359 */
2360 if (start < minoff)
2361 start = minoff;
2362
2363 /*
2364 * Need this check too, to guard against possibility that page
2365 * split since we visited it originally.
2366 */
2367 if (start > maxoff)
2368 start = OffsetNumberNext(maxoff);
2369
2370 /*
2371 * These loops will check every item on the page --- but in an
2372 * order that's attuned to the probability of where it actually
2373 * is. Scan to the right first, then to the left.
2374 */
2375 for (offnum = start;
2376 offnum <= maxoff;
2377 offnum = OffsetNumberNext(offnum))
2378 {
2379 itemid = PageGetItemId(page, offnum);
2380 item = (IndexTuple) PageGetItem(page, itemid);
2381
2382 if (BTreeTupleGetDownLink(item) == child)
2383 {
2384 /* Return accurate pointer to where link is now */
2385 stack->bts_blkno = blkno;
2386 stack->bts_offset = offnum;
2387 return buf;
2388 }
2389 }
2390
2391 for (offnum = OffsetNumberPrev(start);
2392 offnum >= minoff;
2393 offnum = OffsetNumberPrev(offnum))
2394 {
2395 itemid = PageGetItemId(page, offnum);
2396 item = (IndexTuple) PageGetItem(page, itemid);
2397
2398 if (BTreeTupleGetDownLink(item) == child)
2399 {
2400 /* Return accurate pointer to where link is now */
2401 stack->bts_blkno = blkno;
2402 stack->bts_offset = offnum;
2403 return buf;
2404 }
2405 }
2406 }
2407
2408 /*
2409 * The item we're looking for moved right at least one page.
2410 *
2411 * Lehman and Yao couple/chain locks when moving right here, which we
2412 * can avoid. See nbtree/README.
2413 */
2414 if (P_RIGHTMOST(opaque))
2415 {
2416 _bt_relbuf(rel, buf);
2417 return InvalidBuffer;
2418 }
2419 blkno = opaque->btpo_next;
2420 start = InvalidOffsetNumber;
2421 _bt_relbuf(rel, buf);
2422 }
2423}
void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:2241
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:54
BlockNumber bts_blkno
Definition: nbtree.h:745
OffsetNumber bts_offset
Definition: nbtree.h:746

References _bt_finish_split(), _bt_getbuf(), _bt_relbuf(), Assert(), BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTreeTupleGetDownLink(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, buf, BufferGetPage(), InvalidBuffer, InvalidOffsetNumber, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), and start.

Referenced by _bt_insert_parent(), and _bt_lock_subtree_parent().
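
A sketch of the caller-side pattern (compare _bt_insert_parent()): re-find the parent tuple whose downlink points at a just-split child, treating a failure to find it as corruption. The helper name is hypothetical.

#include "access/nbtree.h"

static Buffer
example_find_parent(Relation rel, Relation heaprel,
					BTStack stack, BlockNumber child)
{
	Buffer		pbuf = _bt_getstackbuf(rel, heaprel, stack, child);

	if (pbuf == InvalidBuffer)
		elog(ERROR, "failed to re-find parent key in index \"%s\"",
			 RelationGetRelationName(rel));

	/* stack->bts_blkno and stack->bts_offset now point at the downlink */
	return pbuf;				/* write-locked parent page */
}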

◆ _bt_gettrueroot()

Buffer _bt_gettrueroot ( Relation  rel)

Definition at line 580 of file nbtpage.c.

581{
582 Buffer metabuf;
583 Page metapg;
584 BTPageOpaque metaopaque;
585 Buffer rootbuf;
586 Page rootpage;
587 BTPageOpaque rootopaque;
588 BlockNumber rootblkno;
589 uint32 rootlevel;
590 BTMetaPageData *metad;
591
592 /*
593 * We don't try to use cached metapage data here, since (a) this path is
594 * not performance-critical, and (b) if we are here it suggests our cache
595 * is out-of-date anyway. In light of point (b), it's probably safest to
596 * actively flush any cached metapage info.
597 */
598 if (rel->rd_amcache)
599 pfree(rel->rd_amcache);
600 rel->rd_amcache = NULL;
601
602 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
603 metapg = BufferGetPage(metabuf);
604 metaopaque = BTPageGetOpaque(metapg);
605 metad = BTPageGetMeta(metapg);
606
607 if (!P_ISMETA(metaopaque) ||
608 metad->btm_magic != BTREE_MAGIC)
609 ereport(ERROR,
610 (errcode(ERRCODE_INDEX_CORRUPTED),
611 errmsg("index \"%s\" is not a btree",
612 RelationGetRelationName(rel))));
613
614 if (metad->btm_version < BTREE_MIN_VERSION ||
615 metad->btm_version > BTREE_VERSION)
616 ereport(ERROR,
617 (errcode(ERRCODE_INDEX_CORRUPTED),
618 errmsg("version mismatch in index \"%s\": file version %d, "
619 "current version %d, minimal supported version %d",
620 RelationGetRelationName(rel),
621 metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
622
623 /* if no root page initialized yet, fail */
624 if (metad->btm_root == P_NONE)
625 {
626 _bt_relbuf(rel, metabuf);
627 return InvalidBuffer;
628 }
629
630 rootblkno = metad->btm_root;
631 rootlevel = metad->btm_level;
632
633 /*
634 * We are done with the metapage; arrange to release it via first
635 * _bt_relandgetbuf call
636 */
637 rootbuf = metabuf;
638
639 for (;;)
640 {
641 rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
642 rootpage = BufferGetPage(rootbuf);
643 rootopaque = BTPageGetOpaque(rootpage);
644
645 if (!P_IGNORE(rootopaque))
646 break;
647
648 /* it's dead, Jim. step right one page */
649 if (P_RIGHTMOST(rootopaque))
650 elog(ERROR, "no live root page found in index \"%s\"",
651 RelationGetRelationName(rel));
652 rootblkno = rootopaque->btpo_next;
653 }
654
655 if (rootopaque->btpo_level != rootlevel)
656 elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
657 rootblkno, RelationGetRelationName(rel),
658 rootopaque->btpo_level, rootlevel);
659
660 return rootbuf;
661}
#define P_ISMETA(opaque)
Definition: nbtree.h:224

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_VERSION, BufferGetPage(), elog, ereport, errcode(), errmsg(), ERROR, InvalidBuffer, P_IGNORE, P_ISMETA, P_NONE, P_RIGHTMOST, pfree(), RelationData::rd_amcache, and RelationGetRelationName.

Referenced by _bt_get_endpoint().

◆ _bt_initmetapage()

void _bt_initmetapage ( Page  page,
BlockNumber  rootbknum,
uint32  level,
bool  allequalimage 
)

Definition at line 67 of file nbtpage.c.

69{
70 BTMetaPageData *metad;
71 BTPageOpaque metaopaque;
72
73 _bt_pageinit(page, BLCKSZ);
74
75 metad = BTPageGetMeta(page);
76 metad->btm_magic = BTREE_MAGIC;
77 metad->btm_version = BTREE_VERSION;
78 metad->btm_root = rootbknum;
79 metad->btm_level = level;
80 metad->btm_fastroot = rootbknum;
81 metad->btm_fastlevel = level;
82 metad->btm_last_cleanup_num_delpages = 0;
83 metad->btm_last_cleanup_num_heap_tuples = -1.0;
84 metad->btm_allequalimage = allequalimage;
85
86 metaopaque = BTPageGetOpaque(page);
87 metaopaque->btpo_flags = BTP_META;
88
89 /*
90 * Set pd_lower just past the end of the metadata. This is essential,
91 * because without doing so, metadata will be lost if xlog.c compresses
92 * the page.
93 */
94 ((PageHeader) page)->pd_lower =
95 ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
96}
PageHeaderData * PageHeader
Definition: bufpage.h:174
#define BTP_META
Definition: nbtree.h:80

References _bt_pageinit(), BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_META, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, BTREE_MAGIC, and BTREE_VERSION.

Referenced by _bt_uppershutdown(), and btbuildempty().
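
A hedged sketch of how a metapage might be assembled in local memory before being written out; the real callers, _bt_uppershutdown() and btbuildempty(), add WAL logging and storage writes on top of this. The helper name is hypothetical.

#include "access/nbtree.h"

/* Hypothetical helper: build a metapage for an index with no root yet. */
static Page
example_build_metapage(bool allequalimage)
{
	Page		metapage = (Page) palloc0(BLCKSZ);

	/* no root yet: root block P_NONE, level 0 */
	_bt_initmetapage(metapage, P_NONE, 0, allequalimage);
	return metapage;
}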

◆ _bt_keep_natts_fast()

int _bt_keep_natts_fast ( Relation  rel,
IndexTuple  lastleft,
IndexTuple  firstright 
)

Definition at line 4009 of file nbtutils.c.

4010{
4011 TupleDesc itupdesc = RelationGetDescr(rel);
4012 int keysz = IndexRelationGetNumberOfKeyAttributes(rel);
4013 int keepnatts;
4014
4015 keepnatts = 1;
4016 for (int attnum = 1; attnum <= keysz; attnum++)
4017 {
4018 Datum datum1,
4019 datum2;
4020 bool isNull1,
4021 isNull2;
4022 CompactAttribute *att;
4023
4024 datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
4025 datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
4026 att = TupleDescCompactAttr(itupdesc, attnum - 1);
4027
4028 if (isNull1 != isNull2)
4029 break;
4030
4031 if (!isNull1 &&
4032 !datum_image_eq(datum1, datum2, att->attbyval, att->attlen))
4033 break;
4034
4035 keepnatts++;
4036 }
4037
4038 return keepnatts;
4039}
bool datum_image_eq(Datum value1, Datum value2, bool typByVal, int typLen)
Definition: datum.c:266
int16 attnum
Definition: pg_attribute.h:74
int16 attlen
Definition: tupdesc.h:71
static CompactAttribute * TupleDescCompactAttr(TupleDesc tupdesc, int i)
Definition: tupdesc.h:175

References CompactAttribute::attbyval, CompactAttribute::attlen, attnum, datum_image_eq(), index_getattr(), IndexRelationGetNumberOfKeyAttributes, RelationGetDescr, and TupleDescCompactAttr().

Referenced by _bt_afternewitemoff(), _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_do_singleval(), _bt_keep_natts(), _bt_load(), _bt_set_startikey(), _bt_split_penalty(), and _bt_strategy().
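
The return value is a split/deduplication heuristic: the fewer leading attributes two neighboring tuples share, the cheaper the pivot tuple that suffix truncation can produce between them. A small hedged sketch in the spirit of _bt_split_penalty(); the helper name is illustrative.

#include "access/nbtree.h"

/* Hypothetical helper: pick whichever of two candidate split points allows
 * more aggressive suffix truncation (a lower "keep" count is better). */
static bool
example_first_split_is_better(Relation rel,
							  IndexTuple lastleft1, IndexTuple firstright1,
							  IndexTuple lastleft2, IndexTuple firstright2)
{
	return _bt_keep_natts_fast(rel, lastleft1, firstright1) <=
		_bt_keep_natts_fast(rel, lastleft2, firstright2);
}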

◆ _bt_killitems()

void _bt_killitems ( IndexScanDesc  scan)

Definition at line 3310 of file nbtutils.c.

3311{
3312 BTScanOpaque so = (BTScanOpaque) scan->opaque;
3313 Page page;
3314 BTPageOpaque opaque;
3315 OffsetNumber minoff;
3316 OffsetNumber maxoff;
3317 int i;
3318 int numKilled = so->numKilled;
3319 bool killedsomething = false;
3320 bool droppedpin PG_USED_FOR_ASSERTS_ONLY;
3321
3322 Assert(BTScanPosIsValid(so->currPos));
3323
3324 /*
3325 * Always reset the scan state, so we don't look for same items on other
3326 * pages.
3327 */
3328 so->numKilled = 0;
3329
3330 if (BTScanPosIsPinned(so->currPos))
3331 {
3332 /*
3333 * We have held the pin on this page since we read the index tuples,
3334 * so all we need to do is lock it. The pin will have prevented
3335 * re-use of any TID on the page, so there is no need to check the
3336 * LSN.
3337 */
3338 droppedpin = false;
3339 _bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
3340
3341 page = BufferGetPage(so->currPos.buf);
3342 }
3343 else
3344 {
3345 Buffer buf;
3346
3347 droppedpin = true;
3348 /* Attempt to re-read the buffer, getting pin and lock. */
3349 buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ);
3350
3351 page = BufferGetPage(buf);
3352 if (BufferGetLSNAtomic(buf) == so->currPos.lsn)
3353 so->currPos.buf = buf;
3354 else
3355 {
3356 /* Modified while not pinned means hinting is not safe. */
3357 _bt_relbuf(scan->indexRelation, buf);
3358 return;
3359 }
3360 }
3361
3362 opaque = BTPageGetOpaque(page);
3363 minoff = P_FIRSTDATAKEY(opaque);
3364 maxoff = PageGetMaxOffsetNumber(page);
3365
3366 for (i = 0; i < numKilled; i++)
3367 {
3368 int itemIndex = so->killedItems[i];
3369 BTScanPosItem *kitem = &so->currPos.items[itemIndex];
3370 OffsetNumber offnum = kitem->indexOffset;
3371
3372 Assert(itemIndex >= so->currPos.firstItem &&
3373 itemIndex <= so->currPos.lastItem);
3374 if (offnum < minoff)
3375 continue; /* pure paranoia */
3376 while (offnum <= maxoff)
3377 {
3378 ItemId iid = PageGetItemId(page, offnum);
3379 IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
3380 bool killtuple = false;
3381
3382 if (BTreeTupleIsPosting(ituple))
3383 {
3384 int pi = i + 1;
3385 int nposting = BTreeTupleGetNPosting(ituple);
3386 int j;
3387
3388 /*
3389 * We rely on the convention that heap TIDs in the scanpos
3390 * items array are stored in ascending heap TID order for a
3391 * group of TIDs that originally came from a posting list
3392 * tuple. This convention even applies during backwards
3393 * scans, where returning the TIDs in descending order might
3394 * seem more natural. This is about effectiveness, not
3395 * correctness.
3396 *
3397 * Note that the page may have been modified in almost any way
3398 * since we first read it (in the !droppedpin case), so it's
3399 * possible that this posting list tuple wasn't a posting list
3400 * tuple when we first encountered its heap TIDs.
3401 */
3402 for (j = 0; j < nposting; j++)
3403 {
3404 ItemPointer item = BTreeTupleGetPostingN(ituple, j);
3405
3406 if (!ItemPointerEquals(item, &kitem->heapTid))
3407 break; /* out of posting list loop */
3408
3409 /*
3410 * kitem must have matching offnum when heap TIDs match,
3411 * though only in the common case where the page can't
3412 * have been concurrently modified
3413 */
3414 Assert(kitem->indexOffset == offnum || !droppedpin);
3415
3416 /*
3417 * Read-ahead to later kitems here.
3418 *
3419 * We rely on the assumption that not advancing kitem here
3420 * will prevent us from considering the posting list tuple
3421 * fully dead by not matching its next heap TID in next
3422 * loop iteration.
3423 *
3424 * If, on the other hand, this is the final heap TID in
3425 * the posting list tuple, then tuple gets killed
3426 * regardless (i.e. we handle the case where the last
3427 * kitem is also the last heap TID in the last index tuple
3428 * correctly -- posting tuple still gets killed).
3429 */
3430 if (pi < numKilled)
3431 kitem = &so->currPos.items[so->killedItems[pi++]];
3432 }
3433
3434 /*
3435 * Don't bother advancing the outermost loop's int iterator to
3436 * avoid processing killed items that relate to the same
3437 * offnum/posting list tuple. This micro-optimization hardly
3438 * seems worth it. (Further iterations of the outermost loop
3439 * will fail to match on this same posting list's first heap
3440 * TID instead, so we'll advance to the next offnum/index
3441 * tuple pretty quickly.)
3442 */
3443 if (j == nposting)
3444 killtuple = true;
3445 }
3446 else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
3447 killtuple = true;
3448
3449 /*
3450 * Mark index item as dead, if it isn't already. Since this
3451 * happens while holding a buffer lock possibly in shared mode,
3452 * it's possible that multiple processes attempt to do this
3453 * simultaneously, leading to multiple full-page images being sent
3454 * to WAL (if wal_log_hints or data checksums are enabled), which
3455 * is undesirable.
3456 */
3457 if (killtuple && !ItemIdIsDead(iid))
3458 {
3459 /* found the item/all posting list items */
3460 ItemIdMarkDead(iid);
3461 killedsomething = true;
3462 break; /* out of inner search loop */
3463 }
3464 offnum = OffsetNumberNext(offnum);
3465 }
3466 }
3467
3468 /*
3469 * Since this can be redone later if needed, mark as dirty hint.
3470 *
3471 * Whenever we mark anything LP_DEAD, we also set the page's
3472 * BTP_HAS_GARBAGE flag, which is likewise just a hint. (Note that we
3473 * only rely on the page-level flag in !heapkeyspace indexes.)
3474 */
3475 if (killedsomething)
3476 {
3477 opaque->btpo_flags |= BTP_HAS_GARBAGE;
3478 MarkBufferDirtyHint(so->currPos.buf, true);
3479 }
3480
3481 _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
3482}
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:4423
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:5367
int j
Definition: isn.c:78
#define ItemIdMarkDead(itemId)
Definition: itemid.h:179
#define BTP_HAS_GARBAGE
Definition: nbtree.h:83
int * killedItems
Definition: nbtree.h:1071
BlockNumber currPage
Definition: nbtree.h:967
int firstItem
Definition: nbtree.h:995
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:999
XLogRecPtr lsn
Definition: nbtree.h:970
ItemPointerData heapTid
Definition: nbtree.h:957
OffsetNumber indexOffset
Definition: nbtree.h:958

References _bt_getbuf(), _bt_lockbuf(), _bt_relbuf(), _bt_unlockbuf(), Assert(), BT_READ, BTP_HAS_GARBAGE, BTPageGetOpaque, BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), BTScanPosIsPinned, BTScanPosIsValid, buf, BTScanPosData::buf, BufferGetLSNAtomic(), BufferGetPage(), BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanPosData::firstItem, BTScanPosItem::heapTid, i, BTScanPosItem::indexOffset, IndexScanDescData::indexRelation, ItemIdIsDead, ItemIdMarkDead, ItemPointerEquals(), BTScanPosData::items, j, BTScanOpaqueData::killedItems, BTScanPosData::lsn, MarkBufferDirtyHint(), BTScanOpaqueData::numKilled, OffsetNumberNext, IndexScanDescData::opaque, P_FIRSTDATAKEY, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), PG_USED_FOR_ASSERTS_ONLY, and IndexTupleData::t_tid.

Referenced by _bt_steppage(), btendscan(), btrescan(), and btrestrpos().
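
A sketch of the caller-side protocol (compare btgettuple()): remember the index of each item whose heap tuple turned out to be dead, then let _bt_killitems() translate those indexes into LP_DEAD hints before the scan position is dropped. Helper names are hypothetical.

#include "access/nbtree.h"

static void
example_remember_killed(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	if (so->killedItems == NULL)
		so->killedItems = (int *) palloc(MaxTIDsPerBTreePage * sizeof(int));
	if (so->numKilled < MaxTIDsPerBTreePage)
		so->killedItems[so->numKilled++] = so->currPos.itemIndex;
}

static void
example_flush_kill_hints(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	if (so->numKilled > 0)
		_bt_killitems(scan);	/* resets so->numKilled itself */
}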

◆ _bt_lockbuf()

void _bt_lockbuf ( Relation  rel,
Buffer  buf,
int  access 
)

Definition at line 1039 of file nbtpage.c.

1040{
1041 /* LockBuffer() asserts that pin is held by this backend */
1042 LockBuffer(buf, access);
1043
1044 /*
1045 * It doesn't matter that _bt_unlockbuf() won't get called in the event of
1046 * an nbtree error (e.g. a unique violation error). That won't cause
1047 * Valgrind false positives.
1048 *
1049 * The nbtree client requests are superimposed on top of the bufmgr.c
1050 * buffer pin client requests. In the event of an nbtree error the buffer
1051 * will certainly get marked as defined when the backend once again
1052 * acquires its first pin on the buffer. (Of course, if the backend never
1053 * touches the buffer again then it doesn't matter that it remains
1054 * non-accessible to Valgrind.)
1055 *
1056 * Note: When an IndexTuple C pointer gets computed using an ItemId read
1057 * from a page while a lock was held, the C pointer becomes unsafe to
1058 * dereference forever as soon as the lock is released. Valgrind can only
1059 * detect cases where the pointer gets dereferenced with no _current_
1060 * lock/pin held, though.
1061 */
1062 if (!RelationUsesLocalBuffers(rel))
1063 VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1064}
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5537

References buf, BufferGetPage(), LockBuffer(), RelationUsesLocalBuffers, and VALGRIND_MAKE_MEM_DEFINED.

Referenced by _bt_getbuf(), _bt_getroot(), _bt_killitems(), _bt_moveright(), _bt_pagedel(), _bt_relandgetbuf(), _bt_search(), _bt_set_cleanup_info(), _bt_unlink_halfdead_page(), and btvacuumpage().
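
A minimal sketch of the expected pairing: a buffer that is already pinned can be locked and unlocked repeatedly through these wrappers, while _bt_relbuf() drops the lock and the pin in one call. The helper name is hypothetical.

#include "access/nbtree.h"

static void
example_peek_page(Relation rel, Buffer buf)
{
	_bt_lockbuf(rel, buf, BT_READ);
	/* ... examine BufferGetPage(buf) while the read lock is held ... */
	_bt_unlockbuf(rel, buf);	/* keeps the pin */
}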

◆ _bt_metaversion()

void _bt_metaversion ( Relation  rel,
bool *  heapkeyspace,
bool *  allequalimage 
)

Definition at line 739 of file nbtpage.c.

740{
741 BTMetaPageData *metad;
742
743 if (rel->rd_amcache == NULL)
744 {
745 Buffer metabuf;
746
747 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
748 metad = _bt_getmeta(rel, metabuf);
749
750 /*
751 * If there's no root page yet, _bt_getroot() doesn't expect a cache
752 * to be made, so just stop here. (XXX perhaps _bt_getroot() should
753 * be changed to allow this case.)
754 */
755 if (metad->btm_root == P_NONE)
756 {
757 *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
758 *allequalimage = metad->btm_allequalimage;
759
760 _bt_relbuf(rel, metabuf);
761 return;
762 }
763
764 /*
765 * Cache the metapage data for next time
766 *
767 * An on-the-fly version upgrade performed by _bt_upgrademetapage()
768 * can change the nbtree version for an index without invalidating any
769 * local cache. This is okay because it can only happen when moving
770 * from version 2 to version 3, both of which are !heapkeyspace
771 * versions.
772 */
773 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
774 sizeof(BTMetaPageData));
775 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
776 _bt_relbuf(rel, metabuf);
777 }
778
779 /* Get cached page */
780 metad = (BTMetaPageData *) rel->rd_amcache;
781 /* We shouldn't have cached it if any of these fail */
782 Assert(metad->btm_magic == BTREE_MAGIC);
783 Assert(metad->btm_version >= BTREE_MIN_VERSION);
784 Assert(metad->btm_version <= BTREE_VERSION);
785 Assert(!metad->btm_allequalimage ||
786 metad->btm_version > BTREE_NOVAC_VERSION);
787 Assert(metad->btm_fastroot != P_NONE);
788
789 *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
790 *allequalimage = metad->btm_allequalimage;
791}

References _bt_getbuf(), _bt_getmeta(), _bt_relbuf(), Assert(), BT_READ, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, BTREE_VERSION, MemoryContextAlloc(), P_NONE, RelationData::rd_amcache, and RelationData::rd_indexcxt.

Referenced by _bt_first(), _bt_mkscankey(), and bt_index_check_callback().

◆ _bt_mkscankey()

BTScanInsert _bt_mkscankey ( Relation  rel,
IndexTuple  itup 
)

Definition at line 95 of file nbtutils.c.

96{
97 BTScanInsert key;
98 ScanKey skey;
99 TupleDesc itupdesc;
100 int indnkeyatts;
101 int16 *indoption;
102 int tupnatts;
103 int i;
104
105 itupdesc = RelationGetDescr(rel);
106 indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
107 indoption = rel->rd_indoption;
108 tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0;
109
110 Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel));
111
112 /*
113 * We'll execute search using scan key constructed on key columns.
114 * Truncated attributes and non-key attributes are omitted from the final
115 * scan key.
116 */
117 key = palloc(offsetof(BTScanInsertData, scankeys) +
118 sizeof(ScanKeyData) * indnkeyatts);
119 if (itup)
120 _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage);
121 else
122 {
123 /* Utility statement callers can set these fields themselves */
124 key->heapkeyspace = true;
125 key->allequalimage = false;
126 }
127 key->anynullkeys = false; /* initial assumption */
128 key->nextkey = false; /* usual case, required by btinsert */
129 key->backward = false; /* usual case, required by btinsert */
130 key->keysz = Min(indnkeyatts, tupnatts);
131 key->scantid = key->heapkeyspace && itup ?
132 BTreeTupleGetHeapTID(itup) : NULL;
133 skey = key->scankeys;
134 for (i = 0; i < indnkeyatts; i++)
135 {
136 FmgrInfo *procinfo;
137 Datum arg;
138 bool null;
139 int flags;
140
141 /*
142 * We can use the cached (default) support procs since no cross-type
143 * comparison can be needed.
144 */
145 procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
146
147 /*
148 * Key arguments built from truncated attributes (or when caller
149 * provides no tuple) are defensively represented as NULL values. They
150 * should never be used.
151 */
152 if (i < tupnatts)
153 arg = index_getattr(itup, i + 1, itupdesc, &null);
154 else
155 {
156 arg = (Datum) 0;
157 null = true;
158 }
159 flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
160 ScanKeyEntryInitializeWithInfo(&skey[i],
161 flags,
162 (AttrNumber) (i + 1),
163 InvalidStrategy,
164 InvalidOid,
165 rel->rd_indcollation[i],
166 procinfo,
167 arg);
168 /* Record if any key attribute is NULL (or truncated) */
169 if (null)
170 key->anynullkeys = true;
171 }
172
173 /*
174 * In NULLS NOT DISTINCT mode, we pretend that there are no null keys, so
175 * that full uniqueness check is done.
176 */
177 if (rel->rd_index->indnullsnotdistinct)
178 key->anynullkeys = false;
179
180 return key;
181}
#define SK_BT_INDOPTION_SHIFT
Definition: nbtree.h:1145
int16 * rd_indoption
Definition: rel.h:211
Form_pg_index rd_index
Definition: rel.h:192

References _bt_metaversion(), arg, Assert(), BTORDER_PROC, BTreeTupleGetHeapTID(), BTreeTupleGetNAtts, i, index_getattr(), index_getprocinfo(), IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, InvalidOid, InvalidStrategy, sort-test::key, Min, palloc(), RelationData::rd_indcollation, RelationData::rd_index, RelationData::rd_indoption, RelationGetDescr, ScanKeyEntryInitializeWithInfo(), SK_BT_INDOPTION_SHIFT, and SK_ISNULL.

Referenced by _bt_doinsert(), _bt_leafbuild(), _bt_pagedel(), bt_mkscankey_pivotsearch(), bt_rootdescend(), tuplesort_begin_cluster(), and tuplesort_begin_index_btree().
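
A hedged sketch of the usual insertion-path pairing (compare _bt_doinsert()): build an insertion scan key from an index tuple and use it to descend to the leaf page that would contain that tuple. The helper name is hypothetical, and the locking responsibilities of the real caller are omitted.

#include "access/nbtree.h"

static Buffer
example_descend_for_tuple(Relation rel, Relation heaprel, IndexTuple itup)
{
	BTScanInsert itup_key = _bt_mkscankey(rel, itup);
	Buffer		leafbuf;
	BTStack		stack;

	stack = _bt_search(rel, heaprel, itup_key, &leafbuf, BT_READ);
	_bt_freestack(stack);
	pfree(itup_key);

	/* caller now holds a pin and read lock on the target leaf page */
	return leafbuf;
}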

◆ _bt_next()

bool _bt_next ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 1541 of file nbtsearch.c.

1542{
1543 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1544
1545 Assert(BTScanPosIsValid(so->currPos));
1546
1547 /*
1548 * Advance to next tuple on current page; or if there's no more, try to
1549 * step to the next page with data.
1550 */
1551 if (ScanDirectionIsForward(dir))
1552 {
1553 if (++so->currPos.itemIndex > so->currPos.lastItem)
1554 {
1555 if (!_bt_steppage(scan, dir))
1556 return false;
1557 }
1558 }
1559 else
1560 {
1561 if (--so->currPos.itemIndex < so->currPos.firstItem)
1562 {
1563 if (!_bt_steppage(scan, dir))
1564 return false;
1565 }
1566 }
1567
1568 _bt_returnitem(scan, so);
1569 return true;
1570}
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:2110
int lastItem
Definition: nbtree.h:996
int itemIndex
Definition: nbtree.h:997

References _bt_returnitem(), _bt_steppage(), Assert(), BTScanPosIsValid, BTScanOpaqueData::currPos, BTScanPosData::firstItem, BTScanPosData::itemIndex, BTScanPosData::lastItem, IndexScanDescData::opaque, and ScanDirectionIsForward.

Referenced by btgetbitmap(), and btgettuple().
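
A simplified sketch of the btgettuple()-style fetch loop: _bt_first() positions the scan and returns the first match, after which each _bt_next() call advances the scan by one tuple in the requested direction. This ignores array-key primitive rescans and kill-tuple bookkeeping; the helper name is hypothetical.

#include "access/nbtree.h"

static int64
example_count_matches(IndexScanDesc scan, ScanDirection dir)
{
	int64		ntuples = 0;

	if (_bt_first(scan, dir))
	{
		ntuples++;
		while (_bt_next(scan, dir))
			ntuples++;
	}
	return ntuples;
}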

◆ _bt_pagedel()

void _bt_pagedel ( Relation  rel,
Buffer  leafbuf,
BTVacState vstate 
)

Definition at line 1802 of file nbtpage.c.

1803{
1804 BlockNumber rightsib;
1805 bool rightsib_empty;
1806 Page page;
1807 BTPageOpaque opaque;
1808
1809 /*
1810 * Save original leafbuf block number from caller. Only deleted blocks
1811 * that are <= scanblkno are added to bulk delete stat's pages_deleted
1812 * count.
1813 */
1814 BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
1815
1816 /*
1817 * "stack" is a search stack leading (approximately) to the target page.
1818 * It is initially NULL, but when iterating, we keep it to avoid
1819 * duplicated search effort.
1820 *
1821 * Also, when "stack" is not NULL, we have already checked that the
1822 * current page is not the right half of an incomplete split, i.e. the
1823 * left sibling does not have its INCOMPLETE_SPLIT flag set, including
1824 * when the current target page is to the right of caller's initial page
1825 * (the scanblkno page).
1826 */
1827 BTStack stack = NULL;
1828
1829 for (;;)
1830 {
1831 page = BufferGetPage(leafbuf);
1832 opaque = BTPageGetOpaque(page);
1833
1834 /*
1835 * Internal pages are never deleted directly, only as part of deleting
1836 * the whole subtree all the way down to leaf level.
1837 *
1838 * Also check for deleted pages here. Caller never passes us a fully
1839 * deleted page. Only VACUUM can delete pages, so there can't have
1840 * been a concurrent deletion. Assume that we reached any deleted
1841 * page encountered here by following a sibling link, and that the
1842 * index is corrupt.
1843 */
1844 Assert(!P_ISDELETED(opaque));
1845 if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
1846 {
1847 /*
1848 * Pre-9.4 page deletion only marked internal pages as half-dead,
1849 * but now we only use that flag on leaf pages. The old algorithm
1850 * was never supposed to leave half-dead pages in the tree, it was
1851 * just a transient state, but it was nevertheless possible in
1852 * error scenarios. We don't know how to deal with them here. They
1853 * are harmless as far as searches are considered, but inserts
1854 * into the deleted keyspace could add out-of-order downlinks in
1855 * the upper levels. Log a notice, hopefully the admin will notice
1856 * and reindex.
1857 */
1858 if (P_ISHALFDEAD(opaque))
1859 ereport(LOG,
1860 (errcode(ERRCODE_INDEX_CORRUPTED),
1861 errmsg("index \"%s\" contains a half-dead internal page",
1862 RelationGetRelationName(rel)),
1863 errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
1864
1865 if (P_ISDELETED(opaque))
1866 ereport(LOG,
1867 (errcode(ERRCODE_INDEX_CORRUPTED),
1868 errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
1869 BufferGetBlockNumber(leafbuf),
1870 scanblkno,
1871 RelationGetRelationName(rel))));
1872
1873 _bt_relbuf(rel, leafbuf);
1874 return;
1875 }
1876
1877 /*
1878 * We can never delete rightmost pages nor root pages. While at it,
1879 * check that page is empty, since it's possible that the leafbuf page
1880 * was empty a moment ago, but has since had some inserts.
1881 *
1882 * To keep the algorithm simple, we also never delete an incompletely
1883 * split page (they should be rare enough that this doesn't make any
1884 * meaningful difference to disk usage):
1885 *
1886 * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
1887 * left half of an incomplete split, but ensuring that it's not the
1888 * right half is more complicated. For that, we have to check that
1889 * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
1890 * _bt_leftsib_splitflag(). On the first iteration, we temporarily
1891 * release the lock on scanblkno/leafbuf, check the left sibling, and
1892 * construct a search stack to scanblkno. On subsequent iterations,
1893 * we know we stepped right from a page that passed these tests, so
1894 * it's OK.
1895 */
1896 if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
1897 P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
1898 P_INCOMPLETE_SPLIT(opaque))
1899 {
1900 /* Should never fail to delete a half-dead page */
1901 Assert(!P_ISHALFDEAD(opaque));
1902
1903 _bt_relbuf(rel, leafbuf);
1904 return;
1905 }
1906
1907 /*
1908 * First, remove downlink pointing to the page (or a parent of the
1909 * page, if we are going to delete a taller subtree), and mark the
1910 * leafbuf page half-dead
1911 */
1912 if (!P_ISHALFDEAD(opaque))
1913 {
1914 /*
1915 * We need an approximate pointer to the page's parent page. We
1916 * use a variant of the standard search mechanism to search for
1917 * the page's high key; this will give us a link to either the
1918 * current parent or someplace to its left (if there are multiple
1919 * equal high keys, which is possible with !heapkeyspace indexes).
1920 *
1921 * Also check if this is the right-half of an incomplete split
1922 * (see comment above).
1923 */
1924 if (!stack)
1925 {
1926 BTScanInsert itup_key;
1927 ItemId itemid;
1928 IndexTuple targetkey;
1929 BlockNumber leftsib,
1930 leafblkno;
1931 Buffer sleafbuf;
1932
1933 itemid = PageGetItemId(page, P_HIKEY);
1934 targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
1935
1936 leftsib = opaque->btpo_prev;
1937 leafblkno = BufferGetBlockNumber(leafbuf);
1938
1939 /*
1940 * To avoid deadlocks, we'd better drop the leaf page lock
1941 * before going further.
1942 */
1943 _bt_unlockbuf(rel, leafbuf);
1944
1945 /*
1946 * Check that the left sibling of leafbuf (if any) is not
1947 * marked with INCOMPLETE_SPLIT flag before proceeding
1948 */
1949 Assert(leafblkno == scanblkno);
1950 if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
1951 {
1952 ReleaseBuffer(leafbuf);
1953 return;
1954 }
1955
1956 /*
1957 * We need an insertion scan key, so build one.
1958 *
1959 * _bt_search searches for the leaf page that contains any
1960 * matching non-pivot tuples, but we need it to "search" for
1961 * the high key pivot from the page that we're set to delete.
1962 * Compensate for the mismatch by having _bt_search locate the
1963 * last position < equal-to-untruncated-prefix non-pivots.
1964 */
1965 itup_key = _bt_mkscankey(rel, targetkey);
1966
1967 /* Set up a BTLessStrategyNumber-like insertion scan key */
1968 itup_key->nextkey = false;
1969 itup_key->backward = true;
1970 stack = _bt_search(rel, NULL, itup_key, &sleafbuf, BT_READ);
1971 /* won't need a second lock or pin on leafbuf */
1972 _bt_relbuf(rel, sleafbuf);
1973
1974 /*
1975 * Re-lock the leaf page, and start over to use our stack
1976 * within _bt_mark_page_halfdead. We must do it that way
1977 * because it's possible that leafbuf can no longer be
1978 * deleted. We need to recheck.
1979 *
1980 * Note: We can't simply hold on to the sleafbuf lock instead,
1981 * because it's barely possible that sleafbuf is not the same
1982 * page as leafbuf. This happens when leafbuf split after our
1983 * original lock was dropped, but before _bt_search finished
1984 * its descent. We rely on the assumption that we'll find
1985 * leafbuf isn't safe to delete anymore in this scenario.
1986 * (Page deletion can cope with the stack being to the left of
1987 * leafbuf, but not to the right of leafbuf.)
1988 */
1989 _bt_lockbuf(rel, leafbuf, BT_WRITE);
1990 continue;
1991 }
1992
1993 /*
1994 * See if it's safe to delete the leaf page, and determine how
1995 * many parent/internal pages above the leaf level will be
1996 * deleted. If it's safe then _bt_mark_page_halfdead will also
1997 * perform the first phase of deletion, which includes marking the
1998 * leafbuf page half-dead.
1999 */
2000 Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
2001 if (!_bt_mark_page_halfdead(rel, vstate->info->heaprel, leafbuf,
2002 stack))
2003 {
2004 _bt_relbuf(rel, leafbuf);
2005 return;
2006 }
2007 }
2008
2009 /*
2010 * Then unlink it from its siblings. Each call to
2011 * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
2012 * making it shallower. Iterate until the leafbuf page is deleted.
2013 */
2014 rightsib_empty = false;
2015 Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
2016 while (P_ISHALFDEAD(opaque))
2017 {
2018 /* Check for interrupts in _bt_unlink_halfdead_page */
2019 if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
2020 &rightsib_empty, vstate))
2021 {
2022 /*
2023 * _bt_unlink_halfdead_page should never fail, since we
2024 * established that deletion is generally safe in
2025 * _bt_mark_page_halfdead -- index must be corrupt.
2026 *
2027 * Note that _bt_unlink_halfdead_page already released the
2028 * lock and pin on leafbuf for us.
2029 */
2030 Assert(false);
2031 return;
2032 }
2033 }
2034
2035 Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
2036
2037 rightsib = opaque->btpo_next;
2038
2039 _bt_relbuf(rel, leafbuf);
2040
2041 /*
2042 * Check here, as calling loops will have locks held, preventing
2043 * interrupts from being processed.
2044 */
2045 CHECK_FOR_INTERRUPTS();
2046
2047 /*
2048 * The page has now been deleted. If its right sibling is completely
2049 * empty, it's possible that the reason we haven't deleted it earlier
2050 * is that it was the rightmost child of the parent. Now that we
2051 * removed the downlink for this page, the right sibling might now be
2052 * the only child of the parent, and could be removed. It would be
2053 * picked up by the next vacuum anyway, but might as well try to
2054 * remove it now, so loop back to process the right sibling.
2055 *
2056 * Note: This relies on the assumption that _bt_getstackbuf() will be
2057 * able to reuse our original descent stack with a different child
2058 * block (provided that the child block is to the right of the
2059 * original leaf page reached by _bt_search()). It will even update
2060 * the descent stack each time we loop around, avoiding repeated work.
2061 */
2062 if (!rightsib_empty)
2063 break;
2064
2065 leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2066 }
2067}
#define LOG
Definition: elog.h:31
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:547
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
static bool _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
Definition: nbtpage.c:1695
static bool _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf, BTStack stack)
Definition: nbtpage.c:2088
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, bool *rightsib_empty, BTVacState *vstate)
Definition: nbtpage.c:2314
#define P_ISHALFDEAD(opaque)
Definition: nbtree.h:225
#define P_ISDELETED(opaque)
Definition: nbtree.h:223
#define P_ISROOT(opaque)
Definition: nbtree.h:222
IndexVacuumInfo * info
Definition: nbtree.h:333
Relation heaprel
Definition: genam.h:70

References _bt_getbuf(), _bt_leftsib_splitflag(), _bt_lockbuf(), _bt_mark_page_halfdead(), _bt_mkscankey(), _bt_relbuf(), _bt_search(), _bt_unlink_halfdead_page(), _bt_unlockbuf(), Assert(), BTScanInsertData::backward, BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BufferGetBlockNumber(), BufferGetPage(), CHECK_FOR_INTERRUPTS, CopyIndexTuple(), ereport, errcode(), errhint(), errmsg(), errmsg_internal(), IndexVacuumInfo::heaprel, BTVacState::info, LOG, BTScanInsertData::nextkey, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, P_ISROOT, P_RIGHTMOST, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), RelationGetRelationName, and ReleaseBuffer().

Referenced by btvacuumpage().
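
A hedged sketch of the call site shape (compare btvacuumpage()): the caller passes in a write-locked leaf buffer and must not touch it afterwards, because _bt_pagedel() always releases the lock and pin on leafbuf itself. The helper name is hypothetical and skips the emptiness checks the real caller performs first.

#include "access/nbtree.h"

static void
example_try_delete_leaf(Relation rel, BlockNumber blkno, BTVacState *vstate)
{
	Buffer		leafbuf = _bt_getbuf(rel, blkno, BT_WRITE);

	_bt_pagedel(rel, leafbuf, vstate);
	/* no _bt_relbuf() here: _bt_pagedel() already released leafbuf */
}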

◆ _bt_pageinit()

void _bt_pageinit ( Page  page,
Size  size 
)

Definition at line 1129 of file nbtpage.c.

1130{
1131 PageInit(page, size, sizeof(BTPageOpaqueData));
1132}
void PageInit(Page page, Size pageSize, Size specialSize)
Definition: bufpage.c:42

References PageInit().

Referenced by _bt_allocbuf(), _bt_blnewpage(), _bt_initmetapage(), _bt_restore_meta(), _bt_split(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), and btree_xlog_unlink_page().

◆ _bt_parallel_build_main()

void _bt_parallel_build_main ( dsm_segment seg,
shm_toc toc 
)

Definition at line 1742 of file nbtsort.c.

1743{
1744 char *sharedquery;
1745 BTSpool *btspool;
1746 BTSpool *btspool2;
1747 BTShared *btshared;
1748 Sharedsort *sharedsort;
1749 Sharedsort *sharedsort2;
1750 Relation heapRel;
1751 Relation indexRel;
1752 LOCKMODE heapLockmode;
1753 LOCKMODE indexLockmode;
1754 WalUsage *walusage;
1755 BufferUsage *bufferusage;
1756 int sortmem;
1757
1758#ifdef BTREE_BUILD_STATS
1759 if (log_btree_build_stats)
1760 ResetUsage();
1761#endif /* BTREE_BUILD_STATS */
1762
1763 /*
1764 * The only possible status flag that can be set to the parallel worker is
1765 * PROC_IN_SAFE_IC.
1766 */
1767 Assert((MyProc->statusFlags == 0) ||
1768 (MyProc->statusFlags == PROC_IN_SAFE_IC));
1769
1770 /* Set debug_query_string for individual workers first */
1771 sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
1772 debug_query_string = sharedquery;
1773
1774 /* Report the query string from leader */
1775 pgstat_report_activity(STATE_RUNNING, debug_query_string);
1776
1777 /* Look up nbtree shared state */
1778 btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
1779
1780 /* Open relations using lock modes known to be obtained by index.c */
1781 if (!btshared->isconcurrent)
1782 {
1783 heapLockmode = ShareLock;
1784 indexLockmode = AccessExclusiveLock;
1785 }
1786 else
1787 {
1788 heapLockmode = ShareUpdateExclusiveLock;
1789 indexLockmode = RowExclusiveLock;
1790 }
1791
1792 /* Track query ID */
1793 pgstat_report_query_id(btshared->queryid, false);
1794
1795 /* Open relations within worker */
1796 heapRel = table_open(btshared->heaprelid, heapLockmode);
1797 indexRel = index_open(btshared->indexrelid, indexLockmode);
1798
1799 /* Initialize worker's own spool */
1800 btspool = (BTSpool *) palloc0(sizeof(BTSpool));
1801 btspool->heap = heapRel;
1802 btspool->index = indexRel;
1803 btspool->isunique = btshared->isunique;
1804 btspool->nulls_not_distinct = btshared->nulls_not_distinct;
1805
1806 /* Look up shared state private to tuplesort.c */
1807 sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
1808 tuplesort_attach_shared(sharedsort, seg);
1809 if (!btshared->isunique)
1810 {
1811 btspool2 = NULL;
1812 sharedsort2 = NULL;
1813 }
1814 else
1815 {
1816 /* Allocate memory for worker's own private secondary spool */
1817 btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
1818
1819 /* Initialize worker's own secondary spool */
1820 btspool2->heap = btspool->heap;
1821 btspool2->index = btspool->index;
1822 btspool2->isunique = false;
1823 /* Look up shared state private to tuplesort.c */
1824 sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
1825 tuplesort_attach_shared(sharedsort2, seg);
1826 }
1827
1828 /* Prepare to track buffer usage during parallel execution */
1829 InstrStartParallelQuery();
1830
1831 /* Perform sorting of spool, and possibly a spool2 */
1832 sortmem = maintenance_work_mem / btshared->scantuplesortstates;
1833 _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
1834 sharedsort2, sortmem, false);
1835
1836 /* Report WAL/buffer usage during parallel execution */
1837 bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
1838 walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
1839 InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
1840 &walusage[ParallelWorkerNumber]);
1841
1842#ifdef BTREE_BUILD_STATS
1843 if (log_btree_build_stats)
1844 {
1845 ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
1846 ResetUsage();
1847 }
1848#endif /* BTREE_BUILD_STATS */
1849
1850 index_close(indexRel, indexLockmode);
1851 table_close(heapRel, heapLockmode);
1852}
int ParallelWorkerNumber
Definition: parallel.c:115
void pgstat_report_query_id(uint64 query_id, bool force)
void pgstat_report_activity(BackendState state, const char *cmd_str)
@ STATE_RUNNING
int maintenance_work_mem
Definition: globals.c:133
bool log_btree_build_stats
Definition: guc_tables.c:512
void index_close(Relation relation, LOCKMODE lockmode)
Definition: indexam.c:177
Relation index_open(Oid relationId, LOCKMODE lockmode)
Definition: indexam.c:133
void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage)
Definition: instrument.c:208
void InstrStartParallelQuery(void)
Definition: instrument.c:200
int LOCKMODE
Definition: lockdefs.h:26
#define AccessExclusiveLock
Definition: lockdefs.h:43
#define ShareUpdateExclusiveLock
Definition: lockdefs.h:39
#define ShareLock
Definition: lockdefs.h:40
#define RowExclusiveLock
Definition: lockdefs.h:38
#define PARALLEL_KEY_BUFFER_USAGE
Definition: nbtsort.c:66
#define PARALLEL_KEY_TUPLESORT_SPOOL2
Definition: nbtsort.c:63
static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, BTShared *btshared, Sharedsort *sharedsort, Sharedsort *sharedsort2, int sortmem, bool progress)
Definition: nbtsort.c:1867
#define PARALLEL_KEY_BTREE_SHARED
Definition: nbtsort.c:61
#define PARALLEL_KEY_TUPLESORT
Definition: nbtsort.c:62
#define PARALLEL_KEY_QUERY_TEXT
Definition: nbtsort.c:64
#define PARALLEL_KEY_WAL_USAGE
Definition: nbtsort.c:65
const char * debug_query_string
Definition: postgres.c:88
void ShowUsage(const char *title)
Definition: postgres.c:5061
void ResetUsage(void)
Definition: postgres.c:5054
#define PROC_IN_SAFE_IC
Definition: proc.h:59
void * shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
Definition: shm_toc.c:232
PGPROC * MyProc
Definition: proc.c:66
bool isconcurrent
Definition: nbtsort.c:104
Oid heaprelid
Definition: nbtsort.c:100
bool isunique
Definition: nbtsort.c:102
int scantuplesortstates
Definition: nbtsort.c:105
uint64 queryid
Definition: nbtsort.c:108
Oid indexrelid
Definition: nbtsort.c:101
bool nulls_not_distinct
Definition: nbtsort.c:103
bool isunique
Definition: nbtsort.c:84
bool nulls_not_distinct
Definition: nbtsort.c:85
Relation heap
Definition: nbtsort.c:82
Relation index
Definition: nbtsort.c:83
uint8 statusFlags
Definition: proc.h:243
void table_close(Relation relation, LOCKMODE lockmode)
Definition: table.c:126
Relation table_open(Oid relationId, LOCKMODE lockmode)
Definition: table.c:40
void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg)
Definition: tuplesort.c:2961

References _bt_parallel_scan_and_sort(), AccessExclusiveLock, Assert(), debug_query_string, BTSpool::heap, BTShared::heaprelid, BTSpool::index, index_close(), index_open(), BTShared::indexrelid, InstrEndParallelQuery(), InstrStartParallelQuery(), BTShared::isconcurrent, BTSpool::isunique, BTShared::isunique, log_btree_build_stats, maintenance_work_mem, MyProc, BTSpool::nulls_not_distinct, BTShared::nulls_not_distinct, palloc0(), PARALLEL_KEY_BTREE_SHARED, PARALLEL_KEY_BUFFER_USAGE, PARALLEL_KEY_QUERY_TEXT, PARALLEL_KEY_TUPLESORT, PARALLEL_KEY_TUPLESORT_SPOOL2, PARALLEL_KEY_WAL_USAGE, ParallelWorkerNumber, pgstat_report_activity(), pgstat_report_query_id(), PROC_IN_SAFE_IC, BTShared::queryid, ResetUsage(), RowExclusiveLock, BTShared::scantuplesortstates, ShareLock, ShareUpdateExclusiveLock, shm_toc_lookup(), ShowUsage(), STATE_RUNNING, PGPROC::statusFlags, table_close(), table_open(), and tuplesort_attach_shared().

◆ _bt_parallel_done()

void _bt_parallel_done ( IndexScanDesc  scan)

Definition at line 949 of file nbtree.c.

950{
951 BTScanOpaque so = (BTScanOpaque) scan->opaque;
952 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
953 BTParallelScanDesc btscan;
954 bool status_changed = false;
955
956 Assert(!BTScanPosIsValid(so->currPos));
957
958 /* Do nothing, for non-parallel scans */
959 if (parallel_scan == NULL)
960 return;
961
962 /*
963 * Should not mark parallel scan done when there's still a pending
964 * primitive index scan
965 */
966 if (so->needPrimScan)
967 return;
968
969 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
970 parallel_scan->ps_offset_am);
971
972 /*
973 * Mark the parallel scan as done, unless some other process did so
974 * already
975 */
976 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
977 Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
978 if (btscan->btps_pageStatus != BTPARALLEL_DONE)
979 {
980 btscan->btps_pageStatus = BTPARALLEL_DONE;
981 status_changed = true;
982 }
983 LWLockRelease(&btscan->btps_lock);
984
985 /* wake up all the workers associated with this parallel scan */
986 if (status_changed)
987 ConditionVariableBroadcast(&btscan->btps_cv);
988}
#define OffsetToPointer(base, offset)
Definition: c.h:743
void ConditionVariableBroadcast(ConditionVariable *cv)
@ BTPARALLEL_NEED_PRIMSCAN
Definition: nbtree.c:57
@ BTPARALLEL_DONE
Definition: nbtree.c:60
struct BTParallelScanDescData * BTParallelScanDesc
Definition: nbtree.c:93

References Assert(), BTPARALLEL_DONE, BTPARALLEL_NEED_PRIMSCAN, BTScanPosIsValid, ConditionVariableBroadcast(), BTScanOpaqueData::currPos, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTScanOpaqueData::needPrimScan, OffsetToPointer, IndexScanDescData::opaque, and IndexScanDescData::parallel_scan.

Referenced by _bt_endpoint(), _bt_first(), _bt_parallel_seize(), _bt_readnextpage(), and _bt_start_prim_scan().

◆ _bt_parallel_primscan_schedule()

void _bt_parallel_primscan_schedule ( IndexScanDesc  scan,
BlockNumber  curr_page 
)

Definition at line 999 of file nbtree.c.

1000{
1001 Relation rel = scan->indexRelation;
1002 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1003 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1004 BTParallelScanDesc btscan;
1005
1006 Assert(so->numArrayKeys);
1007
1008 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1009 parallel_scan->ps_offset_am);
1010
1011 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1012 if (btscan->btps_lastCurrPage == curr_page &&
1013 btscan->btps_pageStatus == BTPARALLEL_IDLE)
1014 {
1015 btscan->btps_nextScanPage = InvalidBlockNumber;
1016 btscan->btps_lastCurrPage = InvalidBlockNumber;
1017 btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
1018
1019 /* Serialize scan's current array keys */
1020 _bt_parallel_serialize_arrays(rel, btscan, so);
1021 }
1022 LWLockRelease(&btscan->btps_lock);
1023}
@ BTPARALLEL_IDLE
Definition: nbtree.c:59
static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
Definition: nbtree.c:631

References _bt_parallel_serialize_arrays(), Assert(), BTPARALLEL_IDLE, BTPARALLEL_NEED_PRIMSCAN, IndexScanDescData::indexRelation, InvalidBlockNumber, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTScanOpaqueData::numArrayKeys, OffsetToPointer, IndexScanDescData::opaque, and IndexScanDescData::parallel_scan.

Referenced by _bt_advance_array_keys(), and _bt_readpage().

◆ _bt_parallel_release()

void _bt_parallel_release ( IndexScanDesc  scan,
BlockNumber  next_scan_page,
BlockNumber  curr_page 
)

Definition at line 922 of file nbtree.c.

924{
925 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
926 BTParallelScanDesc btscan;
927
928 Assert(BlockNumberIsValid(next_scan_page));
929
930 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
931 parallel_scan->ps_offset_am);
932
933 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
934 btscan->btps_nextScanPage = next_scan_page;
935 btscan->btps_lastCurrPage = curr_page;
936 btscan->btps_pageStatus = BTPARALLEL_IDLE;
937 LWLockRelease(&btscan->btps_lock);
938 ConditionVariableSignal(&btscan->btps_cv);
939}
void ConditionVariableSignal(ConditionVariable *cv)
BTPS_State btps_pageStatus
Definition: nbtree.c:72
BlockNumber btps_lastCurrPage
Definition: nbtree.c:70
ConditionVariable btps_cv
Definition: nbtree.c:76
BlockNumber btps_nextScanPage
Definition: nbtree.c:69

References Assert(), BlockNumberIsValid(), BTPARALLEL_IDLE, BTParallelScanDescData::btps_cv, BTParallelScanDescData::btps_lastCurrPage, BTParallelScanDescData::btps_lock, BTParallelScanDescData::btps_nextScanPage, BTParallelScanDescData::btps_pageStatus, ConditionVariableSignal(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), OffsetToPointer, IndexScanDescData::parallel_scan, and ParallelIndexScanDescData::ps_offset_am.

Referenced by _bt_readnextpage(), and _bt_readpage().

◆ _bt_parallel_seize()

bool _bt_parallel_seize ( IndexScanDesc  scan,
BlockNumber next_scan_page,
BlockNumber last_curr_page,
bool  first 
)

Definition at line 784 of file nbtree.c.

786{
787 Relation rel = scan->indexRelation;
788 BTScanOpaque so = (BTScanOpaque) scan->opaque;
789 bool exit_loop = false,
790 status = true,
791 endscan = false;
792 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
793 BTParallelScanDesc btscan;
794
795 *next_scan_page = InvalidBlockNumber;
796 *last_curr_page = InvalidBlockNumber;
797
798 /*
799 * Reset so->currPos, and initialize moreLeft/moreRight such that the next
800 * call to _bt_readnextpage treats this backend similarly to a serial
801 * backend that steps from *last_curr_page to *next_scan_page (unless this
802 * backend's so->currPos is initialized by _bt_readfirstpage before then).
803 */
804 BTScanPosInvalidate(so->currPos);
805 so->currPos.moreLeft = so->currPos.moreRight = true;
806
807 if (first)
808 {
809 /*
810 * Initialize array related state when called from _bt_first, assuming
811 * that this will be the first primitive index scan for the scan
812 */
813 so->needPrimScan = false;
814 so->scanBehind = false;
815 so->oppositeDirCheck = false;
816 }
817 else
818 {
819 /*
820 * Don't attempt to seize the scan when it requires another primitive
821 * index scan, since caller's backend cannot start it right now
822 */
823 if (so->needPrimScan)
824 return false;
825 }
826
827 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
828 parallel_scan->ps_offset_am);
829
830 while (1)
831 {
832 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
833
834 if (btscan->btps_pageStatus == BTPARALLEL_DONE)
835 {
836 /* We're done with this parallel index scan */
837 status = false;
838 }
839 else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
840 btscan->btps_nextScanPage == P_NONE)
841 {
842 /* End this parallel index scan */
843 status = false;
844 endscan = true;
845 }
846 else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
847 {
848 Assert(so->numArrayKeys);
849
850 if (first)
851 {
852 /* Can start scheduled primitive scan right away, so do so */
853 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
854
855 /* Restore scan's array keys from serialized values */
856 _bt_parallel_restore_arrays(rel, btscan, so);
857 exit_loop = true;
858 }
859 else
860 {
861 /*
862 * Don't attempt to seize the scan when it requires another
863 * primitive index scan, since caller's backend cannot start
864 * it right now
865 */
866 status = false;
867 }
868
869 /*
870 * Either way, update backend local state to indicate that a
871 * pending primitive scan is required
872 */
873 so->needPrimScan = true;
874 so->scanBehind = false;
875 so->oppositeDirCheck = false;
876 }
877 else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
878 {
879 /*
880 * We have successfully seized control of the scan for the purpose
881 * of advancing it to a new page!
882 */
883 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
884 Assert(btscan->btps_nextScanPage != P_NONE);
885 *next_scan_page = btscan->btps_nextScanPage;
886 *last_curr_page = btscan->btps_lastCurrPage;
887 exit_loop = true;
888 }
889 LWLockRelease(&btscan->btps_lock);
890 if (exit_loop || !status)
891 break;
892 ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
893 }
894 ConditionVariableCancelSleep();
895
896 /* When the scan has reached the rightmost (or leftmost) page, end it */
897 if (endscan)
898 _bt_parallel_done(scan);
899
900 return status;
901}
bool ConditionVariableCancelSleep(void)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
@ BTPARALLEL_ADVANCING
Definition: nbtree.c:58
static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
Definition: nbtree.c:674
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:1027
bool moreRight
Definition: nbtree.h:986
bool moreLeft
Definition: nbtree.h:985

References _bt_parallel_done(), _bt_parallel_restore_arrays(), Assert(), BTPARALLEL_ADVANCING, BTPARALLEL_DONE, BTPARALLEL_IDLE, BTPARALLEL_NEED_PRIMSCAN, BTScanPosInvalidate, ConditionVariableCancelSleep(), ConditionVariableSleep(), BTScanOpaqueData::currPos, IndexScanDescData::indexRelation, InvalidBlockNumber, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, OffsetToPointer, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, P_NONE, IndexScanDescData::parallel_scan, and BTScanOpaqueData::scanBehind.

Referenced by _bt_first(), and _bt_readnextpage().
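
The branch structure above reduces to a small decision keyed on the shared page status. Below is a simplified, standalone model of that decision; the enum values and the decide() helper are invented for illustration and are not the server's API.

#include <stdio.h>
#include <stdbool.h>

typedef enum { DONE, IDLE, NEED_PRIMSCAN, ADVANCING } PageStatus;
typedef enum { STOP_SCAN, SEIZED, WAIT_AND_RETRY } Outcome;

/* Mirror of the per-iteration branches: stop, take over, or sleep and retry */
static Outcome
decide(PageStatus status, bool next_page_exists, bool first_call)
{
    if (status == DONE)
        return STOP_SCAN;       /* the whole parallel scan is finished */
    if (status == IDLE && !next_page_exists)
        return STOP_SCAN;       /* the scan ran off the end of the index */
    if (status == NEED_PRIMSCAN)
        return first_call ? SEIZED : STOP_SCAN;    /* only the _bt_first caller can restart */
    if (status != ADVANCING)
        return SEIZED;          /* idle with a next page available: take over */
    return WAIT_AND_RETRY;      /* another worker is currently advancing the scan */
}

int
main(void)
{
    printf("%d %d %d\n",
           decide(IDLE, true, false),            /* 1: seized */
           decide(ADVANCING, true, false),       /* 2: wait and retry */
           decide(NEED_PRIMSCAN, true, false));  /* 0: give up for now */
    return 0;
}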

◆ _bt_pendingfsm_finalize()

void _bt_pendingfsm_finalize ( Relation  rel,
BTVacState vstate 
)

Definition at line 2996 of file nbtpage.c.

2997{
2998 IndexBulkDeleteResult *stats = vstate->stats;
2999 Relation heaprel = vstate->info->heaprel;
3000
3001 Assert(stats->pages_newly_deleted >= vstate->npendingpages);
3002 Assert(heaprel != NULL);
3003
3004 if (vstate->npendingpages == 0)
3005 {
3006 /* Just free memory when nothing to do */
3007 if (vstate->pendingpages)
3008 pfree(vstate->pendingpages);
3009
3010 return;
3011 }
3012
3013#ifdef DEBUG_BTREE_PENDING_FSM
3014
3015 /*
3016 * Debugging aid: Sleep for 5 seconds to greatly increase the chances of
3017 * placing pending pages in the FSM. Note that the optimization will
3018 * never be effective without some other backend concurrently consuming an
3019 * XID.
3020 */
3021 pg_usleep(5000000L);
3022#endif
3023
3024 /*
3025 * Recompute VACUUM XID boundaries.
3026 *
3027 * We don't actually care about the oldest non-removable XID. Computing
3028 * the oldest such XID has a useful side-effect that we rely on: it
3029 * forcibly updates the XID horizon state for this backend. This step is
3030 * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize
3031 * that it is now safe to recycle newly deleted pages without this step.
3032 */
3033 (void) GetOldestNonRemovableTransactionId(heaprel);
3034
3035 for (int i = 0; i < vstate->npendingpages; i++)
3036 {
3037 BlockNumber target = vstate->pendingpages[i].target;
3038 FullTransactionId safexid = vstate->pendingpages[i].safexid;
3039
3040 /*
3041 * Do the equivalent of checking BTPageIsRecyclable(), but without
3042 * accessing the page again a second time.
3043 *
3044 * Give up on finding the first non-recyclable page -- all later pages
3045 * must be non-recyclable too, since _bt_pendingfsm_add() adds pages
3046 * to the array in safexid order.
3047 */
3048 if (!GlobalVisCheckRemovableFullXid(heaprel, safexid))
3049 break;
3050
3051 RecordFreeIndexPage(rel, target);
3052 stats->pages_free++;
3053 }
3054
3055 pfree(vstate->pendingpages);
3056}
void RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
Definition: indexfsm.c:52
TransactionId GetOldestNonRemovableTransactionId(Relation rel)
Definition: procarray.c:2005
bool GlobalVisCheckRemovableFullXid(Relation rel, FullTransactionId fxid)
Definition: procarray.c:4286
void pg_usleep(long microsec)
Definition: signal.c:53
FullTransactionId safexid
Definition: nbtree.h:328
BlockNumber target
Definition: nbtree.h:327
IndexBulkDeleteResult * stats
Definition: nbtree.h:334
BTPendingFSM * pendingpages
Definition: nbtree.h:345
int npendingpages
Definition: nbtree.h:346
BlockNumber pages_newly_deleted
Definition: genam.h:104
BlockNumber pages_free
Definition: genam.h:106

References Assert(), GetOldestNonRemovableTransactionId(), GlobalVisCheckRemovableFullXid(), IndexVacuumInfo::heaprel, i, BTVacState::info, BTVacState::npendingpages, IndexBulkDeleteResult::pages_free, IndexBulkDeleteResult::pages_newly_deleted, BTVacState::pendingpages, pfree(), pg_usleep(), RecordFreeIndexPage(), BTPendingFSM::safexid, BTVacState::stats, and BTPendingFSM::target.

Referenced by btvacuumscan().
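
Because _bt_pendingfsm_add() appends entries in safexid order, the loop above can stop at the first entry that is still unsafe to recycle. The standalone sketch below models that early-exit pass; PendingPage and xid_is_removable() are invented stand-ins for BTPendingFSM and GlobalVisCheckRemovableFullXid(), and the numbers are made up.

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct { unsigned int target; uint64_t safexid; } PendingPage;

/* Invented stand-in for GlobalVisCheckRemovableFullXid() */
static bool
xid_is_removable(uint64_t safexid, uint64_t oldest_visible)
{
    return safexid < oldest_visible;
}

int
main(void)
{
    PendingPage pages[] = {{10, 100}, {11, 120}, {12, 150}, {13, 400}};
    uint64_t oldest_visible = 200;
    int recorded = 0;

    for (int i = 0; i < 4; i++)
    {
        if (!xid_is_removable(pages[i].safexid, oldest_visible))
            break;              /* later entries have even larger safexid values */
        recorded++;             /* the real code calls RecordFreeIndexPage() here */
    }
    printf("recorded %d of 4 pending pages in the FSM\n", recorded);   /* 3 of 4 */
    return 0;
}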

◆ _bt_pendingfsm_init()

void _bt_pendingfsm_init ( Relation  rel,
BTVacState vstate,
bool  cleanuponly 
)

Definition at line 2954 of file nbtpage.c.

2955{
2956 Size maxbufsize;
2957
2958 /*
2959 * Don't bother with optimization in cleanup-only case -- we don't expect
2960 * any newly deleted pages. Besides, cleanup-only calls to btvacuumscan()
2961 * can only take place because this optimization didn't work out during
2962 * the last VACUUM.
2963 */
2964 if (cleanuponly)
2965 return;
2966
2967 /*
2968 * Cap maximum size of array so that we always respect work_mem. Avoid
2969 * int overflow here.
2970 */
2971 vstate->bufsize = 256;
2972 maxbufsize = (work_mem * (Size) 1024) / sizeof(BTPendingFSM);
2973 maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM));
2974 /* BTVacState.maxbufsize has type int */
2975 maxbufsize = Min(maxbufsize, INT_MAX);
2976 /* Stay sane with small work_mem */
2977 maxbufsize = Max(maxbufsize, vstate->bufsize);
2978 vstate->maxbufsize = (int) maxbufsize;
2979
2980 /* Allocate buffer, indicate that there are currently 0 pending pages */
2981 vstate->pendingpages = palloc(sizeof(BTPendingFSM) * vstate->bufsize);
2982 vstate->npendingpages = 0;
2983}
#define MaxAllocSize
Definition: fe_memutils.h:22
int work_mem
Definition: globals.c:131
struct BTPendingFSM BTPendingFSM
int bufsize
Definition: nbtree.h:343
int maxbufsize
Definition: nbtree.h:344

References BTVacState::bufsize, Max, MaxAllocSize, BTVacState::maxbufsize, Min, BTVacState::npendingpages, palloc(), BTVacState::pendingpages, and work_mem.

Referenced by btvacuumscan().
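
The sizing logic above clamps a work_mem-derived element count between the initial 256-element allocation and INT_MAX. A standalone sketch of the same arithmetic follows; the element size, work_mem value, and MaxAllocSize stand-in are assumptions chosen for illustration.

#include <stdio.h>
#include <limits.h>

int
main(void)
{
    size_t elem_size = 16;          /* assumed sizeof(BTPendingFSM) */
    size_t work_mem_kb = 4096;      /* assume work_mem = 4MB */
    size_t maxalloc = 0x3fffffff;   /* stand-in for MaxAllocSize */
    size_t bufsize = 256;           /* initial allocation, in elements */
    size_t maxbufsize;

    maxbufsize = (work_mem_kb * 1024) / elem_size;   /* respect work_mem */
    if (maxbufsize > maxalloc / elem_size)
        maxbufsize = maxalloc / elem_size;           /* respect the palloc limit */
    if (maxbufsize > INT_MAX)
        maxbufsize = INT_MAX;                        /* the field is a plain int */
    if (maxbufsize < bufsize)
        maxbufsize = bufsize;                        /* stay sane with tiny work_mem */

    printf("bufsize=%zu maxbufsize=%zu\n", bufsize, maxbufsize);   /* 256, 262144 */
    return 0;
}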

◆ _bt_preprocess_keys()

void _bt_preprocess_keys ( IndexScanDesc  scan)

Definition at line 182 of file nbtpreprocesskeys.c.

183{
184 BTScanOpaque so = (BTScanOpaque) scan->opaque;
185 int numberOfKeys = scan->numberOfKeys;
186 int16 *indoption = scan->indexRelation->rd_indoption;
187 int new_numberOfKeys;
188 int numberOfEqualCols;
189 ScanKey inkeys;
190 BTScanKeyPreproc xform[BTMaxStrategyNumber];
191 bool test_result;
192 AttrNumber attno;
193 ScanKey arrayKeyData;
194 int *keyDataMap = NULL;
195 int arrayidx = 0;
196
197 if (so->numberOfKeys > 0)
198 {
199 /*
200 * Only need to do preprocessing once per btrescan, at most. All
201 * calls after the first are handled as no-ops.
202 */
203 return;
204 }
205
206 /* initialize result variables */
207 so->qual_ok = true;
208 so->numberOfKeys = 0;
209
210 if (numberOfKeys < 1)
211 return; /* done if qual-less scan */
212
213 /* If any keys are SK_SEARCHARRAY type, set up array-key info */
214 arrayKeyData = _bt_preprocess_array_keys(scan, &numberOfKeys);
215 if (!so->qual_ok)
216 {
217 /* unmatchable array, so give up */
218 return;
219 }
220
221 /*
222 * Treat arrayKeyData[] (a partially preprocessed copy of scan->keyData[])
223 * as our input if _bt_preprocess_array_keys just allocated it, else just
224 * use scan->keyData[]
225 */
226 if (arrayKeyData)
227 {
228 inkeys = arrayKeyData;
229
230 /* Also maintain keyDataMap for remapping so->orderProcs[] later */
231 keyDataMap = MemoryContextAlloc(so->arrayContext,
232 numberOfKeys * sizeof(int));
233
234 /*
235 * Also enlarge output array when it might otherwise not have room for
236 * a skip array's scan key
237 */
238 if (numberOfKeys > scan->numberOfKeys)
239 so->keyData = repalloc(so->keyData,
240 numberOfKeys * sizeof(ScanKeyData));
241 }
242 else
243 inkeys = scan->keyData;
244
245 /* we check that input keys are correctly ordered */
246 if (inkeys[0].sk_attno < 1)
247 elog(ERROR, "btree index keys must be ordered by attribute");
248
249 /* We can short-circuit most of the work if there's just one key */
250 if (numberOfKeys == 1)
251 {
252 /* Apply indoption to scankey (might change sk_strategy!) */
253 if (!_bt_fix_scankey_strategy(&inkeys[0], indoption))
254 so->qual_ok = false;
255 memcpy(&so->keyData[0], &inkeys[0], sizeof(ScanKeyData));
256 so->numberOfKeys = 1;
257 /* We can mark the qual as required if it's for first index col */
258 if (inkeys[0].sk_attno == 1)
259 _bt_mark_scankey_required(&so->keyData[0]);
260 if (arrayKeyData)
261 {
262 /*
263 * Don't call _bt_preprocess_array_keys_final in this fast path
264 * (we'll miss out on the single value array transformation, but
265 * that's not nearly as important when there's only one scan key)
266 */
269 (so->arrayKeys[0].scan_key == 0 &&
270 !(so->keyData[0].sk_flags & SK_BT_SKIP) &&
271 OidIsValid(so->orderProcs[0].fn_oid)));
272 }
273
274 return;
275 }
276
277 /*
278 * Otherwise, do the full set of pushups.
279 */
280 new_numberOfKeys = 0;
281 numberOfEqualCols = 0;
282
283 /*
284 * Initialize for processing of keys for attr 1.
285 *
286 * xform[i] points to the currently best scan key of strategy type i+1; it
287 * is NULL if we haven't yet found such a key for this attr.
288 */
289 attno = 1;
290 memset(xform, 0, sizeof(xform));
291
292 /*
293 * Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to
294 * handle after-last-key processing. Actual exit from the loop is at the
295 * "break" statement below.
296 */
297 for (int i = 0;; i++)
298 {
299 ScanKey inkey = inkeys + i;
300 int j;
301
302 if (i < numberOfKeys)
303 {
304 /* Apply indoption to scankey (might change sk_strategy!) */
305 if (!_bt_fix_scankey_strategy(inkey, indoption))
306 {
307 /* NULL can't be matched, so give up */
308 so->qual_ok = false;
309 return;
310 }
311 }
312
313 /*
314 * If we are at the end of the keys for a particular attr, finish up
315 * processing and emit the cleaned-up keys.
316 */
317 if (i == numberOfKeys || inkey->sk_attno != attno)
318 {
319 int priorNumberOfEqualCols = numberOfEqualCols;
320
321 /* check input keys are correctly ordered */
322 if (i < numberOfKeys && inkey->sk_attno < attno)
323 elog(ERROR, "btree index keys must be ordered by attribute");
324
325 /*
326 * If = has been specified, all other keys can be eliminated as
327 * redundant. Note that this is no less true if the = key is
328 * SEARCHARRAY; the only real difference is that the inequality
329 * key _becomes_ redundant by making _bt_compare_scankey_args
330 * eliminate the subset of elements that won't need to be matched
331 * (with SAOP arrays and skip arrays alike).
332 *
333 * If we have a case like "key = 1 AND key > 2", we set qual_ok to
334 * false and abandon further processing. We'll do the same thing
335 * given a case like "key IN (0, 1) AND key > 2".
336 *
337 * We also have to deal with the case of "key IS NULL", which is
338 * unsatisfiable in combination with any other index condition. By
339 * the time we get here, that's been classified as an equality
340 * check, and we've rejected any combination of it with a regular
341 * equality condition; but not with other types of conditions.
342 */
343 if (xform[BTEqualStrategyNumber - 1].inkey)
344 {
345 ScanKey eq = xform[BTEqualStrategyNumber - 1].inkey;
346 BTArrayKeyInfo *array = NULL;
347 FmgrInfo *orderproc = NULL;
348
349 if (arrayKeyData && (eq->sk_flags & SK_SEARCHARRAY))
350 {
351 int eq_in_ikey,
352 eq_arrayidx;
353
354 eq_in_ikey = xform[BTEqualStrategyNumber - 1].inkeyi;
355 eq_arrayidx = xform[BTEqualStrategyNumber - 1].arrayidx;
356 array = &so->arrayKeys[eq_arrayidx - 1];
357 orderproc = so->orderProcs + eq_in_ikey;
358
359 Assert(array->scan_key == eq_in_ikey);
360 Assert(OidIsValid(orderproc->fn_oid));
361 }
362
363 for (j = BTMaxStrategyNumber; --j >= 0;)
364 {
365 ScanKey chk = xform[j].inkey;
366
367 if (!chk || j == (BTEqualStrategyNumber - 1))
368 continue;
369
370 if (eq->sk_flags & SK_SEARCHNULL)
371 {
372 /* IS NULL is contradictory to anything else */
373 so->qual_ok = false;
374 return;
375 }
376
377 if (_bt_compare_scankey_args(scan, chk, eq, chk,
378 array, orderproc,
379 &test_result))
380 {
381 if (!test_result)
382 {
383 /* keys proven mutually contradictory */
384 so->qual_ok = false;
385 return;
386 }
387 /* else discard the redundant non-equality key */
388 xform[j].inkey = NULL;
389 xform[j].inkeyi = -1;
390 }
391 /* else, cannot determine redundancy, keep both keys */
392 }
393 /* track number of attrs for which we have "=" keys */
394 numberOfEqualCols++;
395 }
396
397 /* try to keep only one of <, <= */
398 if (xform[BTLessStrategyNumber - 1].inkey &&
399 xform[BTLessEqualStrategyNumber - 1].inkey)
400 {
401 ScanKey lt = xform[BTLessStrategyNumber - 1].inkey;
402 ScanKey le = xform[BTLessEqualStrategyNumber - 1].inkey;
403
404 if (_bt_compare_scankey_args(scan, le, lt, le, NULL, NULL,
405 &test_result))
406 {
407 if (test_result)
408 xform[BTLessEqualStrategyNumber - 1].inkey = NULL;
409 else
410 xform[BTLessStrategyNumber - 1].inkey = NULL;
411 }
412 }
413
414 /* try to keep only one of >, >= */
415 if (xform[BTGreaterStrategyNumber - 1].inkey &&
416 xform[BTGreaterEqualStrategyNumber - 1].inkey)
417 {
418 ScanKey gt = xform[BTGreaterStrategyNumber - 1].inkey;
419 ScanKey ge = xform[BTGreaterEqualStrategyNumber - 1].inkey;
420
421 if (_bt_compare_scankey_args(scan, ge, gt, ge, NULL, NULL,
422 &test_result))
423 {
424 if (test_result)
425 xform[BTGreaterEqualStrategyNumber - 1].inkey = NULL;
426 else
427 xform[BTGreaterStrategyNumber - 1].inkey = NULL;
428 }
429 }
430
431 /*
432 * Emit the cleaned-up keys into the so->keyData[] array, and then
433 * mark them if they are required. They are required (possibly
434 * only in one direction) if all attrs before this one had "=".
435 *
436 * In practice we'll rarely output non-required scan keys here;
437 * typically, _bt_preprocess_array_keys has already added "=" keys
438 * sufficient to form an unbroken series of "=" constraints on all
439 * attrs prior to the attr from the final scan->keyData[] key.
440 */
441 for (j = BTMaxStrategyNumber; --j >= 0;)
442 {
443 if (xform[j].inkey)
444 {
445 ScanKey outkey = &so->keyData[new_numberOfKeys++];
446
447 memcpy(outkey, xform[j].inkey, sizeof(ScanKeyData));
448 if (arrayKeyData)
449 keyDataMap[new_numberOfKeys - 1] = xform[j].inkeyi;
450 if (priorNumberOfEqualCols == attno - 1)
451 _bt_mark_scankey_required(outkey);
452 }
453 }
454
455 /*
456 * Exit loop here if done.
457 */
458 if (i == numberOfKeys)
459 break;
460
461 /* Re-initialize for new attno */
462 attno = inkey->sk_attno;
463 memset(xform, 0, sizeof(xform));
464 }
465
466 /* check strategy this key's operator corresponds to */
467 j = inkey->sk_strategy - 1;
468
469 /* if row comparison, push it directly to the output array */
470 if (inkey->sk_flags & SK_ROW_HEADER)
471 {
472 ScanKey outkey = &so->keyData[new_numberOfKeys++];
473
474 memcpy(outkey, inkey, sizeof(ScanKeyData));
475 if (arrayKeyData)
476 keyDataMap[new_numberOfKeys - 1] = i;
477 if (numberOfEqualCols == attno - 1)
478 _bt_mark_scankey_required(outkey);
479
480 /*
481 * We don't support RowCompare using equality; such a qual would
482 * mess up the numberOfEqualCols tracking.
483 */
484 Assert(j != (BTEqualStrategyNumber - 1));
485 continue;
486 }
487
488 if (inkey->sk_strategy == BTEqualStrategyNumber &&
489 (inkey->sk_flags & SK_SEARCHARRAY))
490 {
491 /* must track how input scan keys map to arrays */
492 Assert(arrayKeyData);
493 arrayidx++;
494 }
495
496 /*
497 * have we seen a scan key for this same attribute and using this same
498 * operator strategy before now?
499 */
500 if (xform[j].inkey == NULL)
501 {
502 /* nope, so this scan key wins by default (at least for now) */
503 xform[j].inkey = inkey;
504 xform[j].inkeyi = i;
505 xform[j].arrayidx = arrayidx;
506 }
507 else
508 {
509 FmgrInfo *orderproc = NULL;
510 BTArrayKeyInfo *array = NULL;
511
512 /*
513 * Seen one of these before, so keep only the more restrictive key
514 * if possible
515 */
516 if (j == (BTEqualStrategyNumber - 1) && arrayKeyData)
517 {
518 /*
519 * Have to set up array keys
520 */
521 if (inkey->sk_flags & SK_SEARCHARRAY)
522 {
523 array = &so->arrayKeys[arrayidx - 1];
524 orderproc = so->orderProcs + i;
525
526 Assert(array->scan_key == i);
527 Assert(OidIsValid(orderproc->fn_oid));
528 Assert(!(inkey->sk_flags & SK_BT_SKIP));
529 }
530 else if (xform[j].inkey->sk_flags & SK_SEARCHARRAY)
531 {
532 array = &so->arrayKeys[xform[j].arrayidx - 1];
533 orderproc = so->orderProcs + xform[j].inkeyi;
534
535 Assert(array->scan_key == xform[j].inkeyi);
536 Assert(OidIsValid(orderproc->fn_oid));
537 Assert(!(xform[j].inkey->sk_flags & SK_BT_SKIP));
538 }
539
540 /*
541 * Both scan keys might have arrays, in which case we'll
542 * arbitrarily pass only one of the arrays. That won't
543 * matter, since _bt_compare_scankey_args is aware that two
544 * SEARCHARRAY scan keys mean that _bt_preprocess_array_keys
545 * failed to eliminate redundant arrays through array merging.
546 * _bt_compare_scankey_args just returns false when it sees
547 * this; it won't even try to examine either array.
548 */
549 }
550
551 if (_bt_compare_scankey_args(scan, inkey, inkey, xform[j].inkey,
552 array, orderproc, &test_result))
553 {
554 /* Have all we need to determine redundancy */
555 if (test_result)
556 {
557 /*
558 * New key is more restrictive, and so replaces old key...
559 */
560 if (j != (BTEqualStrategyNumber - 1) ||
561 !(xform[j].inkey->sk_flags & SK_SEARCHARRAY))
562 {
563 xform[j].inkey = inkey;
564 xform[j].inkeyi = i;
565 xform[j].arrayidx = arrayidx;
566 }
567 else
568 {
569 /*
570 * ...unless we have to keep the old key because it's
571 * an array that rendered the new key redundant. We
572 * need to make sure that we don't throw away an array
573 * scan key. _bt_preprocess_array_keys_final expects
574 * us to keep all of the arrays that weren't already
575 * eliminated by _bt_preprocess_array_keys earlier on.
576 */
577 Assert(!(inkey->sk_flags & SK_SEARCHARRAY));
578 }
579 }
580 else if (j == (BTEqualStrategyNumber - 1))
581 {
582 /* key == a && key == b, but a != b */
583 so->qual_ok = false;
584 return;
585 }
586 /* else old key is more restrictive, keep it */
587 }
588 else
589 {
590 /*
591 * We can't determine which key is more restrictive. Push
592 * xform[j] directly to the output array, then set xform[j] to
593 * the new scan key.
594 *
595 * Note: We do things this way around so that our arrays are
596 * always in the same order as their corresponding scan keys,
597 * even with incomplete opfamilies. _bt_advance_array_keys
598 * depends on this.
599 */
600 ScanKey outkey = &so->keyData[new_numberOfKeys++];
601
602 memcpy(outkey, xform[j].inkey, sizeof(ScanKeyData));
603 if (arrayKeyData)
604 keyDataMap[new_numberOfKeys - 1] = xform[j].inkeyi;
605 if (numberOfEqualCols == attno - 1)
606 _bt_mark_scankey_required(outkey);
607 xform[j].inkey = inkey;
608 xform[j].inkeyi = i;
609 xform[j].arrayidx = arrayidx;
610 }
611 }
612 }
613
614 so->numberOfKeys = new_numberOfKeys;
615
616 /*
617 * Now that we've built a temporary mapping from so->keyData[] (output
618 * scan keys) to arrayKeyData[] (our input scan keys), fix array->scan_key
619 * references. Also consolidate the so->orderProcs[] array such that it
620 * can be subscripted using so->keyData[]-wise offsets.
621 */
622 if (arrayKeyData)
623 _bt_preprocess_array_keys_final(scan, keyDataMap);
624
625 /* Could pfree arrayKeyData/keyDataMap now, but not worth the cycles */
626}
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1544
static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
static void _bt_mark_scankey_required(ScanKey skey)
static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys)
static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap)
static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, ScanKey leftarg, ScanKey rightarg, BTArrayKeyInfo *array, FmgrInfo *orderproc, bool *result)
#define SK_SEARCHNULL
Definition: skey.h:121
#define BTMaxStrategyNumber
Definition: stratnum.h:35
FmgrInfo * orderProcs
Definition: nbtree.h:1067
MemoryContext arrayContext
Definition: nbtree.h:1068
Oid fn_oid
Definition: fmgr.h:59
struct ScanKeyData * keyData
Definition: relscan.h:141

References _bt_compare_scankey_args(), _bt_fix_scankey_strategy(), _bt_mark_scankey_required(), _bt_preprocess_array_keys(), _bt_preprocess_array_keys_final(), BTScanOpaqueData::arrayContext, BTScanOpaqueData::arrayKeys, Assert(), BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTMaxStrategyNumber, elog, ERROR, FmgrInfo::fn_oid, i, if(), IndexScanDescData::indexRelation, j, BTScanOpaqueData::keyData, IndexScanDescData::keyData, MemoryContextAlloc(), BTScanOpaqueData::numberOfKeys, IndexScanDescData::numberOfKeys, OidIsValid, IndexScanDescData::opaque, BTScanOpaqueData::orderProcs, BTScanOpaqueData::qual_ok, RelationData::rd_indoption, repalloc(), BTArrayKeyInfo::scan_key, ScanKeyData::sk_attno, SK_BT_SKIP, ScanKeyData::sk_flags, SK_ROW_HEADER, SK_SEARCHARRAY, SK_SEARCHNULL, and ScanKeyData::sk_strategy.

Referenced by _bt_first().
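
Most of the work above keeps only the most restrictive key per (attribute, strategy) pair and flags contradictions. The toy below models just the single-attribute integer case for a qual like "x > 2 AND x > 5 AND x <= 10 AND x < 8"; it ignores the <= versus < distinction, arrays, NULL tests, and cross-type operators, so it illustrates the idea rather than the real routine.

#include <stdio.h>
#include <stdbool.h>
#include <limits.h>

int
main(void)
{
    int lowers[] = {2, 5};          /* ">" keys on attribute x */
    int uppers[] = {10, 8};         /* "<"/"<=" keys on attribute x */
    int lower = INT_MIN;
    int upper = INT_MAX;
    bool qual_ok = true;

    for (int i = 0; i < 2; i++)
        if (lowers[i] > lower)
            lower = lowers[i];      /* keep the most restrictive lower bound */
    for (int i = 0; i < 2; i++)
        if (uppers[i] < upper)
            upper = uppers[i];      /* keep the most restrictive upper bound */

    if (lower >= upper)
        qual_ok = false;            /* contradictory quals: the scan can return nothing */

    printf("effective qual: x > %d AND x < %d (qual_ok=%d)\n", lower, upper, qual_ok);
    return 0;
}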

◆ _bt_relandgetbuf()

Buffer _bt_relandgetbuf ( Relation  rel,
Buffer  obuf,
BlockNumber  blkno,
int  access 
)

Definition at line 1003 of file nbtpage.c.

1004{
1005 Buffer buf;
1006
1007 Assert(BlockNumberIsValid(blkno));
1008 if (BufferIsValid(obuf))
1009 _bt_unlockbuf(rel, obuf);
1010 buf = ReleaseAndReadBuffer(obuf, rel, blkno);
1011 _bt_lockbuf(rel, buf, access);
1012
1013 _bt_checkpage(rel, buf);
1014 return buf;
1015}
Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum)
Definition: bufmgr.c:3007

References _bt_checkpage(), _bt_lockbuf(), _bt_unlockbuf(), Assert(), BlockNumberIsValid(), buf, BufferIsValid(), and ReleaseAndReadBuffer().

Referenced by _bt_check_unique(), _bt_get_endpoint(), _bt_getroot(), _bt_gettrueroot(), _bt_lock_and_validate_left(), _bt_moveright(), _bt_search(), and _bt_stepright().

◆ _bt_relbuf()

◆ _bt_scanbehind_checkkeys()

bool _bt_scanbehind_checkkeys ( IndexScanDesc  scan,
ScanDirection  dir,
IndexTuple  finaltup 
)

Definition at line 2389 of file nbtutils.c.

2391{
2392 Relation rel = scan->indexRelation;
2393 TupleDesc tupdesc = RelationGetDescr(rel);
2394 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2395 int nfinaltupatts = BTreeTupleGetNAtts(finaltup, rel);
2396
2397 Assert(so->numArrayKeys);
2398
2399 if (_bt_tuple_before_array_skeys(scan, dir, finaltup, tupdesc,
2400 nfinaltupatts, false, 0, NULL))
2401 return false;
2402
2403 if (!so->oppositeDirCheck)
2404 return true;
2405
2406 return _bt_oppodir_checkkeys(scan, dir, finaltup);
2407}
static bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, IndexTuple finaltup)
Definition: nbtutils.c:2428

References _bt_oppodir_checkkeys(), _bt_tuple_before_array_skeys(), Assert(), BTreeTupleGetNAtts, IndexScanDescData::indexRelation, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, and RelationGetDescr.

Referenced by _bt_readpage().

◆ _bt_search()

BTStack _bt_search ( Relation  rel,
Relation  heaprel,
BTScanInsert  key,
Buffer bufP,
int  access 
)

Definition at line 102 of file nbtsearch.c.

104{
105 BTStack stack_in = NULL;
106 int page_access = BT_READ;
107
108 /* heaprel must be set whenever _bt_allocbuf is reachable */
109 Assert(access == BT_READ || access == BT_WRITE);
110 Assert(access == BT_READ || heaprel != NULL);
111
112 /* Get the root page to start with */
113 *bufP = _bt_getroot(rel, heaprel, access);
114
115 /* If index is empty and access = BT_READ, no root page is created. */
116 if (!BufferIsValid(*bufP))
117 return (BTStack) NULL;
118
119 /* Loop iterates once per level descended in the tree */
120 for (;;)
121 {
122 Page page;
123 BTPageOpaque opaque;
124 OffsetNumber offnum;
125 ItemId itemid;
126 IndexTuple itup;
127 BlockNumber child;
128 BTStack new_stack;
129
130 /*
131 * Race -- the page we just grabbed may have split since we read its
132 * downlink in its parent page (or the metapage). If it has, we may
133 * need to move right to its new sibling. Do that.
134 *
135 * In write-mode, allow _bt_moveright to finish any incomplete splits
136 * along the way. Strictly speaking, we'd only need to finish an
137 * incomplete split on the leaf page we're about to insert to, not on
138 * any of the upper levels (internal pages with incomplete splits are
139 * also taken care of in _bt_getstackbuf). But this is a good
140 * opportunity to finish splits of internal pages too.
141 */
142 *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
143 stack_in, page_access);
144
145 /* if this is a leaf page, we're done */
146 page = BufferGetPage(*bufP);
147 opaque = BTPageGetOpaque(page);
148 if (P_ISLEAF(opaque))
149 break;
150
151 /*
152 * Find the appropriate pivot tuple on this page. Its downlink points
153 * to the child page that we're about to descend to.
154 */
155 offnum = _bt_binsrch(rel, key, *bufP);
156 itemid = PageGetItemId(page, offnum);
157 itup = (IndexTuple) PageGetItem(page, itemid);
158 Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
159 child = BTreeTupleGetDownLink(itup);
160
161 /*
162 * We need to save the location of the pivot tuple we chose in a new
163 * stack entry for this page/level. If caller ends up splitting a
164 * page one level down, it usually ends up inserting a new pivot
165 * tuple/downlink immediately after the location recorded here.
166 */
167 new_stack = (BTStack) palloc(sizeof(BTStackData));
168 new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
169 new_stack->bts_offset = offnum;
170 new_stack->bts_parent = stack_in;
171
172 /*
173 * Page level 1 is the lowest non-leaf level, just above the leaves. So,
174 * if we're on level 1 and were asked to lock the leaf page in write
175 * mode, lock the next page in write mode, because it must be a leaf.
176 */
177 if (opaque->btpo_level == 1 && access == BT_WRITE)
178 page_access = BT_WRITE;
179
180 /* drop the read lock on the page, then acquire one on its child */
181 *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
182
183 /* okay, all set to move down a level */
184 stack_in = new_stack;
185 }
186
187 /*
188 * If we're asked to lock leaf in write mode, but didn't manage to, then
189 * relock. This should only happen when the root page is a leaf page (and
190 * the only page in the index other than the metapage).
191 */
192 if (access == BT_WRITE && page_access == BT_READ)
193 {
194 /* trade in our read lock for a write lock */
195 _bt_unlockbuf(rel, *bufP);
196 _bt_lockbuf(rel, *bufP, BT_WRITE);
197
198 /*
199 * Race -- the leaf page may have split after we dropped the read lock
200 * but before we acquired a write lock. If it has, we may need to
201 * move right to its new sibling. Do that.
202 */
203 *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
204 }
205
206 return stack_in;
207}
BTStackData * BTStack
Definition: nbtree.h:750
static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access)
Definition: nbtsearch.c:241

References _bt_binsrch(), _bt_getroot(), _bt_lockbuf(), _bt_moveright(), _bt_relandgetbuf(), _bt_unlockbuf(), Assert(), BT_READ, BT_WRITE, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTreeTupleGetDownLink(), BTreeTupleIsPivot(), BTStackData::bts_blkno, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), sort-test::key, P_ISLEAF, PageGetItem(), PageGetItemId(), and palloc().

Referenced by _bt_first(), _bt_pagedel(), _bt_search_insert(), and bt_rootdescend().
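
The descent loop above repeats one step per level: find the pivot that bounds the search key, follow its downlink, and remember the parent position for a possible split later. The standalone sketch below descends a hard-coded two-level tree, with a linear scan standing in for _bt_binsrch(); the layout and numbers are invented for illustration.

#include <stdio.h>

typedef struct { int sep; int child; } Downlink;

int
main(void)
{
    /* A root page and three level-1 pages; sep == -1 plays the role of the
     * "minus infinity" first downlink on every internal page. */
    Downlink root[3] = {{-1, 0}, {100, 1}, {200, 2}};
    Downlink level1[3][3] = {
        {{-1, 10}, {30, 11}, {60, 12}},
        {{-1, 13}, {130, 14}, {160, 15}},
        {{-1, 16}, {230, 17}, {260, 18}},
    };
    int key = 140;
    int slot, page, leaf;

    /* descend from the root: choose the last pivot <= key */
    slot = 0;
    for (int i = 1; i < 3; i++)
        if (root[i].sep <= key)
            slot = i;
    page = root[slot].child;        /* a real caller would push a stack entry here */

    /* descend the chosen level-1 page the same way to reach the leaf */
    slot = 0;
    for (int i = 1; i < 3; i++)
        if (level1[page][i].sep <= key)
            slot = i;
    leaf = level1[page][slot].child;

    printf("key %d descends to leaf block %d\n", key, leaf);   /* leaf block 14 */
    return 0;
}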

◆ _bt_set_cleanup_info()

void _bt_set_cleanup_info ( Relation  rel,
BlockNumber  num_delpages 
)

Definition at line 232 of file nbtpage.c.

233{
234 Buffer metabuf;
235 Page metapg;
236 BTMetaPageData *metad;
237
238 /*
239 * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
240 * field started out as a TransactionId field called btm_oldest_btpo_xact.
241 * Both "versions" are just uint32 fields. It was convenient to repurpose
242 * the field when we began to use 64-bit XIDs in deleted pages.
243 *
244 * It's possible that a pg_upgrade'd database will contain an XID value in
245 * what is now recognized as the metapage's btm_last_cleanup_num_delpages
246 * field. _bt_vacuum_needs_cleanup() may even believe that this value
247 * indicates that there are lots of pages that it needs to recycle, when
248 * in reality there are only one or two. The worst that can happen is
249 * that there will be a call to btvacuumscan a little earlier, which will
250 * set btm_last_cleanup_num_delpages to a sane value when we're called.
251 *
252 * Note also that the metapage's btm_last_cleanup_num_heap_tuples field is
253 * no longer used as of PostgreSQL 14. We set it to -1.0 on rewrite, just
254 * to be consistent.
255 */
256 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
257 metapg = BufferGetPage(metabuf);
258 metad = BTPageGetMeta(metapg);
259
260 /* Don't miss chance to upgrade index/metapage when BTREE_MIN_VERSION */
261 if (metad->btm_version >= BTREE_NOVAC_VERSION &&
262 metad->btm_last_cleanup_num_delpages == num_delpages)
263 {
264 /* Usually means index continues to have num_delpages of 0 */
265 _bt_relbuf(rel, metabuf);
266 return;
267 }
268
269 /* trade in our read lock for a write lock */
270 _bt_unlockbuf(rel, metabuf);
271 _bt_lockbuf(rel, metabuf, BT_WRITE);
272
273 START_CRIT_SECTION();
274
275 /* upgrade meta-page if needed */
276 if (metad->btm_version < BTREE_NOVAC_VERSION)
277 _bt_upgrademetapage(metapg);
278
279 /* update cleanup-related information */
280 metad->btm_last_cleanup_num_delpages = num_delpages;
281 metad->btm_last_cleanup_num_heap_tuples = -1.0;
282 MarkBufferDirty(metabuf);
283
284 /* write wal record if needed */
285 if (RelationNeedsWAL(rel))
286 {
287 xl_btree_metadata md;
288 XLogRecPtr recptr;
289
290 XLogBeginInsert();
291 XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
292
293 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
294 md.version = metad->btm_version;
295 md.root = metad->btm_root;
296 md.level = metad->btm_level;
297 md.fastroot = metad->btm_fastroot;
298 md.fastlevel = metad->btm_fastlevel;
299 md.last_cleanup_num_delpages = num_delpages;
300 md.allequalimage = metad->btm_allequalimage;
301
302 XLogRegisterBufData(0, &md, sizeof(xl_btree_metadata));
303
304 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
305
306 PageSetLSN(metapg, recptr);
307 }
308
309 END_CRIT_SECTION();
310
311 _bt_relbuf(rel, metabuf);
312}
#define XLOG_BTREE_META_CLEANUP
Definition: nbtxlog.h:41

References _bt_getbuf(), _bt_lockbuf(), _bt_relbuf(), _bt_unlockbuf(), _bt_upgrademetapage(), xl_btree_metadata::allequalimage, Assert(), BT_READ, BT_WRITE, BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_level, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BufferGetPage(), END_CRIT_SECTION, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, xl_btree_metadata::last_cleanup_num_delpages, xl_btree_metadata::level, MarkBufferDirty(), PageSetLSN(), REGBUF_STANDARD, REGBUF_WILL_INIT, RelationNeedsWAL, xl_btree_metadata::root, START_CRIT_SECTION, xl_btree_metadata::version, XLOG_BTREE_META_CLEANUP, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), and XLogRegisterBuffer().

Referenced by btvacuumcleanup().

◆ _bt_set_startikey()

void _bt_set_startikey ( IndexScanDesc  scan,
BTReadPageState pstate 
)

Definition at line 2485 of file nbtutils.c.

2486{
2487 BTScanOpaque so = (BTScanOpaque) scan->opaque;
2488 Relation rel = scan->indexRelation;
2489 TupleDesc tupdesc = RelationGetDescr(rel);
2490 ItemId iid;
2491 IndexTuple firsttup,
2492 lasttup;
2493 int startikey = 0,
2494 arrayidx = 0,
2495 firstchangingattnum;
2496 bool start_past_saop_eq = false;
2497
2498 Assert(!so->scanBehind);
2499 Assert(pstate->minoff < pstate->maxoff);
2500 Assert(!pstate->firstpage);
2501 Assert(pstate->startikey == 0);
2502 Assert(!so->numArrayKeys || pstate->finaltup ||
2503 P_RIGHTMOST(BTPageGetOpaque(pstate->page)) ||
2504 P_LEFTMOST(BTPageGetOpaque(pstate->page)));
2505
2506 if (so->numberOfKeys == 0)
2507 return;
2508
2509 /* minoff is an offset to the lowest non-pivot tuple on the page */
2510 iid = PageGetItemId(pstate->page, pstate->minoff);
2511 firsttup = (IndexTuple) PageGetItem(pstate->page, iid);
2512
2513 /* maxoff is an offset to the highest non-pivot tuple on the page */
2514 iid = PageGetItemId(pstate->page, pstate->maxoff);
2515 lasttup = (IndexTuple) PageGetItem(pstate->page, iid);
2516
2517 /* Determine the first attribute whose values change on caller's page */
2518 firstchangingattnum = _bt_keep_natts_fast(rel, firsttup, lasttup);
2519
2520 for (; startikey < so->numberOfKeys; startikey++)
2521 {
2522 ScanKey key = so->keyData + startikey;
2523 BTArrayKeyInfo *array;
2524 Datum firstdatum,
2525 lastdatum;
2526 bool firstnull,
2527 lastnull;
2528 int32 result;
2529
2530 /*
2531 * Determine if it's safe to set pstate.startikey to an offset to a
2532 * key that comes after this key, by examining this key
2533 */
2534 if (!(key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
2535 {
2536 /* Scan key isn't marked required (corner case) */
2537 Assert(!(key->sk_flags & SK_ROW_HEADER));
2538 break; /* unsafe */
2539 }
2540 if (key->sk_flags & SK_ROW_HEADER)
2541 {
2542 /*
2543 * Can't let pstate.startikey get set to an ikey beyond a
2544 * RowCompare inequality
2545 */
2546 break; /* unsafe */
2547 }
2548 if (key->sk_strategy != BTEqualStrategyNumber)
2549 {
2550 /*
2551 * Scalar inequality key.
2552 *
2553 * It's definitely safe for _bt_checkkeys to avoid assessing this
2554 * inequality when the page's first and last non-pivot tuples both
2555 * satisfy the inequality (since the same must also be true of all
2556 * the tuples in between these two).
2557 *
2558 * Unlike the "=" case, it doesn't matter if this attribute has
2559 * more than one distinct value (though it _is_ necessary for any
2560 * and all _prior_ attributes to contain no more than one distinct
2561 * value amongst all of the tuples from pstate.page).
2562 */
2563 if (key->sk_attno > firstchangingattnum) /* >, not >= */
2564 break; /* unsafe, preceding attr has multiple
2565 * distinct values */
2566
2567 firstdatum = index_getattr(firsttup, key->sk_attno, tupdesc, &firstnull);
2568 lastdatum = index_getattr(lasttup, key->sk_attno, tupdesc, &lastnull);
2569
2570 if (key->sk_flags & SK_ISNULL)
2571 {
2572 /* IS NOT NULL key */
2573 Assert(key->sk_flags & SK_SEARCHNOTNULL);
2574
2575 if (firstnull || lastnull)
2576 break; /* unsafe */
2577
2578 /* Safe, IS NOT NULL key satisfied by every tuple */
2579 continue;
2580 }
2581
2582 /* Test firsttup */
2583 if (firstnull ||
2584 !DatumGetBool(FunctionCall2Coll(&key->sk_func,
2585 key->sk_collation, firstdatum,
2586 key->sk_argument)))
2587 break; /* unsafe */
2588
2589 /* Test lasttup */
2590 if (lastnull ||
2591 !DatumGetBool(FunctionCall2Coll(&key->sk_func,
2592 key->sk_collation, lastdatum,
2593 key->sk_argument)))
2594 break; /* unsafe */
2595
2596 /* Safe, scalar inequality satisfied by every tuple */
2597 continue;
2598 }
2599
2600 /* Some = key (could be a scalar = key, could be an array = key) */
2601 Assert(key->sk_strategy == BTEqualStrategyNumber);
2602
2603 if (!(key->sk_flags & SK_SEARCHARRAY))
2604 {
2605 /*
2606 * Scalar = key (possibly an IS NULL key).
2607 *
2608 * It is unsafe to set pstate.startikey to an ikey beyond this
2609 * key, unless the = key is satisfied by every possible tuple on
2610 * the page (possible only when attribute has just one distinct
2611 * value among all tuples on the page).
2612 */
2613 if (key->sk_attno >= firstchangingattnum)
2614 break; /* unsafe, multiple distinct attr values */
2615
2616 firstdatum = index_getattr(firsttup, key->sk_attno, tupdesc,
2617 &firstnull);
2618 if (key->sk_flags & SK_ISNULL)
2619 {
2620 /* IS NULL key */
2621 Assert(key->sk_flags & SK_SEARCHNULL);
2622
2623 if (!firstnull)
2624 break; /* unsafe */
2625
2626 /* Safe, IS NULL key satisfied by every tuple */
2627 continue;
2628 }
2629 if (firstnull ||
2630 !DatumGetBool(FunctionCall2Coll(&key->sk_func,
2631 key->sk_collation, firstdatum,
2632 key->sk_argument)))
2633 break; /* unsafe */
2634
2635 /* Safe, scalar = key satisfied by every tuple */
2636 continue;
2637 }
2638
2639 /* = array key (could be a SAOP array, could be a skip array) */
2640 array = &so->arrayKeys[arrayidx++];
2641 Assert(array->scan_key == startikey);
2642 if (array->num_elems != -1)
2643 {
2644 /*
2645 * SAOP array = key.
2646 *
2647 * Handle this like we handle scalar = keys (though binary search
2648 * for a matching element, to avoid relying on key's sk_argument).
2649 */
2650 if (key->sk_attno >= firstchangingattnum)
2651 break; /* unsafe, multiple distinct attr values */
2652
2653 firstdatum = index_getattr(firsttup, key->sk_attno, tupdesc,
2654 &firstnull);
2655 _bt_binsrch_array_skey(&so->orderProcs[startikey],
2656 false, NoMovementScanDirection,
2657 firstdatum, firstnull, array, key,
2658 &result);
2659 if (result != 0)
2660 break; /* unsafe */
2661
2662 /* Safe, SAOP = key satisfied by every tuple */
2663 start_past_saop_eq = true;
2664 continue;
2665 }
2666
2667 /*
2668 * Skip array = key
2669 */
2670 Assert(key->sk_flags & SK_BT_SKIP);
2671 if (array->null_elem)
2672 {
2673 /*
2674 * Non-range skip array = key.
2675 *
2676 * Safe, non-range skip array "satisfied" by every tuple on page
2677 * (safe even when "key->sk_attno > firstchangingattnum").
2678 */
2679 continue;
2680 }
2681
2682 /*
2683 * Range skip array = key.
2684 *
2685 * Handle this like we handle scalar inequality keys (but avoid using
2686 * key's sk_argument directly, as in the SAOP array case).
2687 */
2688 if (key->sk_attno > firstchangingattnum) /* >, not >= */
2689 break; /* unsafe, preceding attr has multiple
2690 * distinct values */
2691
2692 firstdatum = index_getattr(firsttup, key->sk_attno, tupdesc, &firstnull);
2693 lastdatum = index_getattr(lasttup, key->sk_attno, tupdesc, &lastnull);
2694
2695 /* Test firsttup */
2696 _bt_binsrch_skiparray_skey(false, ForwardScanDirection,
2697 firstdatum, firstnull, array, key,
2698 &result);
2699 if (result != 0)
2700 break; /* unsafe */
2701
2702 /* Test lasttup */
2703 _bt_binsrch_skiparray_skey(false, ForwardScanDirection,
2704 lastdatum, lastnull, array, key,
2705 &result);
2706 if (result != 0)
2707 break; /* unsafe */
2708
2709 /* Safe, range skip array satisfied by every tuple on page */
2710 }
2711
2712 /*
2713 * Use of forcenonrequired is typically undesirable, since it'll force
2714 * _bt_readpage caller to read every tuple on the page -- even though, in
2715 * general, it might well be possible to end the scan on an earlier tuple.
2716 * However, caller must use forcenonrequired when start_past_saop_eq=true,
2717 * since the usual required array behavior might fail to roll over to the
2718 * SAOP array.
2719 *
2720 * We always prefer forcenonrequired=true during scans with skip arrays
2721 * (except on the first page of each primitive index scan), though -- even
2722 * when "startikey == 0". That way, _bt_advance_array_keys's low-order
2723 * key precheck optimization can always be used (unless on the first page
2724 * of the scan). It seems slightly preferable to check more tuples when
2725 * that allows us to do significantly less skip array maintenance.
2726 */
2727 pstate->forcenonrequired = (start_past_saop_eq || so->skipScan);
2728 pstate->startikey = startikey;
2729
2730 /*
2731 * _bt_readpage caller is required to call _bt_checkkeys against page's
2732 * finaltup with forcenonrequired=false whenever we initially set
2733 * forcenonrequired=true. That way the scan's arrays will reliably track
2734 * its progress through the index's key space.
2735 *
2736 * We don't expect this when _bt_readpage caller has no finaltup due to
2737 * its page being the rightmost (or the leftmost, during backwards scans).
2738 * When we see that _bt_readpage has no finaltup, back out of everything.
2739 */
2740 Assert(!pstate->forcenonrequired || so->numArrayKeys);
2741 if (pstate->forcenonrequired && !pstate->finaltup)
2742 {
2743 pstate->forcenonrequired = false;
2744 pstate->startikey = 0;
2745 }
2746}
#define SK_BT_REQBKWD
Definition: nbtree.h:1135
static void _bt_binsrch_skiparray_skey(bool cur_elem_trig, ScanDirection dir, Datum tupdatum, bool tupnull, BTArrayKeyInfo *array, ScanKey cur, int32 *set_elem_result)
Definition: nbtutils.c:443
int _bt_binsrch_array_skey(FmgrInfo *orderproc, bool cur_elem_trig, ScanDirection dir, Datum tupdatum, bool tupnull, BTArrayKeyInfo *array, ScanKey cur, int32 *set_elem_result)
Definition: nbtutils.c:287
@ NoMovementScanDirection
Definition: sdir.h:27
@ ForwardScanDirection
Definition: sdir.h:28
bool firstpage
Definition: nbtree.h:1108
IndexTuple finaltup
Definition: nbtree.h:1106
OffsetNumber minoff
Definition: nbtree.h:1104
OffsetNumber maxoff
Definition: nbtree.h:1105

References _bt_binsrch_array_skey(), _bt_binsrch_skiparray_skey(), _bt_keep_natts_fast(), BTScanOpaqueData::arrayKeys, Assert(), BTEqualStrategyNumber, BTPageGetOpaque, DatumGetBool(), BTReadPageState::finaltup, BTReadPageState::firstpage, BTReadPageState::forcenonrequired, ForwardScanDirection, FunctionCall2Coll(), index_getattr(), IndexScanDescData::indexRelation, sort-test::key, BTScanOpaqueData::keyData, BTReadPageState::maxoff, BTReadPageState::minoff, NoMovementScanDirection, BTArrayKeyInfo::null_elem, BTArrayKeyInfo::num_elems, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numberOfKeys, IndexScanDescData::opaque, BTScanOpaqueData::orderProcs, P_LEFTMOST, P_RIGHTMOST, BTReadPageState::page, PageGetItem(), PageGetItemId(), RelationGetDescr, BTArrayKeyInfo::scan_key, BTScanOpaqueData::scanBehind, SK_BT_REQBKWD, SK_BT_REQFWD, SK_BT_SKIP, SK_ISNULL, SK_ROW_HEADER, SK_SEARCHARRAY, SK_SEARCHNOTNULL, SK_SEARCHNULL, BTScanOpaqueData::skipScan, and BTReadPageState::startikey.

Referenced by _bt_readpage().
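
The safety rules above differ by key type: a required "=" key can only be skipped when its attribute, and every attribute before it, has a single distinct value on the page, while an inequality only needs the preceding attributes to be constant and must then still pass against the page's first and last tuples. A small standalone sketch of those two tests; the helper names are invented.

#include <stdio.h>
#include <stdbool.h>

/* "=" keys need the attribute itself to be constant across the page */
static bool
equality_key_skippable(int attno, int firstchangingattnum)
{
    return attno < firstchangingattnum;
}

/* Inequalities only need all prior attributes to be constant; the key must
 * still be checked against the page's first and last non-pivot tuples. */
static bool
inequality_key_candidate(int attno, int firstchangingattnum)
{
    return attno <= firstchangingattnum;
}

int
main(void)
{
    int firstchangingattnum = 2;    /* attribute 2 is the first that varies */

    printf("= on attr 1 skippable? %d\n", equality_key_skippable(1, firstchangingattnum));
    printf("= on attr 2 skippable? %d\n", equality_key_skippable(2, firstchangingattnum));
    printf("> on attr 2 worth testing? %d\n", inequality_key_candidate(2, firstchangingattnum));
    return 0;
}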

◆ _bt_start_array_keys()

void _bt_start_array_keys ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 611 of file nbtutils.c.

612{
613 Relation rel = scan->indexRelation;
614 BTScanOpaque so = (BTScanOpaque) scan->opaque;
615
616 Assert(so->numArrayKeys);
617 Assert(so->qual_ok);
618
619 for (int i = 0; i < so->numArrayKeys; i++)
620 {
621 BTArrayKeyInfo *array = &so->arrayKeys[i];
622 ScanKey skey = &so->keyData[array->scan_key];
623
624 Assert(skey->sk_flags & SK_SEARCHARRAY);
625
626 _bt_array_set_low_or_high(rel, skey, array,
627 ScanDirectionIsForward(dir));
628 }
629 so->scanBehind = so->oppositeDirCheck = false; /* reset */
630}
static void _bt_array_set_low_or_high(Relation rel, ScanKey skey, BTArrayKeyInfo *array, bool low_not_high)
Definition: nbtutils.c:639

References _bt_array_set_low_or_high(), BTScanOpaqueData::arrayKeys, Assert(), i, IndexScanDescData::indexRelation, BTScanOpaqueData::keyData, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, BTScanOpaqueData::qual_ok, BTArrayKeyInfo::scan_key, BTScanOpaqueData::scanBehind, ScanDirectionIsForward, ScanKeyData::sk_flags, and SK_SEARCHARRAY.

Referenced by _bt_advance_array_keys_increment(), _bt_first(), and btrestrpos().

◆ _bt_start_prim_scan()

bool _bt_start_prim_scan ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 1339 of file nbtutils.c.

1340{
1341 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1342
1343 Assert(so->numArrayKeys);
1344
1345 so->scanBehind = so->oppositeDirCheck = false; /* reset */
1346
1347 /*
1348 * Array keys are advanced within _bt_checkkeys when the scan reaches the
1349 * leaf level (more precisely, they're advanced when the scan reaches the
1350 * end of each distinct set of array elements). This process avoids
1351 * repeat access to leaf pages (across multiple primitive index scans) by
1352 * advancing the scan's array keys when it allows the primitive index scan
1353 * to find nearby matching tuples (or when it eliminates ranges of array
1354 * key space that can't possibly be satisfied by any index tuple).
1355 *
1356 * _bt_checkkeys sets a simple flag variable to schedule another primitive
1357 * index scan. The flag tells us what to do.
1358 *
1359 * We cannot rely on _bt_first always reaching _bt_checkkeys. There are
1360 * various cases where that won't happen. For example, if the index is
1361 * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys.
1362 * We also don't expect a call to _bt_checkkeys during searches for a
1363 * non-existent value that happens to be lower/higher than any existing
1364 * value in the index.
1365 *
1366 * We don't require special handling for these cases -- we don't need to
1367 * be explicitly instructed to _not_ perform another primitive index scan.
1368 * It's up to code under the control of _bt_first to always set the flag
1369 * when another primitive index scan will be required.
1370 *
1371 * This works correctly, even with the tricky cases listed above, which
1372 * all involve access to leaf pages "near the boundaries of the key space"
1373 * (whether it's from a leftmost/rightmost page, or an imaginary empty
1374 * leaf root page). If _bt_checkkeys cannot be reached by a primitive
1375 * index scan for one set of array keys, then it also won't be reached for
1376 * any later set ("later" in terms of the direction that we scan the index
1377 * and advance the arrays). The array keys won't have advanced in these
1378 * cases, but that's the correct behavior (even _bt_advance_array_keys
1379 * won't always advance the arrays at the point they become "exhausted").
1380 */
1381 if (so->needPrimScan)
1382 {
1383 Assert(_bt_verify_arrays_bt_first(scan, dir));
1384
1385 /*
1386 * Flag was set -- must call _bt_first again, which will reset the
1387 * scan's needPrimScan flag
1388 */
1389 return true;
1390 }
1391
1392 /* The top-level index scan ran out of tuples in this scan direction */
1393 if (scan->parallel_scan != NULL)
1394 _bt_parallel_done(scan);
1395
1396 return false;
1397}

References _bt_parallel_done(), Assert(), BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, IndexScanDescData::parallel_scan, and BTScanOpaqueData::scanBehind.

◆ _bt_start_vacuum()

BTCycleId _bt_start_vacuum ( Relation  rel)

Definition at line 3561 of file nbtutils.c.

3562{
3563 BTCycleId result;
3564 int i;
3565 BTOneVacInfo *vac;
3566
3567 LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
3568
3569 /*
3570 * Assign the next cycle ID, being careful to avoid zero as well as the
3571 * reserved high values.
3572 */
3573 result = ++(btvacinfo->cycle_ctr);
3574 if (result == 0 || result > MAX_BT_CYCLE_ID)
3575 result = btvacinfo->cycle_ctr = 1;
3576
3577 /* Let's just make sure there's no entry already for this index */
3578 for (i = 0; i < btvacinfo->num_vacuums; i++)
3579 {
3580 vac = &btvacinfo->vacuums[i];
3581 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
3582 vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
3583 {
3584 /*
3585 * Unlike most places in the backend, we have to explicitly
3586 * release our LWLock before throwing an error. This is because
3587 * we expect _bt_end_vacuum() to be called before transaction
3588 * abort cleanup can run to release LWLocks.
3589 */
3590 LWLockRelease(BtreeVacuumLock);
3591 elog(ERROR, "multiple active vacuums for index \"%s\"",
3592 RelationGetRelationName(rel));
3593 }
3594 }
3595
3596 /* OK, add an entry */
3597 if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
3598 {
3599 LWLockRelease(BtreeVacuumLock);
3600 elog(ERROR, "out of btvacinfo slots");
3601 }
3602 vac = &btvacinfo->vacuums[btvacinfo->num_vacuums];
3603 vac->relid = rel->rd_lockInfo.lockRelId;
3604 vac->cycleid = result;
3605 btvacinfo->num_vacuums++;
3606
3607 LWLockRelease(BtreeVacuumLock);
3608 return result;
3609}
#define MAX_BT_CYCLE_ID
Definition: nbtree.h:94
uint16 BTCycleId
Definition: nbtree.h:30
BTCycleId cycleid
Definition: nbtutils.c:3503
BTCycleId cycle_ctr
Definition: nbtutils.c:3508
int max_vacuums
Definition: nbtutils.c:3510

References btvacinfo, BTVacInfo::cycle_ctr, BTOneVacInfo::cycleid, LockRelId::dbId, elog, ERROR, i, LockInfoData::lockRelId, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), MAX_BT_CYCLE_ID, BTVacInfo::max_vacuums, BTVacInfo::num_vacuums, RelationData::rd_lockInfo, RelationGetRelationName, BTOneVacInfo::relid, LockRelId::relId, and BTVacInfo::vacuums.

Referenced by btbulkdelete().
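
Cycle ID assignment is just a shared counter that skips zero and wraps before the reserved values above MAX_BT_CYCLE_ID. A standalone sketch of that wraparound behavior; the static counter here merely stands in for the shared btvacinfo state.

#include <stdio.h>
#include <stdint.h>

#define MAX_BT_CYCLE_ID 0xFF7F

static uint16_t cycle_ctr = 0xFF7E;     /* pretend this is the shared counter */

static uint16_t
next_cycle_id(void)
{
    uint16_t result = ++cycle_ctr;

    if (result == 0 || result > MAX_BT_CYCLE_ID)
        result = cycle_ctr = 1;         /* skip 0 and the reserved high values */
    return result;
}

int
main(void)
{
    for (int i = 0; i < 3; i++)
        printf("cycle id: %u\n", (unsigned) next_cycle_id());  /* 65407, 1, 2 */
    return 0;
}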

◆ _bt_swap_posting()

IndexTuple _bt_swap_posting ( IndexTuple  newitem,
IndexTuple  oposting,
int  postingoff 
)

Definition at line 1022 of file nbtdedup.c.

1023{
1024 int nhtids;
1025 char *replacepos;
1026 char *replaceposright;
1027 Size nmovebytes;
1028 IndexTuple nposting;
1029
1030 nhtids = BTreeTupleGetNPosting(oposting);
1031 Assert(_bt_posting_valid(oposting));
1032
1033 /*
1034 * The postingoff argument originated as a _bt_binsrch_posting() return
1035 * value. It will be 0 in the event of corruption that makes a leaf page
1036 * contain a non-pivot tuple that's somehow identical to newitem (no two
1037 * non-pivot tuples should ever have the same TID). This has been known
1038 * to happen in the field from time to time.
1039 *
1040 * Perform a basic sanity check to catch this case now.
1041 */
1042 if (!(postingoff > 0 && postingoff < nhtids))
1043 elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
1044 nhtids, postingoff);
1045
1046 /*
1047 * Move item pointers in posting list to make a gap for the new item's
1048 * heap TID. We shift TIDs one place to the right, losing original
1049 * rightmost TID. (nmovebytes must not include TIDs to the left of
1050 * postingoff, nor the existing rightmost/max TID that gets overwritten.)
1051 */
1052 nposting = CopyIndexTuple(oposting);
1053 replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
1054 replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
1055 nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
1056 memmove(replaceposright, replacepos, nmovebytes);
1057
1058 /* Fill the gap at postingoff with TID of new item (original new TID) */
1059 Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
1060 ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
1061
1062 /* Now copy oposting's rightmost/max TID into new item (final new TID) */
1063 ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);
1064
1064
1065 Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
1066 BTreeTupleGetHeapTID(newitem)) < 0);
1067 Assert(_bt_posting_valid(nposting));
1068
1069 return nposting;
1070}

References Assert(), BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), CopyIndexTuple(), elog, ERROR, ItemPointerCompare(), ItemPointerCopy(), and IndexTupleData::t_tid.

Referenced by _bt_insertonpg(), btree_xlog_insert(), and btree_xlog_split().
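
The TID shuffle is easier to see with small integers standing in for heap TIDs: the incoming TID drops into the slot chosen by the binary search, everything to its right shifts one place, and the displaced rightmost TID becomes the TID of the tuple that actually gets inserted. A minimal standalone model with invented values.

#include <stdio.h>
#include <string.h>

int
main(void)
{
    int posting[5] = {10, 20, 30, 40, 50};  /* stand-ins for heap TIDs */
    int newitem = 35;
    int postingoff = 3;                     /* as found by a binary search */
    int oldmax = posting[4];

    /* shift TIDs one place to the right, losing the original rightmost TID */
    memmove(&posting[postingoff + 1], &posting[postingoff],
            (5 - postingoff - 1) * sizeof(int));
    posting[postingoff] = newitem;          /* original new TID goes here */
    newitem = oldmax;                       /* the new item carries the old max TID */

    for (int i = 0; i < 5; i++)
        printf("%d ", posting[i]);          /* 10 20 30 35 40 */
    printf("| newitem now %d\n", newitem);  /* 50 */
    return 0;
}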

◆ _bt_truncate()

IndexTuple _bt_truncate ( Relation  rel,
IndexTuple  lastleft,
IndexTuple  firstright,
BTScanInsert  itup_key 
)

Definition at line 3790 of file nbtutils.c.

3792{
3793 TupleDesc itupdesc = RelationGetDescr(rel);
3794 int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
3795 int keepnatts;
3796 IndexTuple pivot;
3797 IndexTuple tidpivot;
3798 ItemPointer pivotheaptid;
3799 Size newsize;
3800
3801 /*
3802 * We should only ever truncate non-pivot tuples from leaf pages. It's
3803 * never okay to truncate when splitting an internal page.
3804 */
3805 Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright));
3806
3807 /* Determine how many attributes must be kept in truncated tuple */
3808 keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key);
3809
3810#ifdef DEBUG_NO_TRUNCATE
3811 /* Force truncation to be ineffective for testing purposes */
3812 keepnatts = nkeyatts + 1;
3813#endif
3814
3815 pivot = index_truncate_tuple(itupdesc, firstright,
3816 Min(keepnatts, nkeyatts));
3817
3818 if (BTreeTupleIsPosting(pivot))
3819 {
3820 /*
3821 * index_truncate_tuple() just returns a straight copy of firstright
3822 * when it has no attributes to truncate. When that happens, we may
3823 * need to truncate away a posting list here instead.
3824 */
3825 Assert(keepnatts == nkeyatts || keepnatts == nkeyatts + 1);
3827 pivot->t_info &= ~INDEX_SIZE_MASK;
3828 pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright));
3829 }
3830
3831 /*
3832 * If there is a distinguishing key attribute within pivot tuple, we're
3833 * done
3834 */
3835 if (keepnatts <= nkeyatts)
3836 {
3837 BTreeTupleSetNAtts(pivot, keepnatts, false);
3838 return pivot;
3839 }
3840
3841 /*
3842 * We have to store a heap TID in the new pivot tuple, since no non-TID
3843 * key attribute value in firstright distinguishes the right side of the
3844 * split from the left side. nbtree conceptualizes this case as an
3845 * inability to truncate away any key attributes, since heap TID is
3846 * treated as just another key attribute (despite lacking a pg_attribute
3847 * entry).
3848 *
3849 * Use enlarged space that holds a copy of pivot. We need the extra space
3850 * to store a heap TID at the end (using the special pivot tuple
3851 * representation). Note that the original pivot already has firstright's
3852 * possible posting list/non-key attribute values removed at this point.
3853 */
3854 newsize = MAXALIGN(IndexTupleSize(pivot)) + MAXALIGN(sizeof(ItemPointerData));
3855 tidpivot = palloc0(newsize);
3856 memcpy(tidpivot, pivot, MAXALIGN(IndexTupleSize(pivot)));
3857 /* Cannot leak memory here */
3858 pfree(pivot);
3859
3860 /*
3861 * Store all of firstright's key attribute values plus a tiebreaker heap
3862 * TID value in enlarged pivot tuple
3863 */
3864 tidpivot->t_info &= ~INDEX_SIZE_MASK;
3865 tidpivot->t_info |= newsize;
3866 BTreeTupleSetNAtts(tidpivot, nkeyatts, true);
3867 pivotheaptid = BTreeTupleGetHeapTID(tidpivot);
3868
3869 /*
3870 * Lehman & Yao use lastleft as the leaf high key in all cases, but don't
3871 * consider suffix truncation. It seems like a good idea to follow that
3872 * example in cases where no truncation takes place -- use lastleft's heap
3873 * TID. (This is also the closest value to negative infinity that's
3874 * legally usable.)
3875 */
3876 ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid);
3877
3878 /*
3879 * We're done. Assert() that heap TID invariants hold before returning.
3880 *
3881 * Lehman and Yao require that the downlink to the right page, which is to
3882 * be inserted into the parent page in the second phase of a page split, be
3883 * a strict lower bound on items on the right page, and a non-strict upper
3884 * bound for items on the left page. Assert that heap TIDs follow these
3885 * invariants, since a heap TID value is apparently needed as a
3886 * tiebreaker.
3887 */
3888#ifndef DEBUG_NO_TRUNCATE
3889 Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft),
3890 BTreeTupleGetHeapTID(firstright)) < 0);
3891 Assert(ItemPointerCompare(pivotheaptid,
3892 BTreeTupleGetHeapTID(lastleft)) >= 0);
3893 Assert(ItemPointerCompare(pivotheaptid,
3894 BTreeTupleGetHeapTID(firstright)) < 0);
3895#else
3896
3897 /*
3898 * Those invariants aren't guaranteed to hold for lastleft + firstright
3899 * heap TID attribute values when they're considered here only because
3900 * DEBUG_NO_TRUNCATE is defined (a heap TID is probably not actually
3901 * needed as a tiebreaker). DEBUG_NO_TRUNCATE must therefore use a heap
3902 * TID value that always works as a strict lower bound for items to the
3903 * right. In particular, it must avoid using firstright's leading key
3904 * attribute values along with lastleft's heap TID value when lastleft's
3905 * TID happens to be greater than firstright's TID.
3906 */
3907 ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid);
3908
3909 /*
3910 * Pivot heap TID should never be fully equal to firstright. Note that
3911 * the pivot heap TID will still end up equal to lastleft's heap TID when
3912 * that's the only usable value.
3913 */
3914 ItemPointerSetOffsetNumber(pivotheaptid,
3916 Assert(ItemPointerCompare(pivotheaptid,
3917 BTreeTupleGetHeapTID(firstright)) < 0);
3918#endif
3919
3920 return tidpivot;
3921}
IndexTuple index_truncate_tuple(TupleDesc sourceDescriptor, IndexTuple source, int leavenatts)
Definition: indextuple.c:576
static void ItemPointerSetOffsetNumber(ItemPointerData *pointer, OffsetNumber offsetNumber)
Definition: itemptr.h:158
static void BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid)
Definition: nbtree.h:596
static int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key)
Definition: nbtutils.c:3935

References _bt_keep_natts(), Assert(), BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTreeTupleSetNAtts(), index_truncate_tuple(), IndexRelationGetNumberOfAttributes, IndexRelationGetNumberOfKeyAttributes, IndexTupleSize(), ItemPointerCompare(), ItemPointerCopy(), ItemPointerGetOffsetNumber(), ItemPointerSetOffsetNumber(), MAXALIGN, Min, OffsetNumberPrev, palloc0(), pfree(), RelationGetDescr, and IndexTupleData::t_info.

Referenced by _bt_buildadd(), and _bt_split().
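
The number of attributes the new pivot keeps is one more than the count of leading attributes shared by lastleft and firstright; only when every key attribute matches does the pivot also need the heap TID tiebreaker. Below is a standalone sketch with strings standing in for attribute values; unlike _bt_keep_natts() it ignores the insertion scankey's comparators, so it is only an illustration.

#include <stdio.h>
#include <string.h>

#define NKEYATTS 3

int
main(void)
{
    const char *lastleft[NKEYATTS]   = {"usa", "ny", "albany"};
    const char *firstright[NKEYATTS] = {"usa", "ny", "buffalo"};
    int keepnatts = 1;

    for (int attnum = 1; attnum <= NKEYATTS; attnum++)
    {
        if (strcmp(lastleft[attnum - 1], firstright[attnum - 1]) != 0)
            break;              /* first attribute that distinguishes the pages */
        keepnatts++;
    }

    if (keepnatts <= NKEYATTS)
        printf("pivot keeps %d of %d attributes\n", keepnatts, NKEYATTS);
    else
        printf("no attribute distinguishes the pages; keep a heap TID too\n");
    return 0;
}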

◆ _bt_unlockbuf()

void _bt_unlockbuf ( Relation  rel,
Buffer  buf 
)

Definition at line 1070 of file nbtpage.c.

1071{
1072 /*
1073 * Buffer is pinned and locked, which means that it is expected to be
1074 * defined and addressable. Check that proactively.
1075 */
1076 VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1077
1078 /* LockBuffer() asserts that pin is held by this backend */
1079 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1080
1081 if (!RelationUsesLocalBuffers(rel))
1082 VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
1083}
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:196
#define VALGRIND_CHECK_MEM_IS_DEFINED(addr, size)
Definition: memdebug.h:23
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition: memdebug.h:27

References buf, BUFFER_LOCK_UNLOCK, BufferGetPage(), LockBuffer(), RelationUsesLocalBuffers, VALGRIND_CHECK_MEM_IS_DEFINED, and VALGRIND_MAKE_MEM_NOACCESS.

Referenced by _bt_drop_lock_and_maybe_pin(), _bt_getroot(), _bt_killitems(), _bt_moveright(), _bt_pagedel(), _bt_readfirstpage(), _bt_relandgetbuf(), _bt_relbuf(), _bt_search(), _bt_set_cleanup_info(), and _bt_unlink_halfdead_page().

◆ _bt_update_posting()

void _bt_update_posting ( BTVacuumPosting  vacposting)

Definition at line 924 of file nbtdedup.c.

925{
926 IndexTuple origtuple = vacposting->itup;
927 uint32 keysize,
928 newsize;
929 IndexTuple itup;
930 int nhtids;
931 int ui,
932 d;
933 ItemPointer htids;
934
935 nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
936
937 Assert(_bt_posting_valid(origtuple));
938 Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
939
940 /*
941 * Determine final size of new tuple.
942 *
943 * This calculation needs to match the code used within _bt_form_posting()
944 * for new posting list tuples. We avoid calling _bt_form_posting() here
945 * to save ourselves a second memory allocation for a htids workspace.
946 */
947 keysize = BTreeTupleGetPostingOffset(origtuple);
948 if (nhtids > 1)
949 newsize = MAXALIGN(keysize +
950 nhtids * sizeof(ItemPointerData));
951 else
952 newsize = keysize;
953
954 Assert(newsize <= INDEX_SIZE_MASK);
955 Assert(newsize == MAXALIGN(newsize));
956
957 /* Allocate memory using palloc0() (matches index_form_tuple()) */
958 itup = palloc0(newsize);
959 memcpy(itup, origtuple, keysize);
960 itup->t_info &= ~INDEX_SIZE_MASK;
961 itup->t_info |= newsize;
962
963 if (nhtids > 1)
964 {
965 /* Form posting list tuple */
966 BTreeTupleSetPosting(itup, nhtids, keysize);
967 htids = BTreeTupleGetPosting(itup);
968 }
969 else
970 {
971 /* Form standard non-pivot tuple */
972 itup->t_info &= ~INDEX_ALT_TID_MASK;
973 htids = &itup->t_tid;
974 }
975
976 ui = 0;
977 d = 0;
978 for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
979 {
980 if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
981 {
982 d++;
983 continue;
984 }
985 htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
986 }
987 Assert(ui == nhtids);
988 Assert(d == vacposting->ndeletedtids);
989 Assert(nhtids == 1 || _bt_posting_valid(itup));
990 Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));
991
992 /* vacposting arg's itup will now point to updated version */
993 vacposting->itup = itup;
994}

References Assert(), BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleGetPostingN(), BTreeTupleGetPostingOffset(), BTreeTupleSetPosting(), BTVacuumPostingData::deletetids, i, INDEX_SIZE_MASK, ItemPointerIsValid(), BTVacuumPostingData::itup, MAXALIGN, BTVacuumPostingData::ndeletedtids, palloc0(), IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_delitems_update(), and btree_xlog_updates().
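
The core of the routine above is the ui/d merge loop: it walks the original posting list once, skipping positions named in the sorted deletetids[] array and copying every surviving TID. The standalone sketch below (not PostgreSQL code; plain ints stand in for heap TIDs) isolates that filtering technique.

    #include <stdio.h>

    int
    main(void)
    {
        int     htids[] = {100, 101, 102, 103, 104};    /* stand-ins for heap TIDs */
        int     deletetids[] = {1, 3};                  /* sorted positions to drop */
        int     ndeletedtids = 2;
        int     newhtids[5];
        int     ui = 0,
                d = 0;

        for (int i = 0; i < 5; i++)
        {
            if (d < ndeletedtids && deletetids[d] == i)
            {
                d++;                /* this position was deleted; skip it */
                continue;
            }
            newhtids[ui++] = htids[i];
        }

        /* prints "100 102 104": three TIDs survive */
        for (int i = 0; i < ui; i++)
            printf("%d ", newhtids[i]);
        printf("\n");
        return 0;
    }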

◆ _bt_upgradelockbufcleanup()

void _bt_upgradelockbufcleanup ( Relation  rel,
Buffer  buf 
)

Definition at line 1109 of file nbtpage.c.

1110{
1111 /*
1112 * Buffer is pinned and locked, which means that it is expected to be
1113 * defined and addressable. Check that proactively.
1114 */
1115 VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1116
1117 /* LockBuffer() asserts that pin is held by this backend */
1118 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1119 LockBufferForCleanup(buf);
1120}
void LockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:5617

References buf, BUFFER_LOCK_UNLOCK, BufferGetPage(), LockBuffer(), LockBufferForCleanup(), and VALGRIND_CHECK_MEM_IS_DEFINED.

Referenced by btvacuumpage().

◆ _bt_upgrademetapage()

void _bt_upgrademetapage ( Page  page)

Definition at line 107 of file nbtpage.c.

108{
109 BTMetaPageData *metad;
111
112 metad = BTPageGetMeta(page);
113 metaopaque = BTPageGetOpaque(page);
114
115 /* It must be really a meta page of upgradable version */
116 Assert(metaopaque->btpo_flags & BTP_META);
119
120 /* Set version number and fill extra fields added into version 3 */
124 /* Only a REINDEX can set this field */
125 Assert(!metad->btm_allequalimage);
126 metad->btm_allequalimage = false;
127
128 /* Adjust pd_lower (see _bt_initmetapage() for details) */
129 ((PageHeader) page)->pd_lower =
130 ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
131}

References Assert(), BTMetaPageData::btm_allequalimage, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_last_cleanup_num_heap_tuples, BTMetaPageData::btm_version, BTP_META, BTPageGetMeta, BTPageGetOpaque, BTREE_MIN_VERSION, BTREE_NOVAC_VERSION, and PG_USED_FOR_ASSERTS_ONLY.

Referenced by _bt_getroot(), _bt_insertonpg(), _bt_newlevel(), _bt_set_cleanup_info(), and _bt_unlink_halfdead_page().

◆ _bt_vacuum_cycleid()

BTCycleId _bt_vacuum_cycleid ( Relation  rel)

Definition at line 3527 of file nbtutils.c.

3528{
3529 BTCycleId result = 0;
3530 int i;
3531
3532 /* Share lock is enough since this is a read-only operation */
3533 LWLockAcquire(BtreeVacuumLock, LW_SHARED);
3534
3535 for (i = 0; i < btvacinfo->num_vacuums; i++)
3536 {
3537 BTOneVacInfo *vac = &btvacinfo->vacuums[i];
3538
3539 if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
3540 vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
3541 {
3542 result = vac->cycleid;
3543 break;
3544 }
3545 }
3546
3547 LWLockRelease(BtreeVacuumLock);
3548 return result;
3549}
@ LW_SHARED
Definition: lwlock.h:115

References btvacinfo, BTOneVacInfo::cycleid, LockRelId::dbId, i, LockInfoData::lockRelId, LW_SHARED, LWLockAcquire(), LWLockRelease(), BTVacInfo::num_vacuums, RelationData::rd_lockInfo, BTOneVacInfo::relid, LockRelId::relId, and BTVacInfo::vacuums.

Referenced by _bt_split().
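
The lookup above is a plain linear scan over a small shared array keyed by (database OID, relation OID); it returns 0 when no VACUUM is currently registered for the relation. A standalone sketch of that pattern follows (illustrative types in place of BTVacInfo/BTOneVacInfo, and without the LWLock):

    #include <stdio.h>

    typedef struct DemoVacInfo
    {
        unsigned    dbId;
        unsigned    relId;
        unsigned    cycleid;
    } DemoVacInfo;

    int
    main(void)
    {
        DemoVacInfo vacuums[] = {{5, 100, 7}, {5, 200, 8}, {6, 100, 9}};
        unsigned    dbId = 5,
                    relId = 200,
                    result = 0;

        for (int i = 0; i < 3; i++)
        {
            if (vacuums[i].relId == relId && vacuums[i].dbId == dbId)
            {
                result = vacuums[i].cycleid;
                break;
            }
        }
        printf("cycle id: %u\n", result);   /* 8; stays 0 if nothing matches */
        return 0;
    }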

◆ _bt_vacuum_needs_cleanup()

bool _bt_vacuum_needs_cleanup ( Relation  rel)

Definition at line 179 of file nbtpage.c.

180{
181 Buffer metabuf;
182 Page metapg;
183 BTMetaPageData *metad;
184 uint32 btm_version;
185 BlockNumber prev_num_delpages;
186
187 /*
188 * Copy details from metapage to local variables quickly.
189 *
190 * Note that we deliberately avoid using cached version of metapage here.
191 */
192 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
193 metapg = BufferGetPage(metabuf);
194 metad = BTPageGetMeta(metapg);
195 btm_version = metad->btm_version;
196
197 if (btm_version < BTREE_NOVAC_VERSION)
198 {
199 /*
200 * Metapage needs to be dynamically upgraded to store fields that are
201 * only present when btm_version >= BTREE_NOVAC_VERSION
202 */
203 _bt_relbuf(rel, metabuf);
204 return true;
205 }
206
207 prev_num_delpages = metad->btm_last_cleanup_num_delpages;
208 _bt_relbuf(rel, metabuf);
209
210 /*
211 * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
212 * total size of the index. We can reasonably expect (though are not
213 * guaranteed) to be able to recycle this many pages if we decide to do a
214 * btvacuumscan call during the ongoing btvacuumcleanup. For further
215 * details see the nbtree/README section on placing deleted pages in the
216 * FSM.
217 */
218 if (prev_num_delpages > 0 &&
219 prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20)
220 return true;
221
222 return false;
223}
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:280

References _bt_getbuf(), _bt_relbuf(), BT_READ, BTMetaPageData::btm_last_cleanup_num_delpages, BTMetaPageData::btm_version, BTPageGetMeta, BTREE_METAPAGE, BTREE_NOVAC_VERSION, BufferGetPage(), and RelationGetNumberOfBlocks.

Referenced by btvacuumcleanup().
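
The trigger condition above amounts to "more than 5% of the index's pages are deleted but not yet recyclable", written as prev_num_delpages > nblocks / 20. A standalone sketch with made-up numbers:

    #include <stdbool.h>
    #include <stdio.h>

    static bool
    needs_cleanup(unsigned prev_num_delpages, unsigned nblocks)
    {
        /* same shape as the test above: nonzero and above the 5% threshold */
        return prev_num_delpages > 0 && prev_num_delpages > nblocks / 20;
    }

    int
    main(void)
    {
        /* 10,000-page index: 400 deleted pages (4%) skips the scan,
         * 600 deleted pages (6%) triggers it */
        printf("%d\n", needs_cleanup(400, 10000));  /* 0 */
        printf("%d\n", needs_cleanup(600, 10000));  /* 1 */
        return 0;
    }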

◆ btadjustmembers()

void btadjustmembers ( Oid  opfamilyoid,
Oid  opclassoid,
List operators,
List functions 
)

Definition at line 288 of file nbtvalidate.c.

292{
293 Oid opcintype;
294 ListCell *lc;
295
296 /*
297 * Btree operators and comparison support functions are always "loose"
298 * members of the opfamily if they are cross-type. If they are not
299 * cross-type, we prefer to tie them to the appropriate opclass ... but if
300 * the user hasn't created one, we can't do that, and must fall back to
301 * using the opfamily dependency. (We mustn't force creation of an
302 * opclass in such a case, as leaving an incomplete opclass laying about
303 * would be bad. Throwing an error is another undesirable alternative.)
304 *
305 * This behavior results in a bit of a dump/reload hazard, in that the
306 * order of restoring objects could affect what dependencies we end up
307 * with. pg_dump's existing behavior will preserve the dependency choices
308 * in most cases, but not if a cross-type operator has been bound tightly
309 * into an opclass. That's a mistake anyway, so silently "fixing" it
310 * isn't awful.
311 *
312 * Optional support functions are always "loose" family members.
313 *
314 * To avoid repeated lookups, we remember the most recently used opclass's
315 * input type.
316 */
317 if (OidIsValid(opclassoid))
318 {
319 /* During CREATE OPERATOR CLASS, need CCI to see the pg_opclass row */
320 CommandCounterIncrement();
321 opcintype = get_opclass_input_type(opclassoid);
322 }
323 else
324 opcintype = InvalidOid;
325
326 /*
327 * We handle operators and support functions almost identically, so rather
328 * than duplicate this code block, just join the lists.
329 */
330 foreach(lc, list_concat_copy(operators, functions))
331 {
332 OpFamilyMember *op = lfirst(lc);
333
334 if (op->is_func && op->number != BTORDER_PROC)
335 {
336 /* Optional support proc, so always a soft family dependency */
337 op->ref_is_hard = false;
338 op->ref_is_family = true;
339 op->refobjid = opfamilyoid;
340 }
341 else if (op->lefttype != op->righttype)
342 {
343 /* Cross-type, so always a soft family dependency */
344 op->ref_is_hard = false;
345 op->ref_is_family = true;
346 op->refobjid = opfamilyoid;
347 }
348 else
349 {
350 /* Not cross-type; is there a suitable opclass? */
351 if (op->lefttype != opcintype)
352 {
353 /* Avoid repeating this expensive lookup, even if it fails */
354 opcintype = op->lefttype;
355 opclassoid = opclass_for_family_datatype(BTREE_AM_OID,
356 opfamilyoid,
357 opcintype);
358 }
359 if (OidIsValid(opclassoid))
360 {
361 /* Hard dependency on opclass */
362 op->ref_is_hard = true;
363 op->ref_is_family = false;
364 op->refobjid = opclassoid;
365 }
366 else
367 {
368 /* We're stuck, so make a soft dependency on the opfamily */
369 op->ref_is_hard = false;
370 op->ref_is_family = true;
371 op->refobjid = opfamilyoid;
372 }
373 }
374 }
375}
Oid opclass_for_family_datatype(Oid amoid, Oid opfamilyoid, Oid datatypeoid)
Definition: amvalidate.c:236
List * list_concat_copy(const List *list1, const List *list2)
Definition: list.c:598
Oid get_opclass_input_type(Oid opclass)
Definition: lsyscache.c:1304
#define lfirst(lc)
Definition: pg_list.h:172
Oid refobjid
Definition: amapi.h:96
Oid lefttype
Definition: amapi.h:91
bool ref_is_family
Definition: amapi.h:95
Oid righttype
Definition: amapi.h:92
int number
Definition: amapi.h:90
bool is_func
Definition: amapi.h:88
bool ref_is_hard
Definition: amapi.h:94
void CommandCounterIncrement(void)
Definition: xact.c:1100

References BTORDER_PROC, CommandCounterIncrement(), functions, get_opclass_input_type(), InvalidOid, OpFamilyMember::is_func, OpFamilyMember::lefttype, lfirst, list_concat_copy(), OpFamilyMember::number, OidIsValid, opclass_for_family_datatype(), OpFamilyMember::ref_is_family, OpFamilyMember::ref_is_hard, OpFamilyMember::refobjid, and OpFamilyMember::righttype.

Referenced by bthandler().

◆ btbeginscan()

IndexScanDesc btbeginscan ( Relation  rel,
int  nkeys,
int  norderbys 
)

Definition at line 332 of file nbtree.c.

333{
334 IndexScanDesc scan;
335 BTScanOpaque so;
336
337 /* no order by operators allowed */
338 Assert(norderbys == 0);
339
340 /* get the scan */
341 scan = RelationGetIndexScan(rel, nkeys, norderbys);
342
343 /* allocate private workspace */
344 so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
345 BTScanPosInvalidate(so->currPos);
346 BTScanPosInvalidate(so->markPos);
347 if (scan->numberOfKeys > 0)
348 so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
349 else
350 so->keyData = NULL;
351
352 so->skipScan = false;
353 so->needPrimScan = false;
354 so->scanBehind = false;
355 so->oppositeDirCheck = false;
356 so->arrayKeys = NULL;
357 so->orderProcs = NULL;
358 so->arrayContext = NULL;
359
360 so->killedItems = NULL; /* until needed */
361 so->numKilled = 0;
362
363 /*
364 * We don't know yet whether the scan will be index-only, so we do not
365 * allocate the tuple workspace arrays until btrescan. However, we set up
366 * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
367 */
368 so->currTuples = so->markTuples = NULL;
369
370 scan->xs_itupdesc = RelationGetDescr(rel);
371
372 scan->opaque = so;
373
374 return scan;
375}
IndexScanDesc RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
Definition: genam.c:80
char * markTuples
Definition: nbtree.h:1080
char * currTuples
Definition: nbtree.h:1079
BTScanPosData markPos
Definition: nbtree.h:1093
struct TupleDescData * xs_itupdesc
Definition: relscan.h:168

References BTScanOpaqueData::arrayContext, BTScanOpaqueData::arrayKeys, Assert(), BTScanPosInvalidate, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanOpaqueData::keyData, BTScanOpaqueData::killedItems, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanOpaqueData::needPrimScan, IndexScanDescData::numberOfKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, BTScanOpaqueData::orderProcs, palloc(), RelationGetDescr, RelationGetIndexScan(), BTScanOpaqueData::scanBehind, BTScanOpaqueData::skipScan, and IndexScanDescData::xs_itupdesc.

Referenced by bthandler().

◆ btbuild()

IndexBuildResult * btbuild ( Relation  heap,
Relation  index,
struct IndexInfo indexInfo 
)

Definition at line 295 of file nbtsort.c.

296{
297 IndexBuildResult *result;
298 BTBuildState buildstate;
299 double reltuples;
300
301#ifdef BTREE_BUILD_STATS
302 if (log_btree_build_stats)
303 ResetUsage();
304#endif /* BTREE_BUILD_STATS */
305
306 buildstate.isunique = indexInfo->ii_Unique;
307 buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
308 buildstate.havedead = false;
309 buildstate.heap = heap;
310 buildstate.spool = NULL;
311 buildstate.spool2 = NULL;
312 buildstate.indtuples = 0;
313 buildstate.btleader = NULL;
314
315 /*
316 * We expect to be called exactly once for any index relation. If that's
317 * not the case, big trouble's what we have.
318 */
319 if (RelationGetNumberOfBlocks(index) != 0)
320 elog(ERROR, "index \"%s\" already contains data",
321 RelationGetRelationName(index));
322
323 reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
324
325 /*
326 * Finish the build by (1) completing the sort of the spool file, (2)
327 * inserting the sorted tuples into btree pages and (3) building the upper
328 * levels. Finally, it may also be necessary to end use of parallelism.
329 */
330 _bt_leafbuild(buildstate.spool, buildstate.spool2);
331 _bt_spooldestroy(buildstate.spool);
332 if (buildstate.spool2)
333 _bt_spooldestroy(buildstate.spool2);
334 if (buildstate.btleader)
335 _bt_end_parallel(buildstate.btleader);
336
337 result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
338
339 result->heap_tuples = reltuples;
340 result->index_tuples = buildstate.indtuples;
341
342#ifdef BTREE_BUILD_STATS
343 if (log_btree_build_stats)
344 {
345 ShowUsage("BTREE BUILD STATS");
346 ResetUsage();
347 }
348#endif /* BTREE_BUILD_STATS */
349
350 return result;
351}
static void _bt_end_parallel(BTLeader *btleader)
Definition: nbtsort.c:1609
static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
Definition: nbtsort.c:538
static double _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, IndexInfo *indexInfo)
Definition: nbtsort.c:365
static void _bt_spooldestroy(BTSpool *btspool)
Definition: nbtsort.c:517
bool isunique
Definition: nbtsort.c:206
BTSpool * spool
Definition: nbtsort.c:210
BTLeader * btleader
Definition: nbtsort.c:224
bool nulls_not_distinct
Definition: nbtsort.c:207
bool havedead
Definition: nbtsort.c:208
Relation heap
Definition: nbtsort.c:209
BTSpool * spool2
Definition: nbtsort.c:216
double indtuples
Definition: nbtsort.c:217
double heap_tuples
Definition: genam.h:55
double index_tuples
Definition: genam.h:56
bool ii_Unique
Definition: execnodes.h:209
bool ii_NullsNotDistinct
Definition: execnodes.h:210

References _bt_end_parallel(), _bt_leafbuild(), _bt_spooldestroy(), _bt_spools_heapscan(), BTBuildState::btleader, elog, ERROR, BTBuildState::havedead, BTBuildState::heap, IndexBuildResult::heap_tuples, IndexInfo::ii_NullsNotDistinct, IndexInfo::ii_Unique, IndexBuildResult::index_tuples, BTBuildState::indtuples, BTBuildState::isunique, log_btree_build_stats, BTBuildState::nulls_not_distinct, palloc(), RelationGetNumberOfBlocks, RelationGetRelationName, ResetUsage(), ShowUsage(), BTBuildState::spool, and BTBuildState::spool2.

Referenced by bthandler().

◆ btbuildempty()

void btbuildempty ( Relation  index)

Definition at line 179 of file nbtree.c.

180{
181 bool allequalimage = _bt_allequalimage(index, false);
182 BulkWriteState *bulkstate;
183 BulkWriteBuffer metabuf;
184
185 bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
186
187 /* Construct metapage. */
188 metabuf = smgr_bulk_get_buf(bulkstate);
189 _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
190 smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
191
192 smgr_bulk_finish(bulkstate);
193}
BulkWriteState * smgr_bulk_start_rel(Relation rel, ForkNumber forknum)
Definition: bulk_write.c:87
void smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std)
Definition: bulk_write.c:323
BulkWriteBuffer smgr_bulk_get_buf(BulkWriteState *bulkstate)
Definition: bulk_write.c:347
void smgr_bulk_finish(BulkWriteState *bulkstate)
Definition: bulk_write.c:130
void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
Definition: nbtpage.c:67
bool _bt_allequalimage(Relation rel, bool debugmessage)
Definition: nbtutils.c:4273
@ INIT_FORKNUM
Definition: relpath.h:61

References _bt_allequalimage(), _bt_initmetapage(), BTREE_METAPAGE, INIT_FORKNUM, P_NONE, smgr_bulk_finish(), smgr_bulk_get_buf(), smgr_bulk_start_rel(), and smgr_bulk_write().

Referenced by bthandler().

◆ btbuildphasename()

char * btbuildphasename ( int64  phasenum)

Definition at line 3742 of file nbtutils.c.

3743{
3744 switch (phasenum)
3745 {
3746 case PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE:
3747 return "initializing";
3748 case PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN:
3749 return "scanning table";
3750 case PROGRESS_BTREE_PHASE_PERFORMSORT_1:
3751 return "sorting live tuples";
3752 case PROGRESS_BTREE_PHASE_PERFORMSORT_2:
3753 return "sorting dead tuples";
3754 case PROGRESS_BTREE_PHASE_LEAF_LOAD:
3755 return "loading tuples in tree";
3756 default:
3757 return NULL;
3758 }
3759}
#define PROGRESS_BTREE_PHASE_PERFORMSORT_2
Definition: nbtree.h:1178
#define PROGRESS_BTREE_PHASE_LEAF_LOAD
Definition: nbtree.h:1179
#define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN
Definition: nbtree.h:1176
#define PROGRESS_BTREE_PHASE_PERFORMSORT_1
Definition: nbtree.h:1177
#define PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE
Definition: progress.h:109

References PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN, PROGRESS_BTREE_PHASE_LEAF_LOAD, PROGRESS_BTREE_PHASE_PERFORMSORT_1, PROGRESS_BTREE_PHASE_PERFORMSORT_2, and PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE.

Referenced by bthandler().

◆ btbulkdelete()

IndexBulkDeleteResult * btbulkdelete ( IndexVacuumInfo info,
IndexBulkDeleteResult stats,
IndexBulkDeleteCallback  callback,
void *  callback_state 
)

Definition at line 1033 of file nbtree.c.

1035{
1036 Relation rel = info->index;
1037 BTCycleId cycleid;
1038
1039 /* allocate stats if first time through, else re-use existing struct */
1040 if (stats == NULL)
1041 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
1042
1043 /* Establish the vacuum cycle ID to use for this scan */
1044 /* The ENSURE stuff ensures we clean up shared memory on failure */
1045 PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
1046 {
1047 cycleid = _bt_start_vacuum(rel);
1048
1049 btvacuumscan(info, stats, callback, callback_state, cycleid);
1050 }
1051 PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
1052 _bt_end_vacuum(rel);
1053
1054 return stats;
1055}
#define PG_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition: ipc.h:47
#define PG_END_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition: ipc.h:52
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid)
Definition: nbtree.c:1151
void _bt_end_vacuum_callback(int code, Datum arg)
Definition: nbtutils.c:3646
BTCycleId _bt_start_vacuum(Relation rel)
Definition: nbtutils.c:3561
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:327
Relation index
Definition: genam.h:69

References _bt_end_vacuum(), _bt_end_vacuum_callback(), _bt_start_vacuum(), btvacuumscan(), callback(), IndexVacuumInfo::index, palloc0(), PG_END_ENSURE_ERROR_CLEANUP, PG_ENSURE_ERROR_CLEANUP, and PointerGetDatum().

Referenced by bthandler().

◆ btcanreturn()

bool btcanreturn ( Relation  index,
int  attno 
)

Definition at line 1712 of file nbtree.c.

1713{
1714 return true;
1715}

Referenced by bthandler().

◆ btendscan()

void btendscan ( IndexScanDesc  scan)

Definition at line 438 of file nbtree.c.

439{
440 BTScanOpaque so = (BTScanOpaque) scan->opaque;
441
442 /* we aren't holding any read locks, but gotta drop the pins */
444 {
445 /* Before leaving current page, deal with any killed items */
446 if (so->numKilled > 0)
447 _bt_killitems(scan);
449 }
450
451 so->markItemIndex = -1;
453
454 /* No need to invalidate positions, the RAM is about to be freed. */
455
456 /* Release storage */
457 if (so->keyData != NULL)
458 pfree(so->keyData);
459 /* so->arrayKeys and so->orderProcs are in arrayContext */
460 if (so->arrayContext != NULL)
462 if (so->killedItems != NULL)
463 pfree(so->killedItems);
464 if (so->currTuples != NULL)
465 pfree(so->currTuples);
466 /* so->markTuples should not be pfree'd, see btrescan */
467 pfree(so);
468}
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:454
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:1015
void _bt_killitems(IndexScanDesc scan)
Definition: nbtutils.c:3310

References _bt_killitems(), BTScanOpaqueData::arrayContext, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, if(), BTScanOpaqueData::keyData, BTScanOpaqueData::killedItems, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, MemoryContextDelete(), BTScanOpaqueData::numKilled, IndexScanDescData::opaque, and pfree().

Referenced by bthandler().

◆ btestimateparallelscan()

Size btestimateparallelscan ( Relation  rel,
int  nkeys,
int  norderbys 
)

Definition at line 558 of file nbtree.c.

559{
560 int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
561 Size estnbtreeshared,
562 genericattrspace;
563
564 /*
565 * Pessimistically assume that every input scan key will be output with
566 * its own SAOP array
567 */
568 estnbtreeshared = offsetof(BTParallelScanDescData, btps_arrElems) +
569 sizeof(int) * nkeys;
570
571 /* Single column indexes cannot possibly use a skip array */
572 if (nkeyatts == 1)
573 return estnbtreeshared;
574
575 /*
576 * Pessimistically assume that all attributes prior to the least
577 * significant attribute require a skip array (and an associated key)
578 */
579 genericattrspace = datumEstimateSpace((Datum) 0, false, true,
580 sizeof(Datum));
581 for (int attnum = 1; attnum < nkeyatts; attnum++)
582 {
583 CompactAttribute *attr;
584
585 /*
586 * We make the conservative assumption that every index column will
587 * also require a skip array.
588 *
589 * Every skip array must have space to store its scan key's sk_flags.
590 */
591 estnbtreeshared = add_size(estnbtreeshared, sizeof(int));
592
593 /* Consider space required to store a datum of opclass input type */
594 attr = TupleDescCompactAttr(rel->rd_att, attnum - 1);
595 if (attr->attbyval)
596 {
597 /* This index attribute stores pass-by-value datums */
598 Size estfixed = datumEstimateSpace((Datum) 0, false,
599 true, attr->attlen);
600
601 estnbtreeshared = add_size(estnbtreeshared, estfixed);
602 continue;
603 }
604
605 /*
606 * This index attribute stores pass-by-reference datums.
607 *
608 * Assume that serializing this array will use just as much space as a
609 * pass-by-value datum, in addition to space for the largest possible
610 * whole index tuple (this is not just a per-datum portion of the
611 * largest possible tuple because that'd be almost as large anyway).
612 *
613 * This is quite conservative, but it's not clear how we could do much
614 * better. The executor requires an up-front storage request size
615 * that reliably covers the scan's high watermark memory usage. We
616 * can't be sure of the real high watermark until the scan is over.
617 */
618 estnbtreeshared = add_size(estnbtreeshared, genericattrspace);
619 estnbtreeshared = add_size(estnbtreeshared, BTMaxItemSize);
620 }
621
622 return estnbtreeshared;
623}
Size datumEstimateSpace(Datum value, bool isnull, bool typByVal, int typLen)
Definition: datum.c:412
Size add_size(Size s1, Size s2)
Definition: shmem.c:491
TupleDesc rd_att
Definition: rel.h:112

References add_size(), CompactAttribute::attbyval, CompactAttribute::attlen, attnum, BTMaxItemSize, BTParallelScanDescData::btps_arrElems, datumEstimateSpace(), IndexRelationGetNumberOfKeyAttributes, RelationData::rd_att, and TupleDescCompactAttr().

Referenced by bthandler().
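
The base estimate above uses the common "fixed header plus per-key flexible array member" sizing idiom: offsetof() up to the flexible member, plus one array element per scan key. A standalone sketch of that idiom (DemoParallelScan is an illustrative stand-in for BTParallelScanDescData):

    #include <stddef.h>
    #include <stdio.h>

    typedef struct DemoParallelScan
    {
        int     status;             /* fixed-size header fields would go here */
        int     arrElems[];         /* one element per scan key */
    } DemoParallelScan;

    int
    main(void)
    {
        int     nkeys = 4;
        size_t  est = offsetof(DemoParallelScan, arrElems) + sizeof(int) * nkeys;

        printf("estimated shared size: %zu bytes\n", est);
        return 0;
    }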

◆ btgetbitmap()

int64 btgetbitmap ( IndexScanDesc  scan,
TIDBitmap tbm 
)

Definition at line 286 of file nbtree.c.

287{
288 BTScanOpaque so = (BTScanOpaque) scan->opaque;
289 int64 ntids = 0;
290 ItemPointer heapTid;
291
292 /* Each loop iteration performs another primitive index scan */
293 do
294 {
295 /* Fetch the first page & tuple */
296 if (_bt_first(scan, ForwardScanDirection))
297 {
298 /* Save tuple ID, and continue scanning */
299 heapTid = &scan->xs_heaptid;
300 tbm_add_tuples(tbm, heapTid, 1, false);
301 ntids++;
302
303 for (;;)
304 {
305 /*
306 * Advance to next tuple within page. This is the same as the
307 * easy case in _bt_next().
308 */
309 if (++so->currPos.itemIndex > so->currPos.lastItem)
310 {
311 /* let _bt_next do the heavy lifting */
312 if (!_bt_next(scan, ForwardScanDirection))
313 break;
314 }
315
316 /* Save tuple ID, and continue scanning */
317 heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
318 tbm_add_tuples(tbm, heapTid, 1, false);
319 ntids++;
320 }
321 }
322 /* Now see if we need another primitive index scan */
323 } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));
324
325 return ntids;
326}
int64_t int64
Definition: c.h:499
bool _bt_first(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:882
bool _bt_next(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1541
bool _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir)
Definition: nbtutils.c:1339
ItemPointerData xs_heaptid
Definition: relscan.h:172
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, bool recheck)
Definition: tidbitmap.c:366

References _bt_first(), _bt_next(), BTScanOpaqueData::currPos, ForwardScanDirection, BTScanPosItem::heapTid, if(), BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, IndexScanDescData::opaque, tbm_add_tuples(), and IndexScanDescData::xs_heaptid.

Referenced by bthandler().

◆ btgettreeheight()

int btgettreeheight ( Relation  rel)

Definition at line 1721 of file nbtree.c.

1722{
1723 return _bt_getrootheight(rel);
1724}
int _bt_getrootheight(Relation rel)
Definition: nbtpage.c:675

References _bt_getrootheight().

Referenced by bthandler().

◆ btgettuple()

bool btgettuple ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 226 of file nbtree.c.

227{
228 BTScanOpaque so = (BTScanOpaque) scan->opaque;
229 bool res;
230
231 /* btree indexes are never lossy */
232 scan->xs_recheck = false;
233
234 /* Each loop iteration performs another primitive index scan */
235 do
236 {
237 /*
238 * If we've already initialized this scan, we can just advance it in
239 * the appropriate direction. If we haven't done so yet, we call
240 * _bt_first() to get the first item in the scan.
241 */
242 if (!BTScanPosIsValid(so->currPos))
243 res = _bt_first(scan, dir);
244 else
245 {
246 /*
247 * Check to see if we should kill the previously-fetched tuple.
248 */
249 if (scan->kill_prior_tuple)
250 {
251 /*
252 * Yes, remember it for later. (We'll deal with all such
253 * tuples at once right before leaving the index page.) The
254 * test for numKilled overrun is not just paranoia: if the
255 * caller reverses direction in the indexscan then the same
256 * item might get entered multiple times. It's not worth
257 * trying to optimize that, so we don't detect it, but instead
258 * just forget any excess entries.
259 */
260 if (so->killedItems == NULL)
261 so->killedItems = (int *)
262 palloc(MaxTIDsPerBTreePage * sizeof(int));
263 if (so->numKilled < MaxTIDsPerBTreePage)
264 so->killedItems[so->numKilled++] = so->currPos.itemIndex;
265 }
266
267 /*
268 * Now continue the scan.
269 */
270 res = _bt_next(scan, dir);
271 }
272
273 /* If we have a tuple, return it ... */
274 if (res)
275 break;
276 /* ... otherwise see if we need another primitive index scan */
277 } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
278
279 return res;
280}
bool kill_prior_tuple
Definition: relscan.h:147

References _bt_first(), _bt_next(), BTScanPosIsValid, BTScanOpaqueData::currPos, if(), BTScanPosData::itemIndex, IndexScanDescData::kill_prior_tuple, BTScanOpaqueData::killedItems, MaxTIDsPerBTreePage, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, palloc(), and IndexScanDescData::xs_recheck.

Referenced by bthandler().
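
The killed-items bookkeeping above allocates its array lazily on the first kill, caps it at one page's worth of entries, and simply forgets any overflow rather than growing the array. A standalone sketch of that pattern (the capacity constant is an illustrative stand-in for MaxTIDsPerBTreePage):

    #include <stdio.h>
    #include <stdlib.h>

    #define DEMO_MAX_ITEMS 4        /* stand-in for MaxTIDsPerBTreePage */

    int
    main(void)
    {
        int    *killed = NULL;
        int     nkilled = 0;

        for (int item = 0; item < 6; item++)
        {
            if (killed == NULL)     /* allocate only when first needed */
                killed = malloc(DEMO_MAX_ITEMS * sizeof(int));
            if (nkilled < DEMO_MAX_ITEMS)
                killed[nkilled++] = item;   /* excess entries are just dropped */
        }
        printf("remembered %d of 6 killed items\n", nkilled);
        free(killed);
        return 0;
    }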

◆ btinitparallelscan()

void btinitparallelscan ( void *  target)

◆ btinsert()

bool btinsert ( Relation  rel,
Datum values,
bool *  isnull,
ItemPointer  ht_ctid,
Relation  heapRel,
IndexUniqueCheck  checkUnique,
bool  indexUnchanged,
struct IndexInfo indexInfo 
)

Definition at line 202 of file nbtree.c.

207{
208 bool result;
209 IndexTuple itup;
210
211 /* generate an index tuple */
212 itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
213 itup->t_tid = *ht_ctid;
214
215 result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
216
217 pfree(itup);
218
219 return result;
220}
static Datum values[MAXATTR]
Definition: bootstrap.c:151
IndexTuple index_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
Definition: indextuple.c:44
bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
Definition: nbtinsert.c:102

References _bt_doinsert(), index_form_tuple(), pfree(), RelationGetDescr, IndexTupleData::t_tid, and values.

Referenced by bthandler().

◆ btmarkpos()

void btmarkpos ( IndexScanDesc  scan)

Definition at line 474 of file nbtree.c.

475{
476 BTScanOpaque so = (BTScanOpaque) scan->opaque;
477
478 /* There may be an old mark with a pin (but no lock). */
480
481 /*
482 * Just record the current itemIndex. If we later step to next page
483 * before releasing the marked position, _bt_steppage makes a full copy of
484 * the currPos struct in markPos. If (as often happens) the mark is moved
485 * before we leave the page, we don't have to do that work.
486 */
487 if (BTScanPosIsValid(so->currPos))
489 else
490 {
492 so->markItemIndex = -1;
493 }
494}

References BTScanPosInvalidate, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanOpaqueData::currPos, BTScanPosData::itemIndex, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, and IndexScanDescData::opaque.

Referenced by bthandler().

◆ btoptions()

bytea * btoptions ( Datum  reloptions,
bool  validate 
)

Definition at line 3696 of file nbtutils.c.

3697{
3698 static const relopt_parse_elt tab[] = {
3699 {"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)},
3700 {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
3701 offsetof(BTOptions, vacuum_cleanup_index_scale_factor)},
3702 {"deduplicate_items", RELOPT_TYPE_BOOL,
3703 offsetof(BTOptions, deduplicate_items)}
3704 };
3705
3706 return (bytea *) build_reloptions(reloptions, validate,
3707 RELOPT_KIND_BTREE,
3708 sizeof(BTOptions),
3709 tab, lengthof(tab));
3710}
#define lengthof(array)
Definition: c.h:759
void * build_reloptions(Datum reloptions, bool validate, relopt_kind kind, Size relopt_struct_size, const relopt_parse_elt *relopt_elems, int num_relopt_elems)
Definition: reloptions.c:1934
@ RELOPT_KIND_BTREE
Definition: reloptions.h:44
@ RELOPT_TYPE_INT
Definition: reloptions.h:32
@ RELOPT_TYPE_BOOL
Definition: reloptions.h:31
@ RELOPT_TYPE_REAL
Definition: reloptions.h:33
Definition: c.h:658

References build_reloptions(), fillfactor, lengthof, RELOPT_KIND_BTREE, RELOPT_TYPE_BOOL, RELOPT_TYPE_INT, RELOPT_TYPE_REAL, and validate().

Referenced by bthandler().
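
btoptions() drives parsing from a static table that maps each option name to a field offset inside the options struct. The standalone sketch below shows the offsetof()-based table idiom with illustrative option names and a simplified element type (no reloptions machinery involved):

    #include <stddef.h>
    #include <stdio.h>

    typedef struct DemoOptions
    {
        int     fillfactor;
        double  cleanup_scale;
    } DemoOptions;

    typedef struct DemoParseElt
    {
        const char *optname;
        size_t      offset;         /* where to store the parsed value */
    } DemoParseElt;

    static const DemoParseElt tab[] = {
        {"fillfactor", offsetof(DemoOptions, fillfactor)},
        {"cleanup_scale", offsetof(DemoOptions, cleanup_scale)},
    };

    int
    main(void)
    {
        for (size_t i = 0; i < sizeof(tab) / sizeof(tab[0]); i++)
            printf("%s -> offset %zu\n", tab[i].optname, tab[i].offset);
        return 0;
    }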

◆ BTPageGetDeleteXid()

static FullTransactionId BTPageGetDeleteXid ( Page  page)
inlinestatic

Definition at line 261 of file nbtree.h.

262{
263 BTPageOpaque opaque;
264 BTDeletedPageData *contents;
265
266 /* We only expect to be called with a deleted page */
267 Assert(!PageIsNew(page));
268 opaque = BTPageGetOpaque(page);
269 Assert(P_ISDELETED(opaque));
270
271 /* pg_upgrade'd deleted page -- must be safe to recycle now */
272 if (!P_HAS_FULLXID(opaque))
273 return FirstNormalFullTransactionId;
274
275 /* Get safexid from deleted page */
276 contents = ((BTDeletedPageData *) PageGetContents(page));
277 return contents->safexid;
278}
static char * PageGetContents(Page page)
Definition: bufpage.h:258
#define P_HAS_FULLXID(opaque)
Definition: nbtree.h:229
FullTransactionId safexid
Definition: nbtree.h:236
#define FirstNormalFullTransactionId
Definition: transam.h:57

References Assert(), BTPageGetOpaque, FirstNormalFullTransactionId, P_HAS_FULLXID, P_ISDELETED, PageGetContents(), PageIsNew(), and BTDeletedPageData::safexid.

Referenced by _bt_allocbuf(), BTPageIsRecyclable(), and GetBTPageStatistics().

◆ BTPageIsRecyclable()

static bool BTPageIsRecyclable ( Page  page,
Relation  heaprel 
)
inlinestatic

Definition at line 292 of file nbtree.h.

293{
294 BTPageOpaque opaque;
295
296 Assert(!PageIsNew(page));
297 Assert(heaprel != NULL);
298
299 /* Recycling okay iff page is deleted and safexid is old enough */
300 opaque = BTPageGetOpaque(page);
301 if (P_ISDELETED(opaque))
302 {
303 FullTransactionId safexid = BTPageGetDeleteXid(page);
304
305 /*
306 * The page was deleted, but when? If it was just deleted, a scan
307 * might have seen the downlink to it, and will read the page later.
308 * As long as that can happen, we must keep the deleted page around as
309 * a tombstone.
310 *
311 * For that check if the deletion XID could still be visible to
312 * anyone. If not, then no scan that's still in progress could have
313 * seen its downlink, and we can recycle it.
314 */
315 return GlobalVisCheckRemovableFullXid(heaprel, safexid);
316 }
317
318 return false;
319}

References Assert(), BTPageGetDeleteXid(), BTPageGetOpaque, GlobalVisCheckRemovableFullXid(), P_ISDELETED, and PageIsNew().

Referenced by _bt_allocbuf(), and btvacuumpage().
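
Conceptually, the check above says: a deleted page may be recycled only once its safexid could no longer be seen as running by any possible scan, i.e. the tombstone must outlive every scan that might still follow a stale downlink. The standalone sketch below reduces that to a comparison against an "oldest still-running XID" horizon; this is a deliberate simplification of what GlobalVisCheckRemovableFullXid() actually computes.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static bool
    page_is_recyclable(bool deleted, uint64_t safexid, uint64_t oldest_running_xid)
    {
        /* recyclable iff deleted and its deletion XID precedes every running XID */
        return deleted && safexid < oldest_running_xid;
    }

    int
    main(void)
    {
        printf("%d\n", page_is_recyclable(true, 1000, 1500));   /* 1: safe to recycle */
        printf("%d\n", page_is_recyclable(true, 2000, 1500));   /* 0: keep the tombstone */
        printf("%d\n", page_is_recyclable(false, 1000, 1500));  /* 0: page not deleted */
        return 0;
    }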

◆ BTPageSetDeleted()

static void BTPageSetDeleted ( Page  page,
FullTransactionId  safexid 
)
inlinestatic

Definition at line 240 of file nbtree.h.

241{
242 BTPageOpaque opaque;
243 PageHeader header;
244 BTDeletedPageData *contents;
245
246 opaque = BTPageGetOpaque(page);
247 header = ((PageHeader) page);
248
249 opaque->btpo_flags &= ~BTP_HALF_DEAD;
250 opaque->btpo_flags |= BTP_DELETED | BTP_HAS_FULLXID;
251 header->pd_lower = MAXALIGN(SizeOfPageHeaderData) +
252 sizeof(BTDeletedPageData);
253 header->pd_upper = header->pd_special;
254
255 /* Set safexid in deleted page */
256 contents = ((BTDeletedPageData *) PageGetContents(page));
257 contents->safexid = safexid;
258}
#define BTP_HAS_FULLXID
Definition: nbtree.h:85
struct BTDeletedPageData BTDeletedPageData
#define BTP_DELETED
Definition: nbtree.h:79
LocationIndex pd_special
Definition: bufpage.h:168
LocationIndex pd_upper
Definition: bufpage.h:167
LocationIndex pd_lower
Definition: bufpage.h:166

References BTP_DELETED, BTP_HAS_FULLXID, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, MAXALIGN, PageGetContents(), PageHeaderData::pd_lower, PageHeaderData::pd_special, PageHeaderData::pd_upper, BTDeletedPageData::safexid, and SizeOfPageHeaderData.

Referenced by _bt_unlink_halfdead_page(), and btree_xlog_unlink_page().

◆ btparallelrescan()

void btparallelrescan ( IndexScanDesc  scan)

Definition at line 741 of file nbtree.c.

742{
743 BTParallelScanDesc btscan;
744 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
745
746 Assert(parallel_scan);
747
748 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
749 parallel_scan->ps_offset_am);
750
751 /*
752 * In theory, we don't need to acquire the LWLock here, because there
753 * shouldn't be any other workers running at this point, but we do so for
754 * consistency.
755 */
760 LWLockRelease(&btscan->btps_lock);
761}

References Assert(), BTPARALLEL_NOT_INITIALIZED, BTParallelScanDescData::btps_lastCurrPage, BTParallelScanDescData::btps_lock, BTParallelScanDescData::btps_nextScanPage, BTParallelScanDescData::btps_pageStatus, InvalidBlockNumber, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), OffsetToPointer, IndexScanDescData::parallel_scan, and ParallelIndexScanDescData::ps_offset_am.

Referenced by bthandler().

◆ btproperty()

bool btproperty ( Oid  index_oid,
int  attno,
IndexAMProperty  prop,
const char *  propname,
bool *  res,
bool *  isnull 
)

Definition at line 3719 of file nbtutils.c.

3722{
3723 switch (prop)
3724 {
3725 case AMPROP_RETURNABLE:
3726 /* answer only for columns, not AM or whole index */
3727 if (attno == 0)
3728 return false;
3729 /* otherwise, btree can always return data */
3730 *res = true;
3731 return true;
3732
3733 default:
3734 return false; /* punt to generic code */
3735 }
3736}
@ AMPROP_RETURNABLE
Definition: amapi.h:45

References AMPROP_RETURNABLE.

Referenced by bthandler().

◆ BTreeShmemInit()

void BTreeShmemInit ( void  )

Definition at line 3668 of file nbtutils.c.

3669{
3670 bool found;
3671
3672 btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State",
3674 &found);
3675
3676 if (!IsUnderPostmaster)
3677 {
3678 /* Initialize shared memory area */
3679 Assert(!found);
3680
3681 /*
3682 * It doesn't really matter what the cycle counter starts at, but
3683 * having it always start the same doesn't seem good. Seed with
3684 * low-order bits of time() instead.
3685 */
3686 btvacinfo->cycle_ctr = (BTCycleId) time(NULL);
3687
3690 }
3691 else
3692 Assert(found);
3693}
bool IsUnderPostmaster
Definition: globals.c:120
int MaxBackends
Definition: globals.c:146
Size BTreeShmemSize(void)
Definition: nbtutils.c:3655
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:385

References Assert(), BTreeShmemSize(), btvacinfo, BTVacInfo::cycle_ctr, IsUnderPostmaster, BTVacInfo::max_vacuums, MaxBackends, BTVacInfo::num_vacuums, and ShmemInitStruct().

Referenced by CreateOrAttachShmemStructs().

◆ BTreeShmemSize()

Size BTreeShmemSize ( void  )

Definition at line 3655 of file nbtutils.c.

3656{
3657 Size size;
3658
3659 size = offsetof(BTVacInfo, vacuums);
3660 size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo)));
3661 return size;
3662}
Size mul_size(Size s1, Size s2)
Definition: shmem.c:508

References add_size(), MaxBackends, and mul_size().

Referenced by BTreeShmemInit(), and CalculateShmemSize().
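
BTreeShmemSize() builds its request out of add_size()/mul_size(), which are overflow-checked versions of + and *. A standalone sketch of that defensive-arithmetic idiom, with made-up component sizes rather than the real offsetof/sizeof values:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    static size_t
    checked_mul(size_t a, size_t b)
    {
        if (b != 0 && a > SIZE_MAX / b)
        {
            fprintf(stderr, "size overflow\n");
            exit(1);
        }
        return a * b;
    }

    static size_t
    checked_add(size_t a, size_t b)
    {
        if (a > SIZE_MAX - b)
        {
            fprintf(stderr, "size overflow\n");
            exit(1);
        }
        return a + b;
    }

    int
    main(void)
    {
        size_t  header = 16;        /* stand-in for offsetof(BTVacInfo, vacuums) */
        size_t  per_entry = 12;     /* stand-in for sizeof(BTOneVacInfo) */
        size_t  max_backends = 100;

        printf("%zu bytes\n",
               checked_add(header, checked_mul(max_backends, per_entry)));
        return 0;
    }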

◆ BTreeTupleGetDownLink()

static BlockNumber BTreeTupleGetDownLink ( IndexTuple  pivot)
inlinestatic

◆ BTreeTupleGetHeapTID()

static ItemPointer BTreeTupleGetHeapTID ( IndexTuple  itup)
inlinestatic

Definition at line 639 of file nbtree.h.

640{
641 if (BTreeTupleIsPivot(itup))
642 {
643 /* Pivot tuple heap TID representation? */
644 if (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
645 BT_PIVOT_HEAP_TID_ATTR)
646 return (ItemPointer) ((char *) itup + IndexTupleSize(itup) -
647 sizeof(ItemPointerData));
648
649 /* Heap TID attribute was truncated */
650 return NULL;
651 }
652 else if (BTreeTupleIsPosting(itup))
653 return BTreeTupleGetPosting(itup);
654
655 return &itup->t_tid;
656}
struct ItemPointerData ItemPointerData

References BT_PIVOT_HEAP_TID_ATTR, BTreeTupleGetPosting(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), IndexTupleSize(), ItemPointerGetOffsetNumberNoCheck(), and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_finish_pending(), _bt_check_natts(), _bt_check_third_page(), _bt_compare(), _bt_delitems_delete_check(), _bt_mkscankey(), _bt_swap_posting(), _bt_truncate(), bt_entry_unique_check(), bt_page_print_tuples(), bt_target_page_check(), BTreeTupleGetHeapTIDCareful(), and BTreeTupleGetPointsToTID().

◆ BTreeTupleGetMaxHeapTID()

static ItemPointer BTreeTupleGetMaxHeapTID ( IndexTuple  itup)
inlinestatic

◆ BTreeTupleGetNPosting()

◆ BTreeTupleGetPosting()

static ItemPointer BTreeTupleGetPosting ( IndexTuple  posting)
inlinestatic

◆ BTreeTupleGetPostingN()

◆ BTreeTupleGetPostingOffset()

◆ BTreeTupleGetTopParent()

static BlockNumber BTreeTupleGetTopParent ( IndexTuple  leafhikey)
inlinestatic

Definition at line 621 of file nbtree.h.

622{
623 return ItemPointerGetBlockNumberNoCheck(&leafhikey->t_tid);
624}

References ItemPointerGetBlockNumberNoCheck(), and IndexTupleData::t_tid.

Referenced by _bt_unlink_halfdead_page(), and bt_downlink_missing_check().

◆ BTreeTupleIsPivot()

◆ BTreeTupleIsPosting()

◆ BTreeTupleSetDownLink()

static void BTreeTupleSetDownLink ( IndexTuple  pivot,
BlockNumber  blkno 
)
inlinestatic

Definition at line 563 of file nbtree.h.

564{
565 ItemPointerSetBlockNumber(&pivot->t_tid, blkno);
566}
static void ItemPointerSetBlockNumber(ItemPointerData *pointer, BlockNumber blockNumber)
Definition: itemptr.h:147

References ItemPointerSetBlockNumber(), and IndexTupleData::t_tid.

Referenced by _bt_buildadd(), _bt_insert_parent(), _bt_mark_page_halfdead(), _bt_newlevel(), _bt_uppershutdown(), and btree_xlog_mark_page_halfdead().

◆ BTreeTupleSetNAtts()

static void BTreeTupleSetNAtts ( IndexTuple  itup,
uint16  nkeyatts,
bool  heaptid 
)
inlinestatic

Definition at line 596 of file nbtree.h.

597{
598 Assert(nkeyatts <= INDEX_MAX_KEYS);
599 Assert((nkeyatts & BT_STATUS_OFFSET_MASK) == 0);
600 Assert(!heaptid || nkeyatts > 0);
601 Assert(!BTreeTupleIsPivot(itup) || nkeyatts == 0);
602
603 itup->t_info |= INDEX_ALT_TID_MASK;
604
605 if (heaptid)
606 nkeyatts |= BT_PIVOT_HEAP_TID_ATTR;
607
608 /* BT_IS_POSTING bit is deliberately unset here */
609 ItemPointerSetOffsetNumber(&itup->t_tid, nkeyatts);
610 Assert(BTreeTupleIsPivot(itup));
611}
#define BT_STATUS_OFFSET_MASK
Definition: nbtree.h:464

References Assert(), BT_PIVOT_HEAP_TID_ATTR, BT_STATUS_OFFSET_MASK, BTreeTupleIsPivot(), INDEX_ALT_TID_MASK, INDEX_MAX_KEYS, ItemPointerSetOffsetNumber(), IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_buildadd(), _bt_newlevel(), _bt_pgaddtup(), _bt_sortaddtup(), _bt_truncate(), and BTreeTupleSetTopParent().
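
BTreeTupleSetNAtts() packs the key attribute count into the low bits of the tuple's offset-number field and reserves a few high bits for status flags. The standalone sketch below shows that packing/unpacking arithmetic; the mask values mirror BT_OFFSET_MASK and BT_STATUS_OFFSET_MASK, but the flag constant is only an illustrative stand-in for BT_PIVOT_HEAP_TID_ATTR.

    #include <stdint.h>
    #include <stdio.h>

    #define DEMO_OFFSET_MASK    0x0FFF  /* low 12 bits: number of key attributes */
    #define DEMO_STATUS_MASK    0xF000  /* high 4 bits: status flags */
    #define DEMO_HEAPTID_FLAG   0x1000  /* hypothetical "has heap TID" flag bit */

    int
    main(void)
    {
        uint16_t    nkeyatts = 3;
        uint16_t    packed = nkeyatts | DEMO_HEAPTID_FLAG;

        printf("natts=%u, has heap TID=%d\n",
               (unsigned) (packed & DEMO_OFFSET_MASK),
               (packed & DEMO_HEAPTID_FLAG) != 0);
        return 0;
    }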

◆ BTreeTupleSetPosting()

static void BTreeTupleSetPosting ( IndexTuple  itup,
uint16  nhtids,
int  postingoffset 
)
inlinestatic

Definition at line 505 of file nbtree.h.

506{
507 Assert(nhtids > 1);
508 Assert((nhtids & BT_STATUS_OFFSET_MASK) == 0);
509 Assert((size_t) postingoffset == MAXALIGN(postingoffset));
510 Assert(postingoffset < INDEX_SIZE_MASK);
511 Assert(!BTreeTupleIsPivot(itup));
512
513 itup->t_info |= INDEX_ALT_TID_MASK;
514 ItemPointerSetOffsetNumber(&itup->t_tid, (nhtids | BT_IS_POSTING));
515 ItemPointerSetBlockNumber(&itup->t_tid, postingoffset);
516}

References Assert(), BT_IS_POSTING, BT_STATUS_OFFSET_MASK, BTreeTupleIsPivot(), INDEX_ALT_TID_MASK, INDEX_SIZE_MASK, ItemPointerSetBlockNumber(), ItemPointerSetOffsetNumber(), MAXALIGN, IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_form_posting(), and _bt_update_posting().

◆ BTreeTupleSetTopParent()

static void BTreeTupleSetTopParent ( IndexTuple  leafhikey,
BlockNumber  blkno 
)
inlinestatic

◆ btrescan()

void btrescan ( IndexScanDesc  scan,
ScanKey  scankey,
int  nscankeys,
ScanKey  orderbys,
int  norderbys 
)

Definition at line 381 of file nbtree.c.

383{
384 BTScanOpaque so = (BTScanOpaque) scan->opaque;
385
386 /* we aren't holding any read locks, but gotta drop the pins */
388 {
389 /* Before leaving current page, deal with any killed items */
390 if (so->numKilled > 0)
391 _bt_killitems(scan);
394 }
395
396 so->markItemIndex = -1;
397 so->needPrimScan = false;
398 so->scanBehind = false;
399 so->oppositeDirCheck = false;
402
403 /*
404 * Allocate tuple workspace arrays, if needed for an index-only scan and
405 * not already done in a previous rescan call. To save on palloc
406 * overhead, both workspaces are allocated as one palloc block; only this
407 * function and btendscan know that.
408 *
409 * NOTE: this data structure also makes it safe to return data from a
410 * "name" column, even though btree name_ops uses an underlying storage
411 * datatype of cstring. The risk there is that "name" is supposed to be
412 * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
413 * However, since we only return data out of tuples sitting in the
414 * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
415 * data out of the markTuples array --- running off the end of memory for
416 * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
417 * adding special-case treatment for name_ops elsewhere.
418 */
419 if (scan->xs_want_itup && so->currTuples == NULL)
420 {
421 so->currTuples = (char *) palloc(BLCKSZ * 2);
422 so->markTuples = so->currTuples + BLCKSZ;
423 }
424
425 /*
426 * Reset the scan keys
427 */
428 if (scankey && scan->numberOfKeys > 0)
429 memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
430 so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
431 so->numArrayKeys = 0; /* ditto */
432}

References _bt_killitems(), BTScanPosInvalidate, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, if(), IndexScanDescData::keyData, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numberOfKeys, IndexScanDescData::numberOfKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, palloc(), BTScanOpaqueData::scanBehind, and IndexScanDescData::xs_want_itup.

Referenced by bthandler().

◆ btrestrpos()

void btrestrpos ( IndexScanDesc  scan)

Definition at line 500 of file nbtree.c.

501{
502 BTScanOpaque so = (BTScanOpaque) scan->opaque;
503
504 if (so->markItemIndex >= 0)
505 {
506 /*
507 * The scan has never moved to a new page since the last mark. Just
508 * restore the itemIndex.
509 *
510 * NB: In this case we can't count on anything in so->markPos to be
511 * accurate.
512 */
514 }
515 else
516 {
517 /*
518 * The scan moved to a new page after last mark or restore, and we are
519 * now restoring to the marked page. We aren't holding any read
520 * locks, but if we're still holding the pin for the current position,
521 * we must drop it.
522 */
523 if (BTScanPosIsValid(so->currPos))
524 {
525 /* Before leaving current page, deal with any killed items */
526 if (so->numKilled > 0)
527 _bt_killitems(scan);
529 }
530
531 if (BTScanPosIsValid(so->markPos))
532 {
533 /* bump pin on mark buffer for assignment to current buffer */
534 if (BTScanPosIsPinned(so->markPos))
536 memcpy(&so->currPos, &so->markPos,
537 offsetof(BTScanPosData, items[1]) +
538 so->markPos.lastItem * sizeof(BTScanPosItem));
539 if (so->currTuples)
540 memcpy(so->currTuples, so->markTuples,
542 /* Reset the scan's array keys (see _bt_steppage for why) */
543 if (so->numArrayKeys)
544 {
546 so->needPrimScan = false;
547 }
548 }
549 else
551 }
552}
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:5335
int nextTupleOffset
Definition: nbtree.h:979

References _bt_killitems(), _bt_start_array_keys(), BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanPosData::buf, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::dir, if(), IncrBufferRefCount(), BTScanPosData::itemIndex, items, BTScanPosData::lastItem, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanOpaqueData::needPrimScan, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numKilled, and IndexScanDescData::opaque.

Referenced by bthandler().
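
The memcpy in btrestrpos() copies only the populated prefix of markPos: the fixed header plus items[0..lastItem], sized as offsetof(..., items[1]) + lastItem * sizeof(item). A standalone sketch of that partial-struct-copy idiom (DemoPos is an illustrative stand-in for BTScanPosData):

    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    typedef struct DemoPos
    {
        int     lastItem;           /* index of the last valid item (inclusive) */
        int     items[8];
    } DemoPos;

    int
    main(void)
    {
        DemoPos mark = {.lastItem = 2, .items = {10, 20, 30, 99, 99, 99, 99, 99}};
        DemoPos cur;

        /* copy the header plus items[0..lastItem] only */
        memcpy(&cur, &mark,
               offsetof(DemoPos, items[1]) + mark.lastItem * sizeof(int));

        printf("%d %d %d\n", cur.items[0], cur.items[1], cur.items[2]);
        return 0;
    }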

◆ bttranslatecmptype()

StrategyNumber bttranslatecmptype ( CompareType  cmptype,
Oid  opfamily 
)

Definition at line 1747 of file nbtree.c.

1748{
1749 switch (cmptype)
1750 {
1751 case COMPARE_LT:
1752 return BTLessStrategyNumber;
1753 case COMPARE_LE:
1754 return BTLessEqualStrategyNumber;
1755 case COMPARE_EQ:
1756 return BTEqualStrategyNumber;
1757 case COMPARE_GE:
1758 return BTGreaterEqualStrategyNumber;
1759 case COMPARE_GT:
1760 return BTGreaterStrategyNumber;
1761 default:
1762 return InvalidStrategy;
1763 }
1764}
@ COMPARE_LE
Definition: cmptype.h:35
@ COMPARE_GT
Definition: cmptype.h:38
@ COMPARE_EQ
Definition: cmptype.h:36
@ COMPARE_GE
Definition: cmptype.h:37
@ COMPARE_LT
Definition: cmptype.h:34

References BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, COMPARE_EQ, COMPARE_GE, COMPARE_GT, COMPARE_LE, COMPARE_LT, and InvalidStrategy.

Referenced by bthandler().

◆ bttranslatestrategy()

CompareType bttranslatestrategy ( StrategyNumber  strategy,
Oid  opfamily 
)

Definition at line 1727 of file nbtree.c.

1728{
1729 switch (strategy)
1730 {
1731 case BTLessStrategyNumber:
1732 return COMPARE_LT;
1733 case BTLessEqualStrategyNumber:
1734 return COMPARE_LE;
1735 case BTEqualStrategyNumber:
1736 return COMPARE_EQ;
1737 case BTGreaterEqualStrategyNumber:
1738 return COMPARE_GE;
1739 case BTGreaterStrategyNumber:
1740 return COMPARE_GT;
1741 default:
1742 return COMPARE_INVALID;
1743 }
1744}
@ COMPARE_INVALID
Definition: cmptype.h:33

References BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, COMPARE_EQ, COMPARE_GE, COMPARE_GT, COMPARE_INVALID, COMPARE_LE, and COMPARE_LT.

Referenced by bthandler().

◆ btvacuumcleanup()

IndexBulkDeleteResult * btvacuumcleanup ( IndexVacuumInfo info,
IndexBulkDeleteResult stats 
)

Definition at line 1063 of file nbtree.c.

1064{
1065 BlockNumber num_delpages;
1066
1067 /* No-op in ANALYZE ONLY mode */
1068 if (info->analyze_only)
1069 return stats;
1070
1071 /*
1072 * If btbulkdelete was called, we need not do anything (we just maintain
1073 * the information used within _bt_vacuum_needs_cleanup() by calling
1074 * _bt_set_cleanup_info() below).
1075 *
1076 * If btbulkdelete was _not_ called, then we have a choice to make: we
1077 * must decide whether or not a btvacuumscan() call is needed now (i.e.
1078 * whether the ongoing VACUUM operation can entirely avoid a physical scan
1079 * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
1080 * now.
1081 */
1082 if (stats == NULL)
1083 {
1084 /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
1085 if (!_bt_vacuum_needs_cleanup(info->index))
1086 return NULL;
1087
1088 /*
1089 * Since we aren't going to actually delete any leaf items, there's no
1090 * need to go through all the vacuum-cycle-ID pushups here.
1091 *
1092 * Posting list tuples are a source of inaccuracy for cleanup-only
1093 * scans. btvacuumscan() will assume that the number of index tuples
1094 * from each page can be used as num_index_tuples, even though
1095 * num_index_tuples is supposed to represent the number of TIDs in the
1096 * index. This naive approach can underestimate the number of tuples
1097 * in the index significantly.
1098 *
1099 * We handle the problem by making num_index_tuples an estimate in
1100 * cleanup-only case.
1101 */
1102 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
1103 btvacuumscan(info, stats, NULL, NULL, 0);
1104 stats->estimated_count = true;
1105 }
1106
1107 /*
1108 * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
1109 *
1110 * num_delpages is the number of deleted pages now in the index that were
1111 * not safe to place in the FSM to be recycled just yet. num_delpages is
1112 * greater than 0 only when _bt_pagedel() actually deleted pages during
1113 * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
1114 * have failed to place any newly deleted pages in the FSM just moments
1115 * ago. (Actually, there are edge cases where recycling of the current
1116 * VACUUM's newly deleted pages does not even become safe by the time the
1117 * next VACUUM comes around. See nbtree/README.)
1118 */
1119 Assert(stats->pages_deleted >= stats->pages_free);
1120 num_delpages = stats->pages_deleted - stats->pages_free;
1121 _bt_set_cleanup_info(info->index, num_delpages);
1122
1123 /*
1124 * It's quite possible for us to be fooled by concurrent page splits into
1125 * double-counting some index tuples, so disbelieve any total that exceeds
1126 * the underlying heap's count ... if we know that accurately. Otherwise
1127 * this might just make matters worse.
1128 */
1129 if (!info->estimated_count)
1130 {
1131 if (stats->num_index_tuples > info->num_heap_tuples)
1132 stats->num_index_tuples = info->num_heap_tuples;
1133 }
1134
1135 return stats;
1136}
void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
Definition: nbtpage.c:232
bool _bt_vacuum_needs_cleanup(Relation rel)
Definition: nbtpage.c:179
BlockNumber pages_deleted
Definition: genam.h:105
double num_index_tuples
Definition: genam.h:102
double num_heap_tuples
Definition: genam.h:75
bool analyze_only
Definition: genam.h:71
bool estimated_count
Definition: genam.h:73

References _bt_set_cleanup_info(), _bt_vacuum_needs_cleanup(), IndexVacuumInfo::analyze_only, Assert(), btvacuumscan(), IndexVacuumInfo::estimated_count, IndexBulkDeleteResult::estimated_count, IndexVacuumInfo::index, IndexVacuumInfo::num_heap_tuples, IndexBulkDeleteResult::num_index_tuples, IndexBulkDeleteResult::pages_deleted, IndexBulkDeleteResult::pages_free, and palloc0().

Referenced by bthandler().

◆ btvalidate()

bool btvalidate ( Oid  opclassoid)

Definition at line 40 of file nbtvalidate.c.

41{
42 bool result = true;
43 HeapTuple classtup;
44 Form_pg_opclass classform;
45 Oid opfamilyoid;
46 Oid opcintype;
47 char *opclassname;
48 char *opfamilyname;
49 CatCList *proclist,
50 *oprlist;
51 List *grouplist;
52 OpFamilyOpFuncGroup *opclassgroup;
53 List *familytypes;
54 int usefulgroups;
55 int i;
56 ListCell *lc;
57
58 /* Fetch opclass information */
59 classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid));
60 if (!HeapTupleIsValid(classtup))
61 elog(ERROR, "cache lookup failed for operator class %u", opclassoid);
62 classform = (Form_pg_opclass) GETSTRUCT(classtup);
63
64 opfamilyoid = classform->opcfamily;
65 opcintype = classform->opcintype;
66 opclassname = NameStr(classform->opcname);
67
68 /* Fetch opfamily information */
69 opfamilyname = get_opfamily_name(opfamilyoid, false);
70
71 /* Fetch all operators and support functions of the opfamily */
72 oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid));
73 proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid));
74
75 /* Check individual support functions */
76 for (i = 0; i < proclist->n_members; i++)
77 {
78 HeapTuple proctup = &proclist->members[i]->tuple;
79 Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup);
80 bool ok;
81
82 /* Check procedure numbers and function signatures */
83 switch (procform->amprocnum)
84 {
85 case BTORDER_PROC:
86 ok = check_amproc_signature(procform->amproc, INT4OID, true,
87 2, 2, procform->amproclefttype,
88 procform->amprocrighttype);
89 break;
91 ok = check_amproc_signature(procform->amproc, VOIDOID, true,
92 1, 1, INTERNALOID);
93 break;
94 case BTINRANGE_PROC:
95 ok = check_amproc_signature(procform->amproc, BOOLOID, true,
96 5, 5,
97 procform->amproclefttype,
98 procform->amproclefttype,
99 procform->amprocrighttype,
100 BOOLOID, BOOLOID);
101 break;
103 ok = check_amproc_signature(procform->amproc, BOOLOID, true,
104 1, 1, OIDOID);
105 break;
106 case BTOPTIONS_PROC:
107 ok = check_amoptsproc_signature(procform->amproc);
108 break;
110 ok = check_amproc_signature(procform->amproc, VOIDOID, true,
111 1, 1, INTERNALOID);
112 break;
113 default:
115 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
116 errmsg("operator family \"%s\" of access method %s contains function %s with invalid support number %d",
117 opfamilyname, "btree",
118 format_procedure(procform->amproc),
119 procform->amprocnum)));
120 result = false;
121 continue; /* don't want additional message */
122 }
123
124 if (!ok)
125 {
127 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
128 errmsg("operator family \"%s\" of access method %s contains function %s with wrong signature for support number %d",
129 opfamilyname, "btree",
130 format_procedure(procform->amproc),
131 procform->amprocnum)));
132 result = false;
133 }
134 }
135
136 /* Check individual operators */
137 for (i = 0; i < oprlist->n_members; i++)
138 {
139 HeapTuple oprtup = &oprlist->members[i]->tuple;
140 Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup);
141
142 /* Check that only allowed strategy numbers exist */
143 if (oprform->amopstrategy < 1 ||
144 oprform->amopstrategy > BTMaxStrategyNumber)
145 {
147 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
148 errmsg("operator family \"%s\" of access method %s contains operator %s with invalid strategy number %d",
149 opfamilyname, "btree",
150 format_operator(oprform->amopopr),
151 oprform->amopstrategy)));
152 result = false;
153 }
154
155 /* btree doesn't support ORDER BY operators */
156 if (oprform->amoppurpose != AMOP_SEARCH ||
157 OidIsValid(oprform->amopsortfamily))
158 {
160 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
161 errmsg("operator family \"%s\" of access method %s contains invalid ORDER BY specification for operator %s",
162 opfamilyname, "btree",
163 format_operator(oprform->amopopr))));
164 result = false;
165 }
166
167 /* Check operator signature --- same for all btree strategies */
168 if (!check_amop_signature(oprform->amopopr, BOOLOID,
169 oprform->amoplefttype,
170 oprform->amoprighttype))
171 {
173 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
174 errmsg("operator family \"%s\" of access method %s contains operator %s with wrong signature",
175 opfamilyname, "btree",
176 format_operator(oprform->amopopr))));
177 result = false;
178 }
179 }
180
181 /* Now check for inconsistent groups of operators/functions */
182 grouplist = identify_opfamily_groups(oprlist, proclist);
183 usefulgroups = 0;
184 opclassgroup = NULL;
185 familytypes = NIL;
186 foreach(lc, grouplist)
187 {
189
190 /*
191 * It is possible for an in_range support function to have a RHS type
192 * that is otherwise irrelevant to the opfamily --- for instance, SQL
193 * requires the datetime_ops opclass to have range support with an
194 * interval offset. So, if this group appears to contain only an
195 * in_range function, ignore it: it doesn't represent a pair of
196 * supported types.
197 */
198 if (thisgroup->operatorset == 0 &&
199 thisgroup->functionset == (1 << BTINRANGE_PROC))
200 continue;
201
202 /* Else count it as a relevant group */
203 usefulgroups++;
204
205 /* Remember the group exactly matching the test opclass */
206 if (thisgroup->lefttype == opcintype &&
207 thisgroup->righttype == opcintype)
208 opclassgroup = thisgroup;
209
210 /*
211 * Identify all distinct data types handled in this opfamily. This
212 * implementation is O(N^2), but there aren't likely to be enough
213 * types in the family for it to matter.
214 */
215 familytypes = list_append_unique_oid(familytypes, thisgroup->lefttype);
216 familytypes = list_append_unique_oid(familytypes, thisgroup->righttype);
217
218 /*
219 * Complain if there seems to be an incomplete set of either operators
220 * or support functions for this datatype pair. The sortsupport,
221 * in_range, and equalimage functions are considered optional.
222 */
223 if (thisgroup->operatorset !=
224 ((1 << BTLessStrategyNumber) |
225 (1 << BTLessEqualStrategyNumber) |
226 (1 << BTEqualStrategyNumber) |
227 (1 << BTGreaterEqualStrategyNumber) |
228 (1 << BTGreaterStrategyNumber)))
229 {
230 ereport(INFO,
231 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
232 errmsg("operator family \"%s\" of access method %s is missing operator(s) for types %s and %s",
233 opfamilyname, "btree",
234 format_type_be(thisgroup->lefttype),
235 format_type_be(thisgroup->righttype))));
236 result = false;
237 }
238 if ((thisgroup->functionset & (1 << BTORDER_PROC)) == 0)
239 {
240 ereport(INFO,
241 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
242 errmsg("operator family \"%s\" of access method %s is missing support function for types %s and %s",
243 opfamilyname, "btree",
244 format_type_be(thisgroup->lefttype),
245 format_type_be(thisgroup->righttype))));
246 result = false;
247 }
248 }
249
250 /* Check that the originally-named opclass is supported */
251 /* (if group is there, we already checked it adequately above) */
252 if (!opclassgroup)
253 {
254 ereport(INFO,
255 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
256 errmsg("operator class \"%s\" of access method %s is missing operator(s)",
257 opclassname, "btree")));
258 result = false;
259 }
260
261 /*
262 * Complain if the opfamily doesn't have entries for all possible
263 * combinations of its supported datatypes. While missing cross-type
264 * operators are not fatal, they do limit the planner's ability to derive
265 * additional qual clauses from equivalence classes, so it seems
266 * reasonable to insist that all built-in btree opfamilies be complete.
267 */
268 if (usefulgroups != (list_length(familytypes) * list_length(familytypes)))
269 {
270 ereport(INFO,
271 (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
272 errmsg("operator family \"%s\" of access method %s is missing cross-type operator(s)",
273 opfamilyname, "btree")));
274 result = false;
275 }
276
277 ReleaseCatCacheList(proclist);
278 ReleaseCatCacheList(oprlist);
279 ReleaseSysCache(classtup);
280
281 return result;
282}
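
The completeness test near the end of the listing (source lines 223-236) builds a bitmask of the five btree strategy numbers and compares it against each group's operatorset. The following standalone sketch shows the same bitmask logic in isolation; the strategy-number values are the ones PostgreSQL defines in access/stratnum.h, but SampleGroup is a simplified stand-in for the real OpFamilyOpFuncGroup, not the actual struct.

#include <stdbool.h>
#include <stdio.h>

/* btree strategy numbers, as defined in access/stratnum.h */
#define BTLessStrategyNumber         1
#define BTLessEqualStrategyNumber    2
#define BTEqualStrategyNumber        3
#define BTGreaterEqualStrategyNumber 4
#define BTGreaterStrategyNumber      5

/* Simplified stand-in for one lefttype/righttype group of an opfamily */
typedef struct
{
    unsigned int operatorset;   /* bit n set => an operator with strategy n exists */
} SampleGroup;

/* True if the group provides all five comparison operators */
static bool
group_is_complete(const SampleGroup *g)
{
    unsigned int required = (1 << BTLessStrategyNumber) |
                            (1 << BTLessEqualStrategyNumber) |
                            (1 << BTEqualStrategyNumber) |
                            (1 << BTGreaterEqualStrategyNumber) |
                            (1 << BTGreaterStrategyNumber);

    return g->operatorset == required;
}

int
main(void)
{
    SampleGroup partial = { .operatorset = (1 << BTEqualStrategyNumber) };
    SampleGroup full = { .operatorset = 0x3E };     /* bits 1..5 set */

    printf("partial complete? %d\n", group_is_complete(&partial));  /* prints 0 */
    printf("full complete?    %d\n", group_is_complete(&full));     /* prints 1 */
    return 0;
}

In btvalidate() the same comparison is used in the opposite direction: any group whose operatorset differs from the required mask triggers the "missing operator(s)" INFO message and makes the validation result false.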
bool check_amproc_signature(Oid funcid, Oid restype, bool exact, int minargs, int maxargs,...)
Definition: amvalidate.c:152
bool check_amop_signature(Oid opno, Oid restype, Oid lefttype, Oid righttype)
Definition: amvalidate.c:206
List * identify_opfamily_groups(CatCList *oprlist, CatCList *proclist)
Definition: amvalidate.c:43
bool check_amoptsproc_signature(Oid funcid)
Definition: amvalidate.c:192
#define NameStr(name)
Definition: c.h:717
void ReleaseCatCacheList(CatCList *list)
Definition: catcache.c:2071
#define INFO
Definition: elog.h:34
char * format_type_be(Oid type_oid)
Definition: format_type.c:343
#define HeapTupleIsValid(tuple)
Definition: htup.h:78
static void * GETSTRUCT(const HeapTupleData *tuple)
Definition: htup_details.h:728
List * list_append_unique_oid(List *list, Oid datum)
Definition: list.c:1380
char * get_opfamily_name(Oid opfid, bool missing_ok)
Definition: lsyscache.c:1393
#define BTSKIPSUPPORT_PROC
Definition: nbtree.h:722
#define BTSORTSUPPORT_PROC
Definition: nbtree.h:718
#define BTINRANGE_PROC
Definition: nbtree.h:719
#define BTOPTIONS_PROC
Definition: nbtree.h:721
FormData_pg_amop * Form_pg_amop
Definition: pg_amop.h:88
FormData_pg_amproc * Form_pg_amproc
Definition: pg_amproc.h:68
static int list_length(const List *l)
Definition: pg_list.h:152
#define NIL
Definition: pg_list.h:68
FormData_pg_opclass * Form_pg_opclass
Definition: pg_opclass.h:83
char * format_procedure(Oid procedure_oid)
Definition: regproc.c:299
char * format_operator(Oid operator_oid)
Definition: regproc.c:793
Definition: pg_list.h:54
CatCTup * members[FLEXIBLE_ARRAY_MEMBER]
Definition: catcache.h:180
int n_members
Definition: catcache.h:178
HeapTupleData tuple
Definition: catcache.h:123
void ReleaseSysCache(HeapTuple tuple)
Definition: syscache.c:269
HeapTuple SearchSysCache1(int cacheId, Datum key1)
Definition: syscache.c:221
#define SearchSysCacheList1(cacheId, key1)
Definition: syscache.h:127

References BTEQUALIMAGE_PROC, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTINRANGE_PROC, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTMaxStrategyNumber, BTOPTIONS_PROC, BTORDER_PROC, BTSKIPSUPPORT_PROC, BTSORTSUPPORT_PROC, check_amop_signature(), check_amoptsproc_signature(), check_amproc_signature(), elog, ereport, errcode(), errmsg(), ERROR, format_operator(), format_procedure(), format_type_be(), OpFamilyOpFuncGroup::functionset, get_opfamily_name(), GETSTRUCT(), HeapTupleIsValid, i, identify_opfamily_groups(), INFO, OpFamilyOpFuncGroup::lefttype, lfirst, list_append_unique_oid(), list_length(), catclist::members, catclist::n_members, NameStr, NIL, ObjectIdGetDatum(), OidIsValid, OpFamilyOpFuncGroup::operatorset, ReleaseCatCacheList(), ReleaseSysCache(), OpFamilyOpFuncGroup::righttype, SearchSysCache1(), SearchSysCacheList1, and catctup::tuple.

Referenced by bthandler().
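
As the cross-reference notes, btvalidate() is installed by bthandler(). The abridged sketch below, modeled on nbtree.c's bthandler(), shows how the validate callback is wired into the access method's IndexAmRoutine; the function name bthandler_sketch is illustrative, only a few of the routine's fields are shown, and the real handler fills in every remaining callback and capability flag.

#include "postgres.h"

#include "access/amapi.h"
#include "access/nbtree.h"
#include "access/stratnum.h"
#include "fmgr.h"
#include "nodes/nodes.h"

/*
 * Abridged handler sketch: build an IndexAmRoutine and point its
 * amvalidate field at btvalidate so that operator class/family
 * validation runs the checks documented above.
 */
Datum
bthandler_sketch(PG_FUNCTION_ARGS)
{
    IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

    amroutine->amstrategies = BTMaxStrategyNumber;  /* five search strategies */
    amroutine->amsupport = BTNProcs;                /* support function slots */
    amroutine->amcanorder = true;                   /* btree provides ordering */

    amroutine->ambuild = btbuild;
    amroutine->aminsert = btinsert;
    amroutine->amvalidate = btvalidate;             /* the function documented here */
    /* ... the real bthandler() sets every remaining callback ... */

    PG_RETURN_POINTER(amroutine);
}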

◆ StaticAssertDecl()

StaticAssertDecl ( BT_OFFSET_MASK >= INDEX_MAX_KEYS,
"BT_OFFSET_MASK can't fit INDEX_MAX_KEYS"
)
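
This compile-time assertion guards the packing scheme used by nbtree's tuple format: the low bits of an index tuple's t_tid offset field (masked by BT_OFFSET_MASK) must be wide enough to hold a key attribute count up to INDEX_MAX_KEYS. The minimal standalone illustration below assumes the mask values from this header and the default INDEX_MAX_KEYS of 32; the pack/unpack helpers are simplified stand-ins, not the real BTreeTupleSetNAtts/BTreeTupleGetNAtts.

#include <stdint.h>
#include <stdio.h>

/* Values assumed from a default build */
#define BT_OFFSET_MASK          0x0FFF
#define BT_STATUS_OFFSET_MASK   0xF000
#define INDEX_MAX_KEYS          32

/* Compile-time equivalent of the StaticAssertDecl above */
_Static_assert(BT_OFFSET_MASK >= INDEX_MAX_KEYS,
               "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS");

/* Store an attribute count in the low 12 bits, preserving the status bits */
static uint16_t
set_natts(uint16_t offset_field, uint16_t natts)
{
    return (uint16_t) ((offset_field & BT_STATUS_OFFSET_MASK) |
                       (natts & BT_OFFSET_MASK));
}

/* Extract the attribute count again */
static uint16_t
get_natts(uint16_t offset_field)
{
    return (uint16_t) (offset_field & BT_OFFSET_MASK);
}

int
main(void)
{
    uint16_t field = 0;

    field = set_natts(field, INDEX_MAX_KEYS);
    printf("stored natts = %u\n", get_natts(field));    /* prints 32 */
    return 0;
}

If INDEX_MAX_KEYS were ever raised past 4095, the assertion would fail at compile time rather than letting the attribute count silently overflow into the status bits.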