PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
nbtree.h File Reference
#include "access/amapi.h"
#include "access/itup.h"
#include "access/sdir.h"
#include "access/xlogreader.h"
#include "catalog/pg_index.h"
#include "lib/stringinfo.h"
#include "storage/bufmgr.h"
Include dependency graph for nbtree.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures

struct  BTPageOpaqueData
 
struct  BTMetaPageData
 
struct  BTStackData
 
struct  BTScanPosItem
 
struct  BTScanPosData
 
struct  BTArrayKeyInfo
 
struct  BTScanOpaqueData
 

Macros

#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */
 
#define BTP_ROOT   (1 << 1) /* root page (has no parent) */
 
#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */
 
#define BTP_META   (1 << 3) /* meta-page */
 
#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */
 
#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */
 
#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples */
 
#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */
 
#define MAX_BT_CYCLE_ID   0xFF7F
 
#define BTPageGetMeta(p)   ((BTMetaPageData *) PageGetContents(p))
 
#define BTREE_METAPAGE   0 /* first page is meta */
 
#define BTREE_MAGIC   0x053162 /* magic number of btree pages */
 
#define BTREE_VERSION   2 /* current version number */
 
#define BTMaxItemSize(page)
 
#define BTREE_MIN_FILLFACTOR   10
 
#define BTREE_DEFAULT_FILLFACTOR   90
 
#define BTREE_NONLEAF_FILLFACTOR   70
 
#define BTTidSame(i1, i2)
 
#define BTEntrySame(i1, i2)   BTTidSame((i1)->t_tid, (i2)->t_tid)
 
#define P_NONE   0
 
#define P_LEFTMOST(opaque)   ((opaque)->btpo_prev == P_NONE)
 
#define P_RIGHTMOST(opaque)   ((opaque)->btpo_next == P_NONE)
 
#define P_ISLEAF(opaque)   ((opaque)->btpo_flags & BTP_LEAF)
 
#define P_ISROOT(opaque)   ((opaque)->btpo_flags & BTP_ROOT)
 
#define P_ISDELETED(opaque)   ((opaque)->btpo_flags & BTP_DELETED)
 
#define P_ISHALFDEAD(opaque)   ((opaque)->btpo_flags & BTP_HALF_DEAD)
 
#define P_IGNORE(opaque)   ((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD))
 
#define P_HAS_GARBAGE(opaque)   ((opaque)->btpo_flags & BTP_HAS_GARBAGE)
 
#define P_INCOMPLETE_SPLIT(opaque)   ((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT)
 
#define P_HIKEY   ((OffsetNumber) 1)
 
#define P_FIRSTKEY   ((OffsetNumber) 2)
 
#define P_FIRSTDATAKEY(opaque)   (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
 
#define BTCommuteStrategyNumber(strat)   (BTMaxStrategyNumber + 1 - (strat))
 
#define BTORDER_PROC   1
 
#define BTSORTSUPPORT_PROC   2
 
#define BTNProcs   2
 
#define BT_READ   BUFFER_LOCK_SHARE
 
#define BT_WRITE   BUFFER_LOCK_EXCLUSIVE
 
#define BTScanPosIsPinned(scanpos)
 
#define BTScanPosUnpin(scanpos)
 
#define BTScanPosUnpinIfPinned(scanpos)
 
#define BTScanPosIsValid(scanpos)
 
#define BTScanPosInvalidate(scanpos)
 
#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */
 
#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */
 
#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */
 
#define SK_BT_DESC   (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
 
#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
 

Typedefs

typedef uint16 BTCycleId
 
typedef struct BTPageOpaqueData BTPageOpaqueData
 
typedef BTPageOpaqueDataBTPageOpaque
 
typedef struct BTMetaPageData BTMetaPageData
 
typedef struct BTStackData BTStackData
 
typedef BTStackDataBTStack
 
typedef struct BTScanPosItem BTScanPosItem
 
typedef struct BTScanPosData BTScanPosData
 
typedef BTScanPosDataBTScanPos
 
typedef struct BTArrayKeyInfo BTArrayKeyInfo
 
typedef struct BTScanOpaqueData BTScanOpaqueData
 
typedef BTScanOpaqueDataBTScanOpaque
 
typedef struct BTSpool BTSpool
 

Functions

IndexBuildResultbtbuild (Relation heap, Relation index, struct IndexInfo *indexInfo)
 
void btbuildempty (Relation index)
 
bool btinsert (Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, struct IndexInfo *indexInfo)
 
IndexScanDesc btbeginscan (Relation rel, int nkeys, int norderbys)
 
Size btestimateparallelscan (void)
 
void btinitparallelscan (void *target)
 
bool btgettuple (IndexScanDesc scan, ScanDirection dir)
 
int64 btgetbitmap (IndexScanDesc scan, TIDBitmap *tbm)
 
void btrescan (IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
 
void btparallelrescan (IndexScanDesc scan)
 
void btendscan (IndexScanDesc scan)
 
void btmarkpos (IndexScanDesc scan)
 
void btrestrpos (IndexScanDesc scan)
 
IndexBulkDeleteResultbtbulkdelete (IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
 
IndexBulkDeleteResultbtvacuumcleanup (IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
bool btcanreturn (Relation index, int attno)
 
bool _bt_parallel_seize (IndexScanDesc scan, BlockNumber *pageno)
 
void _bt_parallel_release (IndexScanDesc scan, BlockNumber scan_page)
 
void _bt_parallel_done (IndexScanDesc scan)
 
void _bt_parallel_advance_array_keys (IndexScanDesc scan)
 
bool _bt_doinsert (Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, Relation heapRel)
 
Buffer _bt_getstackbuf (Relation rel, BTStack stack, int access)
 
void _bt_finish_split (Relation rel, Buffer bbuf, BTStack stack)
 
void _bt_initmetapage (Page page, BlockNumber rootbknum, uint32 level)
 
Buffer _bt_getroot (Relation rel, int access)
 
Buffer _bt_gettrueroot (Relation rel)
 
int _bt_getrootheight (Relation rel)
 
void _bt_checkpage (Relation rel, Buffer buf)
 
Buffer _bt_getbuf (Relation rel, BlockNumber blkno, int access)
 
Buffer _bt_relandgetbuf (Relation rel, Buffer obuf, BlockNumber blkno, int access)
 
void _bt_relbuf (Relation rel, Buffer buf)
 
void _bt_pageinit (Page page, Size size)
 
bool _bt_page_recyclable (Page page)
 
void _bt_delitems_delete (Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, Relation heapRel)
 
void _bt_delitems_vacuum (Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, BlockNumber lastBlockVacuumed)
 
int _bt_pagedel (Relation rel, Buffer buf)
 
BTStack _bt_search (Relation rel, int keysz, ScanKey scankey, bool nextkey, Buffer *bufP, int access, Snapshot snapshot)
 
Buffer _bt_moveright (Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey, bool forupdate, BTStack stack, int access, Snapshot snapshot)
 
OffsetNumber _bt_binsrch (Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey)
 
int32 _bt_compare (Relation rel, int keysz, ScanKey scankey, Page page, OffsetNumber offnum)
 
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
 
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost, Snapshot snapshot)
 
ScanKey _bt_mkscankey (Relation rel, IndexTuple itup)
 
ScanKey _bt_mkscankey_nodata (Relation rel)
 
void _bt_freeskey (ScanKey skey)
 
void _bt_freestack (BTStack stack)
 
void _bt_preprocess_array_keys (IndexScanDesc scan)
 
void _bt_start_array_keys (IndexScanDesc scan, ScanDirection dir)
 
bool _bt_advance_array_keys (IndexScanDesc scan, ScanDirection dir)
 
void _bt_mark_array_keys (IndexScanDesc scan)
 
void _bt_restore_array_keys (IndexScanDesc scan)
 
void _bt_preprocess_keys (IndexScanDesc scan)
 
IndexTuple _bt_checkkeys (IndexScanDesc scan, Page page, OffsetNumber offnum, ScanDirection dir, bool *continuescan)
 
void _bt_killitems (IndexScanDesc scan)
 
BTCycleId _bt_vacuum_cycleid (Relation rel)
 
BTCycleId _bt_start_vacuum (Relation rel)
 
void _bt_end_vacuum (Relation rel)
 
void _bt_end_vacuum_callback (int code, Datum arg)
 
Size BTreeShmemSize (void)
 
void BTreeShmemInit (void)
 
byteabtoptions (Datum reloptions, bool validate)
 
bool btproperty (Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull)
 
bool btvalidate (Oid opclassoid)
 
BTSpool_bt_spoolinit (Relation heap, Relation index, bool isunique, bool isdead)
 
void _bt_spooldestroy (BTSpool *btspool)
 
void _bt_spool (BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
 
void _bt_leafbuild (BTSpool *btspool, BTSpool *spool2)
 

Macro Definition Documentation

#define BTCommuteStrategyNumber (   strat)    (BTMaxStrategyNumber + 1 - (strat))

Definition at line 214 of file nbtree.h.

Referenced by _bt_compare_scankey_args(), and _bt_fix_scankey_strategy().

#define BTEntrySame (   i1,
  i2 
)    BTTidSame((i1)->t_tid, (i2)->t_tid)

Definition at line 156 of file nbtree.h.

Referenced by _bt_getstackbuf().

#define BTMaxItemSize (   page)
Value:
MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
#define SizeOfPageHeaderData
Definition: bufpage.h:213
#define PageGetPageSize(page)
Definition: bufpage.h:265
#define MAXALIGN(LEN)
Definition: c.h:588
#define MAXALIGN_DOWN(LEN)
Definition: c.h:600

Definition at line 119 of file nbtree.h.

Referenced by _bt_buildadd(), and _bt_findinsertloc().

#define BTNProcs   2

Definition at line 230 of file nbtree.h.

Referenced by bthandler().

#define BTP_DELETED   (1 << 2) /* page has been deleted from tree */

Definition at line 72 of file nbtree.h.

Referenced by _bt_unlink_halfdead_page(), btree_xlog_unlink_page(), and pgstat_btree_page().

#define BTP_HALF_DEAD   (1 << 4) /* empty, but still in tree */
#define BTP_HAS_GARBAGE   (1 << 6) /* page has LP_DEAD tuples */
#define BTP_INCOMPLETE_SPLIT   (1 << 7) /* right sibling's downlink is missing */
#define BTP_LEAF   (1 << 0) /* leaf page, i.e. not internal page */
#define BTP_META   (1 << 3) /* meta-page */
#define BTP_ROOT   (1 << 1) /* root page (has no parent) */

Definition at line 71 of file nbtree.h.

Referenced by _bt_getroot(), _bt_newroot(), _bt_split(), _bt_uppershutdown(), and btree_xlog_newroot().

#define BTP_SPLIT_END   (1 << 5) /* rightmost page of split group */

Definition at line 75 of file nbtree.h.

Referenced by _bt_split(), btree_mask(), and btvacuumpage().

#define BTREE_DEFAULT_FILLFACTOR   90

Definition at line 132 of file nbtree.h.

Referenced by _bt_findsplitloc(), and _bt_pagestate().

#define BTREE_MAGIC   0x053162 /* magic number of btree pages */
#define BTREE_MIN_FILLFACTOR   10

Definition at line 131 of file nbtree.h.

#define BTREE_NONLEAF_FILLFACTOR   70

Definition at line 133 of file nbtree.h.

Referenced by _bt_findsplitloc(), and _bt_pagestate().

#define BTREE_VERSION   2 /* current version number */
#define BTScanPosInvalidate (   scanpos)
Value:
do { \
(scanpos).currPage = InvalidBlockNumber; \
(scanpos).nextPage = InvalidBlockNumber; \
(scanpos).buf = InvalidBuffer; \
(scanpos).lsn = InvalidXLogRecPtr; \
(scanpos).nextTupleOffset = 0; \
} while (0);
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
#define InvalidBuffer
Definition: buf.h:25
static char * buf
Definition: pg_test_fsync.c:65
#define InvalidBlockNumber
Definition: block.h:33

Definition at line 355 of file nbtree.h.

Referenced by _bt_endpoint(), _bt_first(), _bt_readnextpage(), _bt_steppage(), btbeginscan(), btmarkpos(), btrescan(), and btrestrpos().

#define BTScanPosIsPinned (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BufferIsValid((scanpos).buf) \
)
#define AssertMacro(condition)
Definition: c.h:676
static char * buf
Definition: pg_test_fsync.c:65
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
#define BufferIsValid(bufnum)
Definition: bufmgr.h:114

Definition at line 332 of file nbtree.h.

Referenced by _bt_killitems(), _bt_readnextpage(), _bt_readpage(), _bt_steppage(), and btrestrpos().

#define BTScanPosIsValid (   scanpos)
Value:
( \
AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
!BufferIsValid((scanpos).buf)), \
BlockNumberIsValid((scanpos).currPage) \
)
#define AssertMacro(condition)
Definition: c.h:676
static char * buf
Definition: pg_test_fsync.c:65
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
#define BufferIsValid(bufnum)
Definition: bufmgr.h:114

Definition at line 349 of file nbtree.h.

Referenced by _bt_first(), _bt_killitems(), _bt_steppage(), btendscan(), btgettuple(), btmarkpos(), btrescan(), and btrestrpos().

#define BTScanPosUnpin (   scanpos)
Value:
do { \
ReleaseBuffer((scanpos).buf); \
(scanpos).buf = InvalidBuffer; \
} while (0)
#define InvalidBuffer
Definition: buf.h:25
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3309
static char * buf
Definition: pg_test_fsync.c:65

Definition at line 338 of file nbtree.h.

#define BTScanPosUnpinIfPinned (   scanpos)
Value:
do { \
if (BTScanPosIsPinned(scanpos)) \
BTScanPosUnpin(scanpos); \
} while (0)
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:332
#define BTScanPosUnpin(scanpos)
Definition: nbtree.h:338

Definition at line 343 of file nbtree.h.

Referenced by _bt_readnextpage(), _bt_steppage(), btendscan(), btmarkpos(), btrescan(), and btrestrpos().

#define BTSORTSUPPORT_PROC   2

Definition at line 229 of file nbtree.h.

Referenced by assignProcTypes(), btvalidate(), FinishSortSupportFunction(), and MJExamineQuals().

#define BTTidSame (   i1,
  i2 
)
Value:
#define ItemPointerGetOffsetNumber(pointer)
Definition: itemptr.h:94
#define ItemPointerGetBlockNumber(pointer)
Definition: itemptr.h:75

Definition at line 153 of file nbtree.h.

#define MAX_BT_CYCLE_ID   0xFF7F

Definition at line 86 of file nbtree.h.

Referenced by _bt_start_vacuum().

#define P_FIRSTKEY   ((OffsetNumber) 2)

Definition at line 203 of file nbtree.h.

Referenced by _bt_buildadd(), _bt_newroot(), _bt_slideleft(), and _bt_sortaddtup().

#define P_HAS_GARBAGE (   opaque)    ((opaque)->btpo_flags & BTP_HAS_GARBAGE)

Definition at line 181 of file nbtree.h.

Referenced by _bt_findinsertloc(), and palloc_btree_page().

#define P_INCOMPLETE_SPLIT (   opaque)    ((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT)
#define P_ISHALFDEAD (   opaque)    ((opaque)->btpo_flags & BTP_HALF_DEAD)
#define P_ISROOT (   opaque)    ((opaque)->btpo_flags & BTP_ROOT)
#define P_LEFTMOST (   opaque)    ((opaque)->btpo_prev == P_NONE)
#define SK_BT_INDOPTION_SHIFT   24 /* must clear the above bits */

Definition at line 425 of file nbtree.h.

Referenced by _bt_fix_scankey_strategy(), _bt_mkscankey(), and _bt_mkscankey_nodata().

#define SK_BT_REQBKWD   0x00020000 /* required to continue backward scan */

Definition at line 424 of file nbtree.h.

Referenced by _bt_check_rowcompare(), _bt_checkkeys(), and _bt_mark_scankey_required().

#define SK_BT_REQFWD   0x00010000 /* required to continue forward scan */

Definition at line 423 of file nbtree.h.

Referenced by _bt_check_rowcompare(), _bt_checkkeys(), and _bt_mark_scankey_required().

Typedef Documentation

Definition at line 26 of file nbtree.h.

Definition at line 67 of file nbtree.h.

Definition at line 416 of file nbtree.h.

Definition at line 330 of file nbtree.h.

Definition at line 549 of file nbtree.h.

Definition at line 258 of file nbtree.h.

Function Documentation

bool _bt_advance_array_keys ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 547 of file nbtutils.c.

References _bt_parallel_advance_array_keys(), BTScanOpaqueData::arrayKeyData, BTScanOpaqueData::arrayKeys, BTArrayKeyInfo::cur_elem, BTArrayKeyInfo::elem_values, i, NULL, BTArrayKeyInfo::num_elems, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, BTArrayKeyInfo::scan_key, ScanDirectionIsBackward, and ScanKeyData::sk_argument.

Referenced by btgetbitmap(), and btgettuple().

548 {
549  BTScanOpaque so = (BTScanOpaque) scan->opaque;
550  bool found = false;
551  int i;
552 
553  /*
554  * We must advance the last array key most quickly, since it will
555  * correspond to the lowest-order index column among the available
556  * qualifications. This is necessary to ensure correct ordering of output
557  * when there are multiple array keys.
558  */
559  for (i = so->numArrayKeys - 1; i >= 0; i--)
560  {
561  BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
562  ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key];
563  int cur_elem = curArrayKey->cur_elem;
564  int num_elems = curArrayKey->num_elems;
565 
566  if (ScanDirectionIsBackward(dir))
567  {
568  if (--cur_elem < 0)
569  {
570  cur_elem = num_elems - 1;
571  found = false; /* need to advance next array key */
572  }
573  else
574  found = true;
575  }
576  else
577  {
578  if (++cur_elem >= num_elems)
579  {
580  cur_elem = 0;
581  found = false; /* need to advance next array key */
582  }
583  else
584  found = true;
585  }
586 
587  curArrayKey->cur_elem = cur_elem;
588  skey->sk_argument = curArrayKey->elem_values[cur_elem];
589  if (found)
590  break;
591  }
592 
593  /* advance parallel scan */
594  if (scan->parallel_scan != NULL)
596 
597  return found;
598 }
ParallelIndexScanDesc parallel_scan
Definition: relscan.h:139
Datum * elem_values
Definition: nbtree.h:371
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:416
int cur_elem
Definition: nbtree.h:368
int numArrayKeys
Definition: nbtree.h:383
ScanKey arrayKeyData
Definition: nbtree.h:382
int num_elems
Definition: nbtree.h:370
#define NULL
Definition: c.h:229
void _bt_parallel_advance_array_keys(IndexScanDesc scan)
Definition: nbtree.c:889
BTArrayKeyInfo * arrayKeys
Definition: nbtree.h:387
int i
Datum sk_argument
Definition: skey.h:72
int scan_key
Definition: nbtree.h:367
OffsetNumber _bt_binsrch ( Relation  rel,
Buffer  buf,
int  keysz,
ScanKey  scankey,
bool  nextkey 
)

Definition at line 323 of file nbtsearch.c.

References _bt_compare(), Assert, BufferGetPage, OffsetNumberPrev, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber, PageGetSpecialPointer, and result.

Referenced by _bt_doinsert(), _bt_findinsertloc(), _bt_first(), and _bt_search().

328 {
329  Page page;
330  BTPageOpaque opaque;
331  OffsetNumber low,
332  high;
333  int32 result,
334  cmpval;
335 
336  page = BufferGetPage(buf);
337  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
338 
339  low = P_FIRSTDATAKEY(opaque);
340  high = PageGetMaxOffsetNumber(page);
341 
342  /*
343  * If there are no keys on the page, return the first available slot. Note
344  * this covers two cases: the page is really empty (no keys), or it
345  * contains only a high key. The latter case is possible after vacuuming.
346  * This can never happen on an internal page, however, since they are
347  * never empty (an internal page must have children).
348  */
349  if (high < low)
350  return low;
351 
352  /*
353  * Binary search to find the first key on the page >= scan key, or first
354  * key > scankey when nextkey is true.
355  *
356  * For nextkey=false (cmpval=1), the loop invariant is: all slots before
357  * 'low' are < scan key, all slots at or after 'high' are >= scan key.
358  *
359  * For nextkey=true (cmpval=0), the loop invariant is: all slots before
360  * 'low' are <= scan key, all slots at or after 'high' are > scan key.
361  *
362  * We can fall out when high == low.
363  */
364  high++; /* establish the loop invariant for high */
365 
366  cmpval = nextkey ? 0 : 1; /* select comparison value */
367 
368  while (high > low)
369  {
370  OffsetNumber mid = low + ((high - low) / 2);
371 
372  /* We have low <= mid < high, so mid points at a real slot */
373 
374  result = _bt_compare(rel, keysz, scankey, page, mid);
375 
376  if (result >= cmpval)
377  low = mid + 1;
378  else
379  high = mid;
380  }
381 
382  /*
383  * At this point we have high == low, but be careful: they could point
384  * past the last slot on the page.
385  *
386  * On a leaf page, we always return the first key >= scan key (resp. >
387  * scan key), which could be the last slot + 1.
388  */
389  if (P_ISLEAF(opaque))
390  return low;
391 
392  /*
393  * On a non-leaf page, return the last key < scan key (resp. <= scan key).
394  * There must be one if _bt_compare() is playing by the rules.
395  */
396  Assert(low > P_FIRSTDATAKEY(opaque));
397 
398  return OffsetNumberPrev(low);
399 }
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:204
return result
Definition: formatting.c:1618
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:354
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
signed int int32
Definition: c.h:256
uint16 OffsetNumber
Definition: off.h:24
static char * buf
Definition: pg_test_fsync.c:65
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define Assert(condition)
Definition: c.h:675
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:55
int32 _bt_compare(Relation rel, int keysz, ScanKey scankey, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:428
Pointer Page
Definition: bufpage.h:74
#define P_ISLEAF(opaque)
Definition: nbtree.h:176
IndexTuple _bt_checkkeys ( IndexScanDesc  scan,
Page  page,
OffsetNumber  offnum,
ScanDirection  dir,
bool continuescan 
)

Definition at line 1359 of file nbtutils.c.

References _bt_check_rowcompare(), Assert, DatumGetBool, FunctionCall2Coll(), IndexScanDescData::ignore_killed_tuples, index_getattr, IndexScanDescData::indexRelation, ItemIdIsDead, BTScanOpaqueData::keyData, NULL, BTScanOpaqueData::numberOfKeys, IndexScanDescData::opaque, P_FIRSTDATAKEY, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, RelationGetDescr, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_NULLS_FIRST, SK_BT_REQBKWD, SK_BT_REQFWD, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, SK_ISNULL, SK_ROW_HEADER, SK_SEARCHNOTNULL, SK_SEARCHNULL, and test().

Referenced by _bt_readpage().

1362 {
1363  ItemId iid = PageGetItemId(page, offnum);
1364  bool tuple_alive;
1365  IndexTuple tuple;
1366  TupleDesc tupdesc;
1367  BTScanOpaque so;
1368  int keysz;
1369  int ikey;
1370  ScanKey key;
1371 
1372  *continuescan = true; /* default assumption */
1373 
1374  /*
1375  * If the scan specifies not to return killed tuples, then we treat a
1376  * killed tuple as not passing the qual. Most of the time, it's a win to
1377  * not bother examining the tuple's index keys, but just return
1378  * immediately with continuescan = true to proceed to the next tuple.
1379  * However, if this is the last tuple on the page, we should check the
1380  * index keys to prevent uselessly advancing to the next page.
1381  */
1382  if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1383  {
1384  /* return immediately if there are more tuples on the page */
1385  if (ScanDirectionIsForward(dir))
1386  {
1387  if (offnum < PageGetMaxOffsetNumber(page))
1388  return NULL;
1389  }
1390  else
1391  {
1393 
1394  if (offnum > P_FIRSTDATAKEY(opaque))
1395  return NULL;
1396  }
1397 
1398  /*
1399  * OK, we want to check the keys so we can set continuescan correctly,
1400  * but we'll return NULL even if the tuple passes the key tests.
1401  */
1402  tuple_alive = false;
1403  }
1404  else
1405  tuple_alive = true;
1406 
1407  tuple = (IndexTuple) PageGetItem(page, iid);
1408 
1409  tupdesc = RelationGetDescr(scan->indexRelation);
1410  so = (BTScanOpaque) scan->opaque;
1411  keysz = so->numberOfKeys;
1412 
1413  for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
1414  {
1415  Datum datum;
1416  bool isNull;
1417  Datum test;
1418 
1419  /* row-comparison keys need special processing */
1420  if (key->sk_flags & SK_ROW_HEADER)
1421  {
1422  if (_bt_check_rowcompare(key, tuple, tupdesc, dir, continuescan))
1423  continue;
1424  return NULL;
1425  }
1426 
1427  datum = index_getattr(tuple,
1428  key->sk_attno,
1429  tupdesc,
1430  &isNull);
1431 
1432  if (key->sk_flags & SK_ISNULL)
1433  {
1434  /* Handle IS NULL/NOT NULL tests */
1435  if (key->sk_flags & SK_SEARCHNULL)
1436  {
1437  if (isNull)
1438  continue; /* tuple satisfies this qual */
1439  }
1440  else
1441  {
1443  if (!isNull)
1444  continue; /* tuple satisfies this qual */
1445  }
1446 
1447  /*
1448  * Tuple fails this qual. If it's a required qual for the current
1449  * scan direction, then we can conclude no further tuples will
1450  * pass, either.
1451  */
1452  if ((key->sk_flags & SK_BT_REQFWD) &&
1454  *continuescan = false;
1455  else if ((key->sk_flags & SK_BT_REQBKWD) &&
1457  *continuescan = false;
1458 
1459  /*
1460  * In any case, this indextuple doesn't match the qual.
1461  */
1462  return NULL;
1463  }
1464 
1465  if (isNull)
1466  {
1467  if (key->sk_flags & SK_BT_NULLS_FIRST)
1468  {
1469  /*
1470  * Since NULLs are sorted before non-NULLs, we know we have
1471  * reached the lower limit of the range of values for this
1472  * index attr. On a backward scan, we can stop if this qual
1473  * is one of the "must match" subset. We can stop regardless
1474  * of whether the qual is > or <, so long as it's required,
1475  * because it's not possible for any future tuples to pass. On
1476  * a forward scan, however, we must keep going, because we may
1477  * have initially positioned to the start of the index.
1478  */
1479  if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
1481  *continuescan = false;
1482  }
1483  else
1484  {
1485  /*
1486  * Since NULLs are sorted after non-NULLs, we know we have
1487  * reached the upper limit of the range of values for this
1488  * index attr. On a forward scan, we can stop if this qual is
1489  * one of the "must match" subset. We can stop regardless of
1490  * whether the qual is > or <, so long as it's required,
1491  * because it's not possible for any future tuples to pass. On
1492  * a backward scan, however, we must keep going, because we
1493  * may have initially positioned to the end of the index.
1494  */
1495  if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
1497  *continuescan = false;
1498  }
1499 
1500  /*
1501  * In any case, this indextuple doesn't match the qual.
1502  */
1503  return NULL;
1504  }
1505 
1506  test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
1507  datum, key->sk_argument);
1508 
1509  if (!DatumGetBool(test))
1510  {
1511  /*
1512  * Tuple fails this qual. If it's a required qual for the current
1513  * scan direction, then we can conclude no further tuples will
1514  * pass, either.
1515  *
1516  * Note: because we stop the scan as soon as any required equality
1517  * qual fails, it is critical that equality quals be used for the
1518  * initial positioning in _bt_first() when they are available. See
1519  * comments in _bt_first().
1520  */
1521  if ((key->sk_flags & SK_BT_REQFWD) &&
1523  *continuescan = false;
1524  else if ((key->sk_flags & SK_BT_REQBKWD) &&
1526  *continuescan = false;
1527 
1528  /*
1529  * In any case, this indextuple doesn't match the qual.
1530  */
1531  return NULL;
1532  }
1533  }
1534 
1535  /* Check for failure due to it being a killed tuple. */
1536  if (!tuple_alive)
1537  return NULL;
1538 
1539  /* If we get here, the tuple passes all index quals. */
1540  return tuple;
1541 }
static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc, ScanDirection dir, bool *continuescan)
Definition: nbtutils.c:1553
static void test(void)
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
#define RelationGetDescr(relation)
Definition: rel.h:429
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:204
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1356
#define ItemIdIsDead(itemId)
Definition: itemid.h:112
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:354
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
bool ignore_killed_tuples
Definition: relscan.h:100
Relation indexRelation
Definition: relscan.h:89
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:416
IndexTupleData * IndexTuple
Definition: itup.h:53
FmgrInfo sk_func
Definition: skey.h:71
#define DatumGetBool(X)
Definition: postgres.h:399
#define SK_SEARCHNOTNULL
Definition: skey.h:122
#define SK_ISNULL
Definition: skey.h:115
#define SK_ROW_HEADER
Definition: skey.h:117
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:232
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:427
uintptr_t Datum
Definition: postgres.h:372
#define SK_BT_REQFWD
Definition: nbtree.h:423
int sk_flags
Definition: skey.h:66
#define NULL
Definition: c.h:229
#define Assert(condition)
Definition: c.h:675
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
int numberOfKeys
Definition: nbtree.h:378
#define index_getattr(tup, attnum, tupleDesc, isnull)
Definition: itup.h:100
ScanKey keyData
Definition: nbtree.h:379
Oid sk_collation
Definition: skey.h:70
#define SK_BT_REQBKWD
Definition: nbtree.h:424
Datum sk_argument
Definition: skey.h:72
#define SK_SEARCHNULL
Definition: skey.h:121
#define PageGetItem(page, itemId)
Definition: bufpage.h:337
AttrNumber sk_attno
Definition: skey.h:67
void _bt_checkpage ( Relation  rel,
Buffer  buf 
)

Definition at line 503 of file nbtpage.c.

References BufferGetBlockNumber(), BufferGetPage, ereport, errcode(), errhint(), errmsg(), ERROR, MAXALIGN, PageGetSpecialSize, PageIsNew, and RelationGetRelationName.

Referenced by _bt_getbuf(), _bt_relandgetbuf(), btvacuumpage(), btvacuumscan(), and palloc_btree_page().

504 {
505  Page page = BufferGetPage(buf);
506 
507  /*
508  * ReadBuffer verifies that every newly-read page passes
509  * PageHeaderIsValid, which means it either contains a reasonably sane
510  * page header or is all-zero. We have to defend against the all-zero
511  * case, however.
512  */
513  if (PageIsNew(page))
514  ereport(ERROR,
515  (errcode(ERRCODE_INDEX_CORRUPTED),
516  errmsg("index \"%s\" contains unexpected zero page at block %u",
519  errhint("Please REINDEX it.")));
520 
521  /*
522  * Additionally check that the special area looks sane.
523  */
524  if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
525  ereport(ERROR,
526  (errcode(ERRCODE_INDEX_CORRUPTED),
527  errmsg("index \"%s\" contains corrupted page at block %u",
530  errhint("Please REINDEX it.")));
531 }
int errhint(const char *fmt,...)
Definition: elog.c:987
int errcode(int sqlerrcode)
Definition: elog.c:575
#define ERROR
Definition: elog.h:43
static char * buf
Definition: pg_test_fsync.c:65
#define RelationGetRelationName(relation)
Definition: rel.h:437
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define ereport(elevel, rest)
Definition: elog.h:122
#define MAXALIGN(LEN)
Definition: c.h:588
#define PageGetSpecialSize(page)
Definition: bufpage.h:297
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2605
#define PageIsNew(page)
Definition: bufpage.h:226
int errmsg(const char *fmt,...)
Definition: elog.c:797
Pointer Page
Definition: bufpage.h:74
int32 _bt_compare ( Relation  rel,
int  keysz,
ScanKey  scankey,
Page  page,
OffsetNumber  offnum 
)

Definition at line 428 of file nbtsearch.c.

References DatumGetInt32, FunctionCall2Coll(), i, index_getattr, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, RelationGetDescr, result, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_findinsertloc(), _bt_moveright(), invariant_geq_offset(), invariant_leq_nontarget_offset(), and invariant_leq_offset().

433 {
434  TupleDesc itupdesc = RelationGetDescr(rel);
436  IndexTuple itup;
437  int i;
438 
439  /*
440  * Force result ">" if target item is first data item on an internal page
441  * --- see NOTE above.
442  */
443  if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
444  return 1;
445 
446  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
447 
448  /*
449  * The scan key is set up with the attribute number associated with each
450  * term in the key. It is important that, if the index is multi-key, the
451  * scan contain the first k key attributes, and that they be in order. If
452  * you think about how multi-key ordering works, you'll understand why
453  * this is.
454  *
455  * We don't test for violation of this condition here, however. The
456  * initial setup for the index scan had better have gotten it right (see
457  * _bt_first).
458  */
459 
460  for (i = 1; i <= keysz; i++)
461  {
462  Datum datum;
463  bool isNull;
464  int32 result;
465 
466  datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
467 
468  /* see comments about NULLs handling in btbuild */
469  if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
470  {
471  if (isNull)
472  result = 0; /* NULL "=" NULL */
473  else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
474  result = -1; /* NULL "<" NOT_NULL */
475  else
476  result = 1; /* NULL ">" NOT_NULL */
477  }
478  else if (isNull) /* key is NOT_NULL and item is NULL */
479  {
480  if (scankey->sk_flags & SK_BT_NULLS_FIRST)
481  result = 1; /* NOT_NULL ">" NULL */
482  else
483  result = -1; /* NOT_NULL "<" NULL */
484  }
485  else
486  {
487  /*
488  * The sk_func needs to be passed the index value as left arg and
489  * the sk_argument as right arg (they might be of different
490  * types). Since it is convenient for callers to think of
491  * _bt_compare as comparing the scankey to the index item, we have
492  * to flip the sign of the comparison result. (Unless it's a DESC
493  * column, in which case we *don't* flip the sign.)
494  */
495  result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
496  scankey->sk_collation,
497  datum,
498  scankey->sk_argument));
499 
500  if (!(scankey->sk_flags & SK_BT_DESC))
501  result = -result;
502  }
503 
504  /* if the keys are unequal, return the difference */
505  if (result != 0)
506  return result;
507 
508  scankey++;
509  }
510 
511  /* if we get here, the keys are equal */
512  return 0;
513 }
#define DatumGetInt32(X)
Definition: postgres.h:478
#define RelationGetDescr(relation)
Definition: rel.h:429
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:204
return result
Definition: formatting.c:1618
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1356
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
signed int int32
Definition: c.h:256
IndexTupleData * IndexTuple
Definition: itup.h:53
FmgrInfo sk_func
Definition: skey.h:71
#define SK_ISNULL
Definition: skey.h:115
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:232
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:427
uintptr_t Datum
Definition: postgres.h:372
int sk_flags
Definition: skey.h:66
#define SK_BT_DESC
Definition: nbtree.h:426
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
#define index_getattr(tup, attnum, tupleDesc, isnull)
Definition: itup.h:100
Oid sk_collation
Definition: skey.h:70
int i
Datum sk_argument
Definition: skey.h:72
#define PageGetItem(page, itemId)
Definition: bufpage.h:337
AttrNumber sk_attno
Definition: skey.h:67
#define P_ISLEAF(opaque)
Definition: nbtree.h:176
void _bt_delitems_delete ( Relation  rel,
Buffer  buf,
OffsetNumber itemnos,
int  nitems,
Relation  heapRel 
)

Definition at line 862 of file nbtpage.c.

References Assert, BTP_HAS_GARBAGE, BTPageOpaqueData::btpo_flags, BufferGetPage, END_CRIT_SECTION, xl_btree_delete::hnode, MarkBufferDirty(), xl_btree_delete::nitems, PageGetSpecialPointer, PageIndexMultiDelete(), PageSetLSN, RelationData::rd_node, REGBUF_STANDARD, RelationNeedsWAL, SizeOfBtreeDelete, START_CRIT_SECTION, XLOG_BTREE_DELETE, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_vacuum_one_page().

865 {
866  Page page = BufferGetPage(buf);
867  BTPageOpaque opaque;
868 
869  /* Shouldn't be called unless there's something to do */
870  Assert(nitems > 0);
871 
872  /* No ereport(ERROR) until changes are logged */
874 
875  /* Fix the page */
876  PageIndexMultiDelete(page, itemnos, nitems);
877 
878  /*
879  * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID,
880  * because this is not called by VACUUM.
881  */
882 
883  /*
884  * Mark the page as not containing any LP_DEAD items. This is not
885  * certainly true (there might be some that have recently been marked, but
886  * weren't included in our target-item list), but it will almost always be
887  * true and it doesn't seem worth an additional page scan to check it.
888  * Remember that BTP_HAS_GARBAGE is only a hint anyway.
889  */
890  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
891  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
892 
894 
895  /* XLOG stuff */
896  if (RelationNeedsWAL(rel))
897  {
898  XLogRecPtr recptr;
899  xl_btree_delete xlrec_delete;
900 
901  xlrec_delete.hnode = heapRel->rd_node;
902  xlrec_delete.nitems = nitems;
903 
904  XLogBeginInsert();
906  XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete);
907 
908  /*
909  * We need the target-offsets array whether or not we store the whole
910  * buffer, to allow us to find the latestRemovedXid on a standby
911  * server.
912  */
913  XLogRegisterData((char *) itemnos, nitems * sizeof(OffsetNumber));
914 
915  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
916 
917  PageSetLSN(page, recptr);
918  }
919 
921 }
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1450
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:213
RelFileNode hnode
Definition: nbtxlog.h:121
#define END_CRIT_SECTION()
Definition: miscadmin.h:132
#define START_CRIT_SECTION()
Definition: miscadmin.h:130
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
uint16 OffsetNumber
Definition: off.h:24
static char * buf
Definition: pg_test_fsync.c:65
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define XLOG_BTREE_DELETE
Definition: nbtxlog.h:33
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:323
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:415
RelFileNode rd_node
Definition: rel.h:85
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:675
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:836
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
#define RelationNeedsWAL(relation)
Definition: rel.h:506
#define SizeOfBtreeDelete
Definition: nbtxlog.h:128
void XLogBeginInsert(void)
Definition: xloginsert.c:120
uint16 btpo_flags
Definition: nbtree.h:63
#define PageSetLSN(page, lsn)
Definition: bufpage.h:365
#define BTP_HAS_GARBAGE
Definition: nbtree.h:76
Pointer Page
Definition: bufpage.h:74
void _bt_delitems_vacuum ( Relation  rel,
Buffer  buf,
OffsetNumber itemnos,
int  nitems,
BlockNumber  lastBlockVacuumed 
)

Definition at line 789 of file nbtpage.c.

References BTP_HAS_GARBAGE, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BufferGetPage, END_CRIT_SECTION, xl_btree_vacuum::lastBlockVacuumed, MarkBufferDirty(), PageGetSpecialPointer, PageIndexMultiDelete(), PageSetLSN, REGBUF_STANDARD, RelationNeedsWAL, SizeOfBtreeVacuum, START_CRIT_SECTION, XLOG_BTREE_VACUUM, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by btvacuumpage(), and btvacuumscan().

792 {
793  Page page = BufferGetPage(buf);
794  BTPageOpaque opaque;
795 
796  /* No ereport(ERROR) until changes are logged */
798 
799  /* Fix the page */
800  if (nitems > 0)
801  PageIndexMultiDelete(page, itemnos, nitems);
802 
803  /*
804  * We can clear the vacuum cycle ID since this page has certainly been
805  * processed by the current vacuum scan.
806  */
807  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
808  opaque->btpo_cycleid = 0;
809 
810  /*
811  * Mark the page as not containing any LP_DEAD items. This is not
812  * certainly true (there might be some that have recently been marked, but
813  * weren't included in our target-item list), but it will almost always be
814  * true and it doesn't seem worth an additional page scan to check it.
815  * Remember that BTP_HAS_GARBAGE is only a hint anyway.
816  */
817  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
818 
820 
821  /* XLOG stuff */
822  if (RelationNeedsWAL(rel))
823  {
824  XLogRecPtr recptr;
825  xl_btree_vacuum xlrec_vacuum;
826 
827  xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed;
828 
829  XLogBeginInsert();
831  XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum);
832 
833  /*
834  * The target-offsets array is not in the buffer, but pretend that it
835  * is. When XLogInsert stores the whole buffer, the offsets array
836  * need not be stored too.
837  */
838  if (nitems > 0)
839  XLogRegisterBufData(0, (char *) itemnos, nitems * sizeof(OffsetNumber));
840 
841  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
842 
843  PageSetLSN(page, recptr);
844  }
845 
847 }
BlockNumber lastBlockVacuumed
Definition: nbtxlog.h:167
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:361
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1450
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:213
#define END_CRIT_SECTION()
Definition: miscadmin.h:132
#define START_CRIT_SECTION()
Definition: miscadmin.h:130
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
uint16 OffsetNumber
Definition: off.h:24
#define SizeOfBtreeVacuum
Definition: nbtxlog.h:172
BTCycleId btpo_cycleid
Definition: nbtree.h:64
static char * buf
Definition: pg_test_fsync.c:65
#define REGBUF_STANDARD
Definition: xloginsert.h:35
#define XLOG_BTREE_VACUUM
Definition: nbtxlog.h:38
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:323
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:415
uint64 XLogRecPtr
Definition: xlogdefs.h:21
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:836
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
#define RelationNeedsWAL(relation)
Definition: rel.h:506
void XLogBeginInsert(void)
Definition: xloginsert.c:120
uint16 btpo_flags
Definition: nbtree.h:63
#define PageSetLSN(page, lsn)
Definition: bufpage.h:365
#define BTP_HAS_GARBAGE
Definition: nbtree.h:76
Pointer Page
Definition: bufpage.h:74
bool _bt_doinsert ( Relation  rel,
IndexTuple  itup,
IndexUniqueCheck  checkUnique,
Relation  heapRel 
)

Definition at line 108 of file nbtinsert.c.

References _bt_binsrch(), _bt_check_unique(), _bt_findinsertloc(), _bt_freeskey(), _bt_freestack(), _bt_insertonpg(), _bt_mkscankey(), _bt_moveright(), _bt_relbuf(), _bt_search(), BT_WRITE, buf, BUFFER_LOCK_UNLOCK, CheckForSerializableConflictIn(), InvalidBuffer, InvalidOffsetNumber, LockBuffer(), NULL, RelationData::rd_rel, SpeculativeInsertionWait(), IndexTupleData::t_tid, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_NO, XactLockTableWait(), and XLTW_InsertIndex.

Referenced by btinsert().

110 {
111  bool is_unique = false;
112  int natts = rel->rd_rel->relnatts;
113  ScanKey itup_scankey;
114  BTStack stack;
115  Buffer buf;
116  OffsetNumber offset;
117 
118  /* we need an insertion scan key to do our search, so build one */
119  itup_scankey = _bt_mkscankey(rel, itup);
120 
121 top:
122  /* find the first page containing this key */
123  stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE, NULL);
124 
125  offset = InvalidOffsetNumber;
126 
127  /* trade in our read lock for a write lock */
129  LockBuffer(buf, BT_WRITE);
130 
131  /*
132  * If the page was split between the time that we surrendered our read
133  * lock and acquired our write lock, then this page may no longer be the
134  * right place for the key we want to insert. In this case, we need to
135  * move right in the tree. See Lehman and Yao for an excruciatingly
136  * precise description.
137  */
138  buf = _bt_moveright(rel, buf, natts, itup_scankey, false,
139  true, stack, BT_WRITE, NULL);
140 
141  /*
142  * If we're not allowing duplicates, make sure the key isn't already in
143  * the index.
144  *
145  * NOTE: obviously, _bt_check_unique can only detect keys that are already
146  * in the index; so it cannot defend against concurrent insertions of the
147  * same key. We protect against that by means of holding a write lock on
148  * the target page. Any other would-be inserter of the same key must
149  * acquire a write lock on the same target page, so only one would-be
150  * inserter can be making the check at one time. Furthermore, once we are
151  * past the check we hold write locks continuously until we have performed
152  * our insertion, so no later inserter can fail to see our insertion.
153  * (This requires some care in _bt_insertonpg.)
154  *
155  * If we must wait for another xact, we release the lock while waiting,
156  * and then must start over completely.
157  *
158  * For a partial uniqueness check, we don't wait for the other xact. Just
159  * let the tuple in and return false for possibly non-unique, or true for
160  * definitely unique.
161  */
162  if (checkUnique != UNIQUE_CHECK_NO)
163  {
164  TransactionId xwait;
165  uint32 speculativeToken;
166 
167  offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
168  xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey,
169  checkUnique, &is_unique, &speculativeToken);
170 
171  if (TransactionIdIsValid(xwait))
172  {
173  /* Have to wait for the other guy ... */
174  _bt_relbuf(rel, buf);
175 
176  /*
177  * If it's a speculative insertion, wait for it to finish (ie. to
178  * go ahead with the insertion, or kill the tuple). Otherwise
179  * wait for the transaction to finish as usual.
180  */
181  if (speculativeToken)
182  SpeculativeInsertionWait(xwait, speculativeToken);
183  else
184  XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
185 
186  /* start over... */
187  _bt_freestack(stack);
188  goto top;
189  }
190  }
191 
192  if (checkUnique != UNIQUE_CHECK_EXISTING)
193  {
194  /*
195  * The only conflict predicate locking cares about for indexes is when
196  * an index tuple insert conflicts with an existing lock. Since the
197  * actual location of the insert is hard to predict because of the
198  * random search used to prevent O(N^2) performance when there are
199  * many duplicate entries, we can just use the "first valid" page.
200  */
202  /* do the insertion */
203  _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup,
204  stack, heapRel);
205  _bt_insertonpg(rel, buf, InvalidBuffer, stack, itup, offset, false);
206  }
207  else
208  {
209  /* just release the buffer */
210  _bt_relbuf(rel, buf);
211  }
212 
213  /* be tidy */
214  _bt_freestack(stack);
215  _bt_freeskey(itup_scankey);
216 
217  return is_unique;
218 }
static TransactionId _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, Buffer buf, OffsetNumber offset, ScanKey itup_scankey, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken)
Definition: nbtinsert.c:240
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:87
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:164
uint32 TransactionId
Definition: c.h:397
void _bt_freeskey(ScanKey skey)
Definition: nbtutils.c:155
ItemPointerData t_tid
Definition: itup.h:37
#define InvalidBuffer
Definition: buf.h:25
Form_pg_class rd_rel
Definition: rel.h:114
void CheckForSerializableConflictIn(Relation relation, HeapTuple tuple, Buffer buffer)
Definition: predicate.c:4243
uint16 OffsetNumber
Definition: off.h:24
Buffer _bt_moveright(Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey, bool forupdate, BTStack stack, int access, Snapshot snapshot)
Definition: nbtsearch.c:214
void SpeculativeInsertionWait(TransactionId xid, uint32 token)
Definition: lmgr.c:685
static char * buf
Definition: pg_test_fsync.c:65
OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey)
Definition: nbtsearch.c:323
unsigned int uint32
Definition: c.h:268
static void _bt_findinsertloc(Relation rel, Buffer *bufptr, OffsetNumber *offsetptr, int keysz, ScanKey scankey, IndexTuple newtup, BTStack stack, Relation heapRel)
Definition: nbtinsert.c:541
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3546
BTStack _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:97
#define InvalidOffsetNumber
Definition: off.h:26
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:720
void XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, XLTW_Oper oper)
Definition: lmgr.c:554
#define NULL
Definition: c.h:229
static void _bt_insertonpg(Relation rel, Buffer buf, Buffer cbuf, BTStack stack, IndexTuple itup, OffsetNumber newitemoff, bool split_only_page)
Definition: nbtinsert.c:734
#define TransactionIdIsValid(xid)
Definition: transam.h:41
ScanKey _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:62
#define BT_WRITE
Definition: nbtree.h:238
int Buffer
Definition: buf.h:23
void _bt_end_vacuum ( Relation  rel)

Definition at line 1964 of file nbtutils.c.

References LockRelId::dbId, i, LockInfoData::lockRelId, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTVacInfo::num_vacuums, RelationData::rd_lockInfo, LockRelId::relId, BTOneVacInfo::relid, and BTVacInfo::vacuums.

Referenced by _bt_end_vacuum_callback(), and btbulkdelete().

1965 {
1966  int i;
1967 
1968  LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
1969 
1970  /* Find the array entry */
1971  for (i = 0; i < btvacinfo->num_vacuums; i++)
1972  {
1973  BTOneVacInfo *vac = &btvacinfo->vacuums[i];
1974 
1975  if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
1976  vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
1977  {
1978  /* Remove it by shifting down the last entry */
1979  *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
1981  break;
1982  }
1983  }
1984 
1985  LWLockRelease(BtreeVacuumLock);
1986 }
LockRelId lockRelId
Definition: rel.h:44
static BTVacInfo * btvacinfo
Definition: nbtutils.c:1860
LockRelId relid
Definition: nbtutils.c:1848
Oid dbId
Definition: rel.h:39
BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtutils.c:1857
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1715
LockInfoData rd_lockInfo
Definition: rel.h:117
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1111
int num_vacuums
Definition: nbtutils.c:1855
int i
Oid relId
Definition: rel.h:38
void _bt_end_vacuum_callback ( int  code,
Datum  arg 
)

Definition at line 1992 of file nbtutils.c.

References _bt_end_vacuum(), and DatumGetPointer.

Referenced by btbulkdelete().

1993 {
1995 }
void _bt_end_vacuum(Relation rel)
Definition: nbtutils.c:1964
#define DatumGetPointer(X)
Definition: postgres.h:555
void * arg
void _bt_finish_split ( Relation  rel,
Buffer  bbuf,
BTStack  stack 
)

Definition at line 1745 of file nbtinsert.c.

References _bt_getbuf(), _bt_insert_parent(), _bt_relbuf(), Assert, BT_WRITE, BTMetaPageData::btm_root, BTPageGetMeta, BTPageOpaqueData::btpo_next, BTREE_METAPAGE, BufferGetBlockNumber(), BufferGetPage, DEBUG1, elog, P_INCOMPLETE_SPLIT, P_LEFTMOST, P_RIGHTMOST, and PageGetSpecialPointer.

Referenced by _bt_findinsertloc(), _bt_getstackbuf(), and _bt_moveright().

1746 {
1747  Page lpage = BufferGetPage(lbuf);
1748  BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
1749  Buffer rbuf;
1750  Page rpage;
1751  BTPageOpaque rpageop;
1752  bool was_root;
1753  bool was_only;
1754 
1755  Assert(P_INCOMPLETE_SPLIT(lpageop));
1756 
1757  /* Lock right sibling, the one missing the downlink */
1758  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
1759  rpage = BufferGetPage(rbuf);
1760  rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
1761 
1762  /* Could this be a root split? */
1763  if (!stack)
1764  {
1765  Buffer metabuf;
1766  Page metapg;
1767  BTMetaPageData *metad;
1768 
1769  /* acquire lock on the metapage */
1770  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1771  metapg = BufferGetPage(metabuf);
1772  metad = BTPageGetMeta(metapg);
1773 
1774  was_root = (metad->btm_root == BufferGetBlockNumber(lbuf));
1775 
1776  _bt_relbuf(rel, metabuf);
1777  }
1778  else
1779  was_root = false;
1780 
1781  /* Was this the only page on the level before split? */
1782  was_only = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
1783 
1784  elog(DEBUG1, "finishing incomplete split of %u/%u",
1786 
1787  _bt_insert_parent(rel, lbuf, rbuf, stack, was_root, was_only);
1788 }
BlockNumber btpo_next
Definition: nbtree.h:57
#define DEBUG1
Definition: elog.h:25
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:570
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only)
Definition: nbtinsert.c:1634
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:182
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
#define BTPageGetMeta(p)
Definition: nbtree.h:106
#define P_LEFTMOST(opaque)
Definition: nbtree.h:174
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define BTREE_METAPAGE
Definition: nbtree.h:109
BlockNumber btm_root
Definition: nbtree.h:100
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:720
#define Assert(condition)
Definition: c.h:675
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2605
#define elog
Definition: elog.h:219
#define BT_WRITE
Definition: nbtree.h:238
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:175
Pointer Page
Definition: bufpage.h:74
bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 536 of file nbtsearch.c.

References _bt_binsrch(), _bt_drop_lock_and_maybe_pin(), _bt_endpoint(), _bt_freestack(), _bt_initialize_more_data(), _bt_parallel_done(), _bt_parallel_readpage(), _bt_parallel_seize(), _bt_preprocess_keys(), _bt_readpage(), _bt_search(), _bt_steppage(), Assert, BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosInvalidate, BTScanPosIsValid, buf, BTScanPosData::buf, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferIsValid, cur, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, DatumGetPointer, elog, ERROR, get_opfamily_proc(), i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, InvalidBlockNumber, InvalidOid, InvalidStrategy, BTScanPosData::itemIndex, BTScanPosData::items, BTScanOpaqueData::keyData, LockBuffer(), NULL, BTScanOpaqueData::numberOfKeys, OffsetNumberPrev, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, pgstat_count_index_scan, PredicateLockPage(), PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, ScanKeyData::sk_subtype, status(), HeapTupleData::t_self, IndexScanDescData::xs_ctup, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

537 {
538  Relation rel = scan->indexRelation;
539  BTScanOpaque so = (BTScanOpaque) scan->opaque;
540  Buffer buf;
541  BTStack stack;
542  OffsetNumber offnum;
543  StrategyNumber strat;
544  bool nextkey;
545  bool goback;
546  ScanKey startKeys[INDEX_MAX_KEYS];
547  ScanKeyData scankeys[INDEX_MAX_KEYS];
548  ScanKeyData notnullkeys[INDEX_MAX_KEYS];
549  int keysCount = 0;
550  int i;
551  bool status = true;
552  StrategyNumber strat_total;
553  BTScanPosItem *currItem;
554  BlockNumber blkno;
555 
557 
559 
560  /*
561  * Examine the scan keys and eliminate any redundant keys; also mark the
562  * keys that must be matched to continue the scan.
563  */
564  _bt_preprocess_keys(scan);
565 
566  /*
567  * Quit now if _bt_preprocess_keys() discovered that the scan keys can
568  * never be satisfied (eg, x == 1 AND x > 2).
569  */
570  if (!so->qual_ok)
571  return false;
572 
573  /*
574  * For parallel scans, get the starting page from shared state. If the
575  * scan has not started, proceed to find out first leaf page in the usual
576  * way while keeping other participating processes waiting. If the scan
577  * has already begun, use the page number from the shared structure.
578  */
579  if (scan->parallel_scan != NULL)
580  {
581  status = _bt_parallel_seize(scan, &blkno);
582  if (!status)
583  return false;
584  else if (blkno == P_NONE)
585  {
586  _bt_parallel_done(scan);
587  return false;
588  }
589  else if (blkno != InvalidBlockNumber)
590  {
591  if (!_bt_parallel_readpage(scan, blkno, dir))
592  return false;
593  goto readcomplete;
594  }
595  }
596 
597  /*----------
598  * Examine the scan keys to discover where we need to start the scan.
599  *
600  * We want to identify the keys that can be used as starting boundaries;
601  * these are =, >, or >= keys for a forward scan or =, <, <= keys for
602  * a backwards scan. We can use keys for multiple attributes so long as
603  * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
604  * a > or < boundary or find an attribute with no boundary (which can be
605  * thought of as the same as "> -infinity"), we can't use keys for any
606  * attributes to its right, because it would break our simplistic notion
607  * of what initial positioning strategy to use.
608  *
609  * When the scan keys include cross-type operators, _bt_preprocess_keys
610  * may not be able to eliminate redundant keys; in such cases we will
611  * arbitrarily pick a usable one for each attribute. This is correct
612  * but possibly not optimal behavior. (For example, with keys like
613  * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
614  * x=5 would be more efficient.) Since the situation only arises given
615  * a poorly-worded query plus an incomplete opfamily, live with it.
616  *
617  * When both equality and inequality keys appear for a single attribute
618  * (again, only possible when cross-type operators appear), we *must*
619  * select one of the equality keys for the starting point, because
620  * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
621  * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
622  * start at x=4, we will fail and stop before reaching x=10. If multiple
623  * equality quals survive preprocessing, however, it doesn't matter which
624  * one we use --- by definition, they are either redundant or
625  * contradictory.
626  *
627  * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
628  * If the index stores nulls at the end of the index we'll be starting
629  * from, and we have no boundary key for the column (which means the key
630  * we deduced NOT NULL from is an inequality key that constrains the other
631  * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
632  * use as a boundary key. If we didn't do this, we might find ourselves
633  * traversing a lot of null entries at the start of the scan.
634  *
635  * In this loop, row-comparison keys are treated the same as keys on their
636  * first (leftmost) columns. We'll add on lower-order columns of the row
637  * comparison below, if possible.
638  *
639  * The selected scan keys (at most one per index column) are remembered by
640  * storing their addresses into the local startKeys[] array.
641  *----------
642  */
643  strat_total = BTEqualStrategyNumber;
644  if (so->numberOfKeys > 0)
645  {
646  AttrNumber curattr;
647  ScanKey chosen;
648  ScanKey impliesNN;
649  ScanKey cur;
650 
651  /*
652  * chosen is the so-far-chosen key for the current attribute, if any.
653  * We don't cast the decision in stone until we reach keys for the
654  * next attribute.
655  */
656  curattr = 1;
657  chosen = NULL;
658  /* Also remember any scankey that implies a NOT NULL constraint */
659  impliesNN = NULL;
660 
661  /*
662  * Loop iterates from 0 to numberOfKeys inclusive; we use the last
663  * pass to handle after-last-key processing. Actual exit from the
664  * loop is at one of the "break" statements below.
665  */
666  for (cur = so->keyData, i = 0;; cur++, i++)
667  {
668  if (i >= so->numberOfKeys || cur->sk_attno != curattr)
669  {
670  /*
671  * Done looking at keys for curattr. If we didn't find a
672  * usable boundary key, see if we can deduce a NOT NULL key.
673  */
674  if (chosen == NULL && impliesNN != NULL &&
675  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
678  {
679  /* Yes, so build the key in notnullkeys[keysCount] */
680  chosen = &notnullkeys[keysCount];
681  ScanKeyEntryInitialize(chosen,
683  (impliesNN->sk_flags &
685  curattr,
686  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
689  InvalidOid,
690  InvalidOid,
691  InvalidOid,
692  (Datum) 0);
693  }
694 
695  /*
696  * If we still didn't find a usable boundary key, quit; else
697  * save the boundary key pointer in startKeys.
698  */
699  if (chosen == NULL)
700  break;
701  startKeys[keysCount++] = chosen;
702 
703  /*
704  * Adjust strat_total, and quit if we have stored a > or <
705  * key.
706  */
707  strat = chosen->sk_strategy;
708  if (strat != BTEqualStrategyNumber)
709  {
710  strat_total = strat;
711  if (strat == BTGreaterStrategyNumber ||
712  strat == BTLessStrategyNumber)
713  break;
714  }
715 
716  /*
717  * Done if that was the last attribute, or if next key is not
718  * in sequence (implying no boundary key is available for the
719  * next attribute).
720  */
721  if (i >= so->numberOfKeys ||
722  cur->sk_attno != curattr + 1)
723  break;
724 
725  /*
726  * Reset for next attr.
727  */
728  curattr = cur->sk_attno;
729  chosen = NULL;
730  impliesNN = NULL;
731  }
732 
733  /*
734  * Can we use this key as a starting boundary for this attr?
735  *
736  * If not, does it imply a NOT NULL constraint? (Because
737  * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
738  * *any* inequality key works for that; we need not test.)
739  */
740  switch (cur->sk_strategy)
741  {
744  if (chosen == NULL)
745  {
746  if (ScanDirectionIsBackward(dir))
747  chosen = cur;
748  else
749  impliesNN = cur;
750  }
751  break;
753  /* override any non-equality choice */
754  chosen = cur;
755  break;
758  if (chosen == NULL)
759  {
760  if (ScanDirectionIsForward(dir))
761  chosen = cur;
762  else
763  impliesNN = cur;
764  }
765  break;
766  }
767  }
768  }
769 
770  /*
771  * If we found no usable boundary keys, we have to start from one end of
772  * the tree. Walk down that edge to the first or last key, and scan from
773  * there.
774  */
775  if (keysCount == 0)
776  {
777  bool match;
778 
779  match = _bt_endpoint(scan, dir);
780 
781  if (!match)
782  {
783  /* No match, so mark (parallel) scan finished */
784  _bt_parallel_done(scan);
785  }
786 
787  return match;
788  }
789 
790  /*
791  * We want to start the scan somewhere within the index. Set up an
792  * insertion scankey we can use to search for the boundary point we
793  * identified above. The insertion scankey is built in the local
794  * scankeys[] array, using the keys identified by startKeys[].
795  */
796  Assert(keysCount <= INDEX_MAX_KEYS);
797  for (i = 0; i < keysCount; i++)
798  {
799  ScanKey cur = startKeys[i];
800 
801  Assert(cur->sk_attno == i + 1);
802 
803  if (cur->sk_flags & SK_ROW_HEADER)
804  {
805  /*
806  * Row comparison header: look to the first row member instead.
807  *
808  * The member scankeys are already in insertion format (ie, they
809  * have sk_func = 3-way-comparison function), but we have to watch
810  * out for nulls, which _bt_preprocess_keys didn't check. A null
811  * in the first row member makes the condition unmatchable, just
812  * like qual_ok = false.
813  */
814  ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
815 
816  Assert(subkey->sk_flags & SK_ROW_MEMBER);
817  if (subkey->sk_flags & SK_ISNULL)
818  {
819  _bt_parallel_done(scan);
820  return false;
821  }
822  memcpy(scankeys + i, subkey, sizeof(ScanKeyData));
823 
824  /*
825  * If the row comparison is the last positioning key we accepted,
826  * try to add additional keys from the lower-order row members.
827  * (If we accepted independent conditions on additional index
828  * columns, we use those instead --- doesn't seem worth trying to
829  * determine which is more restrictive.) Note that this is OK
830  * even if the row comparison is of ">" or "<" type, because the
831  * condition applied to all but the last row member is effectively
832  * ">=" or "<=", and so the extra keys don't break the positioning
833  * scheme. But, by the same token, if we aren't able to use all
834  * the row members, then the part of the row comparison that we
835  * did use has to be treated as just a ">=" or "<=" condition, and
836  * so we'd better adjust strat_total accordingly.
837  */
838  if (i == keysCount - 1)
839  {
840  bool used_all_subkeys = false;
841 
842  Assert(!(subkey->sk_flags & SK_ROW_END));
843  for (;;)
844  {
845  subkey++;
846  Assert(subkey->sk_flags & SK_ROW_MEMBER);
847  if (subkey->sk_attno != keysCount + 1)
848  break; /* out-of-sequence, can't use it */
849  if (subkey->sk_strategy != cur->sk_strategy)
850  break; /* wrong direction, can't use it */
851  if (subkey->sk_flags & SK_ISNULL)
852  break; /* can't use null keys */
853  Assert(keysCount < INDEX_MAX_KEYS);
854  memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData));
855  keysCount++;
856  if (subkey->sk_flags & SK_ROW_END)
857  {
858  used_all_subkeys = true;
859  break;
860  }
861  }
862  if (!used_all_subkeys)
863  {
864  switch (strat_total)
865  {
867  strat_total = BTLessEqualStrategyNumber;
868  break;
870  strat_total = BTGreaterEqualStrategyNumber;
871  break;
872  }
873  }
874  break; /* done with outer loop */
875  }
876  }
877  else
878  {
879  /*
880  * Ordinary comparison key. Transform the search-style scan key
881  * to an insertion scan key by replacing the sk_func with the
882  * appropriate btree comparison function.
883  *
884  * If scankey operator is not a cross-type comparison, we can use
885  * the cached comparison function; otherwise gotta look it up in
886  * the catalogs. (That can't lead to infinite recursion, since no
887  * indexscan initiated by syscache lookup will use cross-data-type
888  * operators.)
889  *
890  * We support the convention that sk_subtype == InvalidOid means
891  * the opclass input type; this is a hack to simplify life for
892  * ScanKeyInit().
893  */
894  if (cur->sk_subtype == rel->rd_opcintype[i] ||
895  cur->sk_subtype == InvalidOid)
896  {
897  FmgrInfo *procinfo;
898 
899  procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
901  cur->sk_flags,
902  cur->sk_attno,
904  cur->sk_subtype,
905  cur->sk_collation,
906  procinfo,
907  cur->sk_argument);
908  }
909  else
910  {
911  RegProcedure cmp_proc;
912 
913  cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
914  rel->rd_opcintype[i],
915  cur->sk_subtype,
916  BTORDER_PROC);
917  if (!RegProcedureIsValid(cmp_proc))
918  elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
919  BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
920  cur->sk_attno, RelationGetRelationName(rel));
921  ScanKeyEntryInitialize(scankeys + i,
922  cur->sk_flags,
923  cur->sk_attno,
925  cur->sk_subtype,
926  cur->sk_collation,
927  cmp_proc,
928  cur->sk_argument);
929  }
930  }
931  }
932 
933  /*----------
934  * Examine the selected initial-positioning strategy to determine exactly
935  * where we need to start the scan, and set flag variables to control the
936  * code below.
937  *
938  * If nextkey = false, _bt_search and _bt_binsrch will locate the first
939  * item >= scan key. If nextkey = true, they will locate the first
940  * item > scan key.
941  *
942  * If goback = true, we will then step back one item, while if
943  * goback = false, we will start the scan on the located item.
944  *----------
945  */
946  switch (strat_total)
947  {
949 
950  /*
951  * Find first item >= scankey, then back up one to arrive at last
952  * item < scankey. (Note: this positioning strategy is only used
953  * for a backward scan, so that is always the correct starting
954  * position.)
955  */
956  nextkey = false;
957  goback = true;
958  break;
959 
961 
962  /*
963  * Find first item > scankey, then back up one to arrive at last
964  * item <= scankey. (Note: this positioning strategy is only used
965  * for a backward scan, so that is always the correct starting
966  * position.)
967  */
968  nextkey = true;
969  goback = true;
970  break;
971 
973 
974  /*
975  * If a backward scan was specified, need to start with last equal
976  * item not first one.
977  */
978  if (ScanDirectionIsBackward(dir))
979  {
980  /*
981  * This is the same as the <= strategy. We will check at the
982  * end whether the found item is actually =.
983  */
984  nextkey = true;
985  goback = true;
986  }
987  else
988  {
989  /*
990  * This is the same as the >= strategy. We will check at the
991  * end whether the found item is actually =.
992  */
993  nextkey = false;
994  goback = false;
995  }
996  break;
997 
999 
1000  /*
1001  * Find first item >= scankey. (This is only used for forward
1002  * scans.)
1003  */
1004  nextkey = false;
1005  goback = false;
1006  break;
1007 
1009 
1010  /*
1011  * Find first item > scankey. (This is only used for forward
1012  * scans.)
1013  */
1014  nextkey = true;
1015  goback = false;
1016  break;
1017 
1018  default:
1019  /* can't get here, but keep compiler quiet */
1020  elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1021  return false;
1022  }
1023 
1024  /*
1025  * Use the manufactured insertion scan key to descend the tree and
1026  * position ourselves on the target leaf page.
1027  */
1028  stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ,
1029  scan->xs_snapshot);
1030 
1031  /* don't need to keep the stack around... */
1032  _bt_freestack(stack);
1033 
1034  if (!BufferIsValid(buf))
1035  {
1036  /*
1037  * We only get here if the index is completely empty. Lock relation
1038  * because nothing finer to lock exists.
1039  */
1040  PredicateLockRelation(rel, scan->xs_snapshot);
1041 
1042  /*
1043  * mark parallel scan as done, so that all the workers can finish
1044  * their scan
1045  */
1046  _bt_parallel_done(scan);
1048 
1049  return false;
1050  }
1051  else
1053  scan->xs_snapshot);
1054 
1055  _bt_initialize_more_data(so, dir);
1056 
1057  /* position to the precise item on the page */
1058  offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
1059 
1060  /*
1061  * If nextkey = false, we are positioned at the first item >= scan key, or
1062  * possibly at the end of a page on which all the existing items are less
1063  * than the scan key and we know that everything on later pages is greater
1064  * than or equal to scan key.
1065  *
1066  * If nextkey = true, we are positioned at the first item > scan key, or
1067  * possibly at the end of a page on which all the existing items are less
1068  * than or equal to the scan key and we know that everything on later
1069  * pages is greater than scan key.
1070  *
1071  * The actually desired starting point is either this item or the prior
1072  * one, or in the end-of-page case it's the first item on the next page or
1073  * the last item on this page. Adjust the starting offset if needed. (If
1074  * this results in an offset before the first item or after the last one,
1075  * _bt_readpage will report no items found, and then we'll step to the
1076  * next page as needed.)
1077  */
1078  if (goback)
1079  offnum = OffsetNumberPrev(offnum);
1080 
1081  /* remember which buffer we have pinned, if any */
1083  so->currPos.buf = buf;
1084 
1085  /*
1086  * Now load data from the first page of the scan.
1087  */
1088  if (!_bt_readpage(scan, dir, offnum))
1089  {
1090  /*
1091  * There's no actually-matching data on this page. Try to advance to
1092  * the next page. Return false if there's no matching data at all.
1093  */
1095  if (!_bt_steppage(scan, dir))
1096  return false;
1097  }
1098  else
1099  {
1100  /* Drop the lock, and maybe the pin, on the current page */
1102  }
1103 
1104 readcomplete:
1105  /* OK, itemIndex says what to return */
1106  currItem = &so->currPos.items[so->currPos.itemIndex];
1107  scan->xs_ctup.t_self = currItem->heapTid;
1108  if (scan->xs_want_itup)
1109  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1110 
1111  return true;
1112 }
ParallelIndexScanDesc parallel_scan
Definition: relscan.h:139
#define InvalidStrategy
Definition: stratnum.h:24
Oid sk_subtype
Definition: skey.h:69
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:87
Definition: fmgr.h:56
#define SK_ROW_MEMBER
Definition: skey.h:118
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:164
void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
Definition: predicate.c:2438
#define BTORDER_PROC
Definition: nbtree.h:228
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:855
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
IndexTuple xs_itup
Definition: relscan.h:113
BTScanPosItem items[MaxIndexTuplesPerPage]
Definition: nbtree.h:327
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:848
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
Definition: nbtsearch.c:1615
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
Definition: nbtsearch.c:57
int itemIndex
Definition: nbtree.h:325
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition: predicate.c:2415
char * currTuples
Definition: nbtree.h:399
regproc RegProcedure
Definition: c.h:395
#define P_NONE
Definition: nbtree.h:168
Snapshot xs_snapshot
Definition: relscan.h:90
struct cursor * cur
Definition: ecpg.c:28
uint16 StrategyNumber
Definition: stratnum.h:22
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:349
uint32 BlockNumber
Definition: block.h:31
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
Relation indexRelation
Definition: relscan.h:89
uint16 OffsetNumber
Definition: off.h:24
#define SK_ROW_END
Definition: skey.h:119
#define ScanDirectionIsBackward(direction)
Definition: sdir.h:41
#define BT_READ
Definition: nbtree.h:237
#define ERROR
Definition: elog.h:43
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:416
StrategyNumber sk_strategy
Definition: skey.h:68
ItemPointerData t_self
Definition: htup.h:65
static char * buf
Definition: pg_test_fsync.c:65
void ScanKeyEntryInitialize(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, RegProcedure procedure, Datum argument)
Definition: scankey.c:32
ScanKeyData * ScanKey
Definition: skey.h:75
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1846
#define RegProcedureIsValid(p)
Definition: c.h:540
IndexTupleData * IndexTuple
Definition: itup.h:53
OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey)
Definition: nbtsearch.c:323
#define RelationGetRelationName(relation)
Definition: rel.h:437
#define pgstat_count_index_scan(rel)
Definition: pgstat.h:1262
#define SK_SEARCHNOTNULL
Definition: skey.h:122
bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
Definition: nbtree.c:767
Oid * rd_opfamily
Definition: rel.h:182
#define SK_ISNULL
Definition: skey.h:115
#define SK_ROW_HEADER
Definition: skey.h:117
void _bt_preprocess_keys(IndexScanDesc scan)
Definition: nbtutils.c:745
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:427
static void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
Definition: nbtsearch.c:1936
uintptr_t Datum
Definition: postgres.h:372
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3546
BTStack _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:97
#define InvalidOid
Definition: postgres_ext.h:36
bool xs_want_itup
Definition: relscan.h:95
int sk_flags
Definition: skey.h:66
#define NULL
Definition: c.h:229
#define Assert(condition)
Definition: c.h:675
#define SK_BT_DESC
Definition: nbtree.h:426
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
Definition: lsyscache.c:744
HeapTupleData xs_ctup
Definition: relscan.h:119
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:355
#define INDEX_MAX_KEYS
#define InvalidBlockNumber
Definition: block.h:33
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:55
int numberOfKeys
Definition: nbtree.h:378
#define BufferIsValid(bufnum)
Definition: bufmgr.h:114
#define DatumGetPointer(X)
Definition: postgres.h:555
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2605
BTScanPosData currPos
Definition: nbtree.h:412
ScanKey keyData
Definition: nbtree.h:379
Oid sk_collation
Definition: skey.h:70
int i
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1342
#define elog
Definition: elog.h:219
Buffer buf
Definition: nbtree.h:295
Oid * rd_opcintype
Definition: rel.h:183
static void static void status(const char *fmt,...) pg_attribute_printf(1
Definition: pg_regress.c:224
#define BTLessStrategyNumber
Definition: stratnum.h:29
int Buffer
Definition: buf.h:23
Datum sk_argument
Definition: skey.h:72
static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Definition: nbtsearch.c:1184
int16 AttrNumber
Definition: attnum.h:21
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32
AttrNumber sk_attno
Definition: skey.h:67
void _bt_freeskey ( ScanKey  skey)

Definition at line 155 of file nbtutils.c.

References pfree().

Referenced by _bt_doinsert(), _bt_load(), tuplesort_begin_cluster(), and tuplesort_begin_index_btree().

156 {
157  pfree(skey);
158 }
void pfree(void *pointer)
Definition: mcxt.c:950
void _bt_freestack ( BTStack  stack)

Definition at line 164 of file nbtutils.c.

References BTStackData::bts_parent, NULL, and pfree().

Referenced by _bt_doinsert(), and _bt_first().

165 {
166  BTStack ostack;
167 
168  while (stack != NULL)
169  {
170  ostack = stack;
171  stack = stack->bts_parent;
172  pfree(ostack);
173  }
174 }
void pfree(void *pointer)
Definition: mcxt.c:950
#define NULL
Definition: c.h:229
struct BTStackData * bts_parent
Definition: nbtree.h:255
Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost,
Snapshot  snapshot 
)

Definition at line 1764 of file nbtsearch.c.

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_next, buf, BufferGetPage, BufferIsValid, elog, ERROR, InvalidBuffer, ItemPointerGetBlockNumber, BTPageOpaqueData::level, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, RelationGetRelationName, IndexTupleData::t_tid, and TestForOldSnapshot().

Referenced by _bt_endpoint(), and _bt_insert_parent().

1766 {
1767  Buffer buf;
1768  Page page;
1769  BTPageOpaque opaque;
1770  OffsetNumber offnum;
1771  BlockNumber blkno;
1772  IndexTuple itup;
1773 
1774  /*
1775  * If we are looking for a leaf page, okay to descend from fast root;
1776  * otherwise better descend from true root. (There is no point in being
1777  * smarter about intermediate levels.)
1778  */
1779  if (level == 0)
1780  buf = _bt_getroot(rel, BT_READ);
1781  else
1782  buf = _bt_gettrueroot(rel);
1783 
1784  if (!BufferIsValid(buf))
1785  return InvalidBuffer;
1786 
1787  page = BufferGetPage(buf);
1788  TestForOldSnapshot(snapshot, rel, page);
1789  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1790 
1791  for (;;)
1792  {
1793  /*
1794  * If we landed on a deleted page, step right to find a live page
1795  * (there must be one). Also, if we want the rightmost page, step
1796  * right if needed to get to it (this could happen if the page split
1797  * since we obtained a pointer to it).
1798  */
1799  while (P_IGNORE(opaque) ||
1800  (rightmost && !P_RIGHTMOST(opaque)))
1801  {
1802  blkno = opaque->btpo_next;
1803  if (blkno == P_NONE)
1804  elog(ERROR, "fell off the end of index \"%s\"",
1806  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1807  page = BufferGetPage(buf);
1808  TestForOldSnapshot(snapshot, rel, page);
1809  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1810  }
1811 
1812  /* Done? */
1813  if (opaque->btpo.level == level)
1814  break;
1815  if (opaque->btpo.level < level)
1816  elog(ERROR, "btree level %u not found in index \"%s\"",
1817  level, RelationGetRelationName(rel));
1818 
1819  /* Descend to leftmost or rightmost child page */
1820  if (rightmost)
1821  offnum = PageGetMaxOffsetNumber(page);
1822  else
1823  offnum = P_FIRSTDATAKEY(opaque);
1824 
1825  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
1826  blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
1827 
1828  buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
1829  page = BufferGetPage(buf);
1830  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1831  }
1832 
1833  return buf;
1834 }
BlockNumber btpo_next
Definition: nbtree.h:57
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:265
#define P_IGNORE(opaque)
Definition: nbtree.h:180
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:701
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:204
ItemPointerData t_tid
Definition: itup.h:37
#define P_NONE
Definition: nbtree.h:168
#define InvalidBuffer
Definition: buf.h:25
union BTPageOpaqueData::@40 btpo
uint32 BlockNumber
Definition: block.h:31
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:354
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
uint16 OffsetNumber
Definition: off.h:24
#define BT_READ
Definition: nbtree.h:237
#define ERROR
Definition: elog.h:43
static char * buf
Definition: pg_test_fsync.c:65
IndexTupleData * IndexTuple
Definition: itup.h:53
#define RelationGetRelationName(relation)
Definition: rel.h:437
Buffer _bt_getroot(Relation rel, int access)
Definition: nbtpage.c:104
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:232
uint32 level
Definition: nbtree.h:60
Buffer _bt_gettrueroot(Relation rel)
Definition: nbtpage.c:341
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
#define BufferIsValid(bufnum)
Definition: bufmgr.h:114
#define elog
Definition: elog.h:219
#define ItemPointerGetBlockNumber(pointer)
Definition: itemptr.h:75
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:175
#define PageGetItem(page, itemId)
Definition: bufpage.h:337
Pointer Page
Definition: bufpage.h:74
Buffer _bt_getbuf ( Relation  rel,
BlockNumber  blkno,
int  access 
)

Definition at line 570 of file nbtpage.c.

References _bt_checkpage(), _bt_log_reuse_page(), _bt_page_recyclable(), _bt_pageinit(), _bt_relbuf(), Assert, BT_WRITE, BTPageOpaqueData::btpo, buf, BufferGetPage, BufferGetPageSize, ConditionalLockBuffer(), DEBUG2, elog, ExclusiveLock, GetFreeIndexPage(), InvalidBlockNumber, LockBuffer(), LockRelationForExtension(), P_NEW, PageGetSpecialPointer, PageIsNew, ReadBuffer(), RELATION_IS_LOCAL, RelationNeedsWAL, ReleaseBuffer(), UnlockRelationForExtension(), BTPageOpaqueData::xact, and XLogStandbyInfoActive.

Referenced by _bt_finish_split(), _bt_getroot(), _bt_getrootheight(), _bt_getstackbuf(), _bt_gettrueroot(), _bt_insertonpg(), _bt_is_page_halfdead(), _bt_killitems(), _bt_lock_branch_parent(), _bt_moveright(), _bt_newroot(), _bt_pagedel(), _bt_readnextpage(), _bt_split(), _bt_unlink_halfdead_page(), and _bt_walk_left().

571 {
572  Buffer buf;
573 
574  if (blkno != P_NEW)
575  {
576  /* Read an existing block of the relation */
577  buf = ReadBuffer(rel, blkno);
578  LockBuffer(buf, access);
579  _bt_checkpage(rel, buf);
580  }
581  else
582  {
583  bool needLock;
584  Page page;
585 
586  Assert(access == BT_WRITE);
587 
588  /*
589  * First see if the FSM knows of any free pages.
590  *
591  * We can't trust the FSM's report unreservedly; we have to check that
592  * the page is still free. (For example, an already-free page could
593  * have been re-used between the time the last VACUUM scanned it and
594  * the time the VACUUM made its FSM updates.)
595  *
596  * In fact, it's worse than that: we can't even assume that it's safe
597  * to take a lock on the reported page. If somebody else has a lock
598  * on it, or even worse our own caller does, we could deadlock. (The
599  * own-caller scenario is actually not improbable. Consider an index
600  * on a serial or timestamp column. Nearly all splits will be at the
601  * rightmost page, so it's entirely likely that _bt_split will call us
602  * while holding a lock on the page most recently acquired from FSM. A
603  * VACUUM running concurrently with the previous split could well have
604  * placed that page back in FSM.)
605  *
606  * To get around that, we ask for only a conditional lock on the
607  * reported page. If we fail, then someone else is using the page,
608  * and we may reasonably assume it's not free. (If we happen to be
609  * wrong, the worst consequence is the page will be lost to use till
610  * the next VACUUM, which is no big problem.)
611  */
612  for (;;)
613  {
614  blkno = GetFreeIndexPage(rel);
615  if (blkno == InvalidBlockNumber)
616  break;
617  buf = ReadBuffer(rel, blkno);
618  if (ConditionalLockBuffer(buf))
619  {
620  page = BufferGetPage(buf);
621  if (_bt_page_recyclable(page))
622  {
623  /*
624  * If we are generating WAL for Hot Standby then create a
625  * WAL record that will allow us to conflict with queries
626  * running on standby.
627  */
629  {
631 
632  _bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
633  }
634 
635  /* Okay to use page. Re-initialize and return it */
636  _bt_pageinit(page, BufferGetPageSize(buf));
637  return buf;
638  }
639  elog(DEBUG2, "FSM returned nonrecyclable page");
640  _bt_relbuf(rel, buf);
641  }
642  else
643  {
644  elog(DEBUG2, "FSM returned nonlockable page");
645  /* couldn't get lock, so just drop pin */
646  ReleaseBuffer(buf);
647  }
648  }
649 
650  /*
651  * Extend the relation by one page.
652  *
653  * We have to use a lock to ensure no one else is extending the rel at
654  * the same time, else we will both try to initialize the same new
655  * page. We can skip locking for new or temp relations, however,
656  * since no one else could be accessing them.
657  */
658  needLock = !RELATION_IS_LOCAL(rel);
659 
660  if (needLock)
662 
663  buf = ReadBuffer(rel, P_NEW);
664 
665  /* Acquire buffer lock on new page */
666  LockBuffer(buf, BT_WRITE);
667 
668  /*
669  * Release the file-extension lock; it's now OK for someone else to
670  * extend the relation some more. Note that we cannot release this
671  * lock before we have buffer lock on the new page, or we risk a race
672  * condition against btvacuumscan --- see comments therein.
673  */
674  if (needLock)
676 
677  /* Initialize the new page before returning it */
678  page = BufferGetPage(buf);
679  Assert(PageIsNew(page));
680  _bt_pageinit(page, BufferGetPageSize(buf));
681  }
682 
683  /* ref count and lock type are correct */
684  return buf;
685 }
#define ExclusiveLock
Definition: lockdefs.h:44
#define RELATION_IS_LOCAL(relation)
Definition: rel.h:524
union BTPageOpaqueData::@40 btpo
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3309
#define P_NEW
Definition: bufmgr.h:82
bool _bt_page_recyclable(Page page)
Definition: nbtpage.c:744
TransactionId xact
Definition: nbtree.h:61
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:503
#define DEBUG2
Definition: elog.h:24
static char * buf
Definition: pg_test_fsync.c:65
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:3572
static void _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
Definition: nbtpage.c:537
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:332
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:382
#define BufferGetPageSize(buffer)
Definition: bufmgr.h:147
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3546
#define XLogStandbyInfoActive()
Definition: xlog.h:159
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:720
BlockNumber GetFreeIndexPage(Relation rel)
Definition: indexfsm.c:38
#define Assert(condition)
Definition: c.h:675
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:594
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
#define InvalidBlockNumber
Definition: block.h:33
#define RelationNeedsWAL(relation)
Definition: rel.h:506
#define PageIsNew(page)
Definition: bufpage.h:226
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:732
#define elog
Definition: elog.h:219
#define BT_WRITE
Definition: nbtree.h:238
int Buffer
Definition: buf.h:23
Pointer Page
Definition: bufpage.h:74
Buffer _bt_getroot ( Relation  rel,
int  access 
)

Definition at line 104 of file nbtpage.c.

References _bt_getbuf(), _bt_getroot(), _bt_relandgetbuf(), _bt_relbuf(), Assert, BT_READ, BT_WRITE, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_LEAF, BTP_META, BTP_ROOT, BTPageGetMeta, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_MAGIC, BTREE_METAPAGE, BTREE_VERSION, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage, elog, END_CRIT_SECTION, ereport, errcode(), errmsg(), ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, InvalidBuffer, xl_btree_metadata::level, BTPageOpaqueData::level, xl_btree_newroot::level, LockBuffer(), MarkBufferDirty(), MemoryContextAlloc(), NULL, P_IGNORE, P_LEFTMOST, P_NEW, P_NONE, P_RIGHTMOST, PageGetSpecialPointer, PageSetLSN, pfree(), RelationData::rd_amcache, RelationData::rd_indexcxt, REGBUF_WILL_INIT, RelationGetRelationName, RelationNeedsWAL, xl_btree_metadata::root, xl_btree_newroot::rootblk, SizeOfBtreeNewroot, START_CRIT_SECTION, XLOG_BTREE_NEWROOT, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_get_endpoint(), _bt_getroot(), and _bt_search().

105 {
106  Buffer metabuf;
107  Page metapg;
108  BTPageOpaque metaopaque;
109  Buffer rootbuf;
110  Page rootpage;
111  BTPageOpaque rootopaque;
112  BlockNumber rootblkno;
113  uint32 rootlevel;
114  BTMetaPageData *metad;
115 
116  /*
117  * Try to use previously-cached metapage data to find the root. This
118  * normally saves one buffer access per index search, which is a very
119  * helpful savings in bufmgr traffic and hence contention.
120  */
121  if (rel->rd_amcache != NULL)
122  {
123  metad = (BTMetaPageData *) rel->rd_amcache;
124  /* We shouldn't have cached it if any of these fail */
125  Assert(metad->btm_magic == BTREE_MAGIC);
126  Assert(metad->btm_version == BTREE_VERSION);
127  Assert(metad->btm_root != P_NONE);
128 
129  rootblkno = metad->btm_fastroot;
130  Assert(rootblkno != P_NONE);
131  rootlevel = metad->btm_fastlevel;
132 
133  rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
134  rootpage = BufferGetPage(rootbuf);
135  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
136 
137  /*
138  * Since the cache might be stale, we check the page more carefully
139  * here than normal. We *must* check that it's not deleted. If it's
140  * not alone on its level, then we reject too --- this may be overly
141  * paranoid but better safe than sorry. Note we don't check P_ISROOT,
142  * because that's not set in a "fast root".
143  */
144  if (!P_IGNORE(rootopaque) &&
145  rootopaque->btpo.level == rootlevel &&
146  P_LEFTMOST(rootopaque) &&
147  P_RIGHTMOST(rootopaque))
148  {
149  /* OK, accept cached page as the root */
150  return rootbuf;
151  }
152  _bt_relbuf(rel, rootbuf);
153  /* Cache is stale, throw it away */
154  if (rel->rd_amcache)
155  pfree(rel->rd_amcache);
156  rel->rd_amcache = NULL;
157  }
158 
159  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
160  metapg = BufferGetPage(metabuf);
161  metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
162  metad = BTPageGetMeta(metapg);
163 
164  /* sanity-check the metapage */
165  if (!(metaopaque->btpo_flags & BTP_META) ||
166  metad->btm_magic != BTREE_MAGIC)
167  ereport(ERROR,
168  (errcode(ERRCODE_INDEX_CORRUPTED),
169  errmsg("index \"%s\" is not a btree",
170  RelationGetRelationName(rel))));
171 
172  if (metad->btm_version != BTREE_VERSION)
173  ereport(ERROR,
174  (errcode(ERRCODE_INDEX_CORRUPTED),
175  errmsg("version mismatch in index \"%s\": file version %d, code version %d",
177  metad->btm_version, BTREE_VERSION)));
178 
179  /* if no root page initialized yet, do it */
180  if (metad->btm_root == P_NONE)
181  {
182  /* If access = BT_READ, caller doesn't want us to create root yet */
183  if (access == BT_READ)
184  {
185  _bt_relbuf(rel, metabuf);
186  return InvalidBuffer;
187  }
188 
189  /* trade in our read lock for a write lock */
190  LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
191  LockBuffer(metabuf, BT_WRITE);
192 
193  /*
194  * Race condition: if someone else initialized the metadata between
195  * the time we released the read lock and acquired the write lock, we
196  * must avoid doing it again.
197  */
198  if (metad->btm_root != P_NONE)
199  {
200  /*
201  * Metadata initialized by someone else. In order to guarantee no
202  * deadlocks, we have to release the metadata page and start all
203  * over again. (Is that really true? But it's hardly worth trying
204  * to optimize this case.)
205  */
206  _bt_relbuf(rel, metabuf);
207  return _bt_getroot(rel, access);
208  }
209 
210  /*
211  * Get, initialize, write, and leave a lock of the appropriate type on
212  * the new root page. Since this is the first page in the tree, it's
213  * a leaf as well as the root.
214  */
215  rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
216  rootblkno = BufferGetBlockNumber(rootbuf);
217  rootpage = BufferGetPage(rootbuf);
218  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
219  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
220  rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
221  rootopaque->btpo.level = 0;
222  rootopaque->btpo_cycleid = 0;
223 
224  /* NO ELOG(ERROR) till meta is updated */
226 
227  metad->btm_root = rootblkno;
228  metad->btm_level = 0;
229  metad->btm_fastroot = rootblkno;
230  metad->btm_fastlevel = 0;
231 
232  MarkBufferDirty(rootbuf);
233  MarkBufferDirty(metabuf);
234 
235  /* XLOG stuff */
236  if (RelationNeedsWAL(rel))
237  {
238  xl_btree_newroot xlrec;
239  XLogRecPtr recptr;
241 
242  XLogBeginInsert();
245 
246  md.root = rootblkno;
247  md.level = 0;
248  md.fastroot = rootblkno;
249  md.fastlevel = 0;
250 
251  XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
252 
253  xlrec.rootblk = rootblkno;
254  xlrec.level = 0;
255 
256  XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
257 
258  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
259 
260  PageSetLSN(rootpage, recptr);
261  PageSetLSN(metapg, recptr);
262  }
263 
265 
266  /*
267  * swap root write lock for read lock. There is no danger of anyone
268  * else accessing the new root page while it's unlocked, since no one
269  * else knows where it is yet.
270  */
271  LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
272  LockBuffer(rootbuf, BT_READ);
273 
274  /* okay, metadata is correct, release lock on it */
275  _bt_relbuf(rel, metabuf);
276  }
277  else
278  {
279  rootblkno = metad->btm_fastroot;
280  Assert(rootblkno != P_NONE);
281  rootlevel = metad->btm_fastlevel;
282 
283  /*
284  * Cache the metapage data for next time
285  */
287  sizeof(BTMetaPageData));
288  memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
289 
290  /*
291  * We are done with the metapage; arrange to release it via first
292  * _bt_relandgetbuf call
293  */
294  rootbuf = metabuf;
295 
296  for (;;)
297  {
298  rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
299  rootpage = BufferGetPage(rootbuf);
300  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
301 
302  if (!P_IGNORE(rootopaque))
303  break;
304 
305  /* it's dead, Jim. step right one page */
306  if (P_RIGHTMOST(rootopaque))
307  elog(ERROR, "no live root page found in index \"%s\"",
309  rootblkno = rootopaque->btpo_next;
310  }
311 
312  /* Note: can't check btpo.level on deleted pages */
313  if (rootopaque->btpo.level != rootlevel)
314  elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
315  rootblkno, RelationGetRelationName(rel),
316  rootopaque->btpo.level, rootlevel);
317  }
318 
319  /*
320  * By here, we have a pin and read lock on the root page, and no lock set
321  * on the metadata page. Return the root page's buffer.
322  */
323  return rootbuf;
324 }
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:361
BlockNumber rootblk
Definition: nbtxlog.h:240
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:87
#define BTP_ROOT
Definition: nbtree.h:71
BlockNumber btpo_next
Definition: nbtree.h:57
#define P_IGNORE(opaque)
Definition: nbtree.h:180
uint32 btm_version
Definition: nbtree.h:99
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:701
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:570
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1450
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:213
#define BTREE_VERSION
Definition: nbtree.h:111
uint32 btm_magic
Definition: nbtree.h:98
#define BTP_LEAF
Definition: nbtree.h:70
#define END_CRIT_SECTION()
Definition: miscadmin.h:132
BlockNumber root
Definition: nbtxlog.h:48
#define P_NONE
Definition: nbtree.h:168
#define InvalidBuffer
Definition: buf.h:25
#define REGBUF_WILL_INIT
Definition: xloginsert.h:32
#define START_CRIT_SECTION()
Definition: miscadmin.h:130
int errcode(int sqlerrcode)
Definition: elog.c:575
uint32 level
Definition: nbtxlog.h:241
union BTPageOpaqueData::@40 btpo
uint32 BlockNumber
Definition: block.h:31
#define P_NEW
Definition: bufmgr.h:82
#define BTP_META
Definition: nbtree.h:73
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
#define BT_READ
Definition: nbtree.h:237
BlockNumber btm_fastroot
Definition: nbtree.h:102
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:36
void pfree(void *pointer)
Definition: mcxt.c:950
#define BTREE_MAGIC
Definition: nbtree.h:110
#define ERROR
Definition: elog.h:43
BTCycleId btpo_cycleid
Definition: nbtree.h:64
#define BTPageGetMeta(p)
Definition: nbtree.h:106
BlockNumber btpo_prev
Definition: nbtree.h:56
#define RelationGetRelationName(relation)
Definition: rel.h:437
#define P_LEFTMOST(opaque)
Definition: nbtree.h:174
unsigned int uint32
Definition: c.h:268
Buffer _bt_getroot(Relation rel, int access)
Definition: nbtpage.c:104
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define ereport(elevel, rest)
Definition: elog.h:122
#define BTREE_METAPAGE
Definition: nbtree.h:109
#define SizeOfBtreeNewroot
Definition: nbtxlog.h:244
uint32 btm_fastlevel
Definition: nbtree.h:103
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:323
uint32 level
Definition: nbtree.h:60
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:415
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3546
BlockNumber btm_root
Definition: nbtree.h:100
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:720
#define NULL
Definition: c.h:229
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:675
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
#define RelationNeedsWAL(relation)
Definition: rel.h:506
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2605
int errmsg(const char *fmt,...)
Definition: elog.c:797
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:707
uint32 fastlevel
Definition: nbtxlog.h:51
uint32 btm_level
Definition: nbtree.h:101
MemoryContext rd_indexcxt
Definition: rel.h:179
uint32 level
Definition: nbtxlog.h:49
BlockNumber fastroot
Definition: nbtxlog.h:50
#define elog
Definition: elog.h:219
void * rd_amcache
Definition: rel.h:192
#define BT_WRITE
Definition: nbtree.h:238
void XLogBeginInsert(void)
Definition: xloginsert.c:120
uint16 btpo_flags
Definition: nbtree.h:63
#define PageSetLSN(page, lsn)
Definition: bufpage.h:365
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:175
Pointer Page
Definition: bufpage.h:74
int _bt_getrootheight ( Relation  rel)

Definition at line 435 of file nbtpage.c.

References _bt_getbuf(), _bt_relbuf(), Assert, BT_READ, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_META, BTPageGetMeta, BTPageOpaqueData::btpo_flags, BTREE_MAGIC, BTREE_METAPAGE, BTREE_VERSION, BufferGetPage, ereport, errcode(), errmsg(), ERROR, MemoryContextAlloc(), NULL, P_NONE, PageGetSpecialPointer, RelationData::rd_amcache, RelationData::rd_indexcxt, and RelationGetRelationName.

Referenced by get_relation_info().

436 {
437  BTMetaPageData *metad;
438 
439  /*
440  * We can get what we need from the cached metapage data. If it's not
441  * cached yet, load it. Sanity checks here must match _bt_getroot().
442  */
443  if (rel->rd_amcache == NULL)
444  {
445  Buffer metabuf;
446  Page metapg;
447  BTPageOpaque metaopaque;
448 
449  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
450  metapg = BufferGetPage(metabuf);
451  metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
452  metad = BTPageGetMeta(metapg);
453 
454  /* sanity-check the metapage */
455  if (!(metaopaque->btpo_flags & BTP_META) ||
456  metad->btm_magic != BTREE_MAGIC)
457  ereport(ERROR,
458  (errcode(ERRCODE_INDEX_CORRUPTED),
459  errmsg("index \"%s\" is not a btree",
460  RelationGetRelationName(rel))));
461 
462  if (metad->btm_version != BTREE_VERSION)
463  ereport(ERROR,
464  (errcode(ERRCODE_INDEX_CORRUPTED),
465  errmsg("version mismatch in index \"%s\": file version %d, code version %d",
467  metad->btm_version, BTREE_VERSION)));
468 
469  /*
470  * If there's no root page yet, _bt_getroot() doesn't expect a cache
471  * to be made, so just stop here and report the index height is zero.
472  * (XXX perhaps _bt_getroot() should be changed to allow this case.)
473  */
474  if (metad->btm_root == P_NONE)
475  {
476  _bt_relbuf(rel, metabuf);
477  return 0;
478  }
479 
480  /*
481  * Cache the metapage data for next time
482  */
484  sizeof(BTMetaPageData));
485  memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
486 
487  _bt_relbuf(rel, metabuf);
488  }
489 
490  metad = (BTMetaPageData *) rel->rd_amcache;
491  /* We shouldn't have cached it if any of these fail */
492  Assert(metad->btm_magic == BTREE_MAGIC);
493  Assert(metad->btm_version == BTREE_VERSION);
494  Assert(metad->btm_fastroot != P_NONE);
495 
496  return metad->btm_fastlevel;
497 }
uint32 btm_version
Definition: nbtree.h:99
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:570
#define BTREE_VERSION
Definition: nbtree.h:111
uint32 btm_magic
Definition: nbtree.h:98
#define P_NONE
Definition: nbtree.h:168
int errcode(int sqlerrcode)
Definition: elog.c:575
#define BTP_META
Definition: nbtree.h:73
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
#define BT_READ
Definition: nbtree.h:237
BlockNumber btm_fastroot
Definition: nbtree.h:102
#define BTREE_MAGIC
Definition: nbtree.h:110
#define ERROR
Definition: elog.h:43
#define BTPageGetMeta(p)
Definition: nbtree.h:106
#define RelationGetRelationName(relation)
Definition: rel.h:437
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define ereport(elevel, rest)
Definition: elog.h:122
#define BTREE_METAPAGE
Definition: nbtree.h:109
uint32 btm_fastlevel
Definition: nbtree.h:103
BlockNumber btm_root
Definition: nbtree.h:100
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:720
#define NULL
Definition: c.h:229
#define Assert(condition)
Definition: c.h:675
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
int errmsg(const char *fmt,...)
Definition: elog.c:797
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:707
MemoryContext rd_indexcxt
Definition: rel.h:179
void * rd_amcache
Definition: rel.h:192
uint16 btpo_flags
Definition: nbtree.h:63
int Buffer
Definition: buf.h:23
Pointer Page
Definition: bufpage.h:74
Buffer _bt_getstackbuf ( Relation  rel,
BTStack  stack,
int  access 
)

Definition at line 1804 of file nbtinsert.c.

References _bt_finish_split(), _bt_getbuf(), _bt_relbuf(), BT_WRITE, BTEntrySame, BTPageOpaqueData::btpo_next, BTStackData::bts_blkno, BTStackData::bts_btentry, BTStackData::bts_offset, BTStackData::bts_parent, buf, BufferGetPage, InvalidBuffer, InvalidOffsetNumber, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, and PageGetSpecialPointer.

Referenced by _bt_insert_parent(), and _bt_lock_branch_parent().

1805 {
1806  BlockNumber blkno;
1807  OffsetNumber start;
1808 
1809  blkno = stack->bts_blkno;
1810  start = stack->bts_offset;
1811 
1812  for (;;)
1813  {
1814  Buffer buf;
1815  Page page;
1816  BTPageOpaque opaque;
1817 
1818  buf = _bt_getbuf(rel, blkno, access);
1819  page = BufferGetPage(buf);
1820  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1821 
1822  if (access == BT_WRITE && P_INCOMPLETE_SPLIT(opaque))
1823  {
1824  _bt_finish_split(rel, buf, stack->bts_parent);
1825  continue;
1826  }
1827 
1828  if (!P_IGNORE(opaque))
1829  {
1830  OffsetNumber offnum,
1831  minoff,
1832  maxoff;
1833  ItemId itemid;
1834  IndexTuple item;
1835 
1836  minoff = P_FIRSTDATAKEY(opaque);
1837  maxoff = PageGetMaxOffsetNumber(page);
1838 
1839  /*
1840  * start = InvalidOffsetNumber means "search the whole page". We
1841  * need this test anyway due to possibility that page has a high
1842  * key now when it didn't before.
1843  */
1844  if (start < minoff)
1845  start = minoff;
1846 
1847  /*
1848  * Need this check too, to guard against possibility that page
1849  * split since we visited it originally.
1850  */
1851  if (start > maxoff)
1852  start = OffsetNumberNext(maxoff);
1853 
1854  /*
1855  * These loops will check every item on the page --- but in an
1856  * order that's attuned to the probability of where it actually
1857  * is. Scan to the right first, then to the left.
1858  */
1859  for (offnum = start;
1860  offnum <= maxoff;
1861  offnum = OffsetNumberNext(offnum))
1862  {
1863  itemid = PageGetItemId(page, offnum);
1864  item = (IndexTuple) PageGetItem(page, itemid);
1865  if (BTEntrySame(item, &stack->bts_btentry))
1866  {
1867  /* Return accurate pointer to where link is now */
1868  stack->bts_blkno = blkno;
1869  stack->bts_offset = offnum;
1870  return buf;
1871  }
1872  }
1873 
1874  for (offnum = OffsetNumberPrev(start);
1875  offnum >= minoff;
1876  offnum = OffsetNumberPrev(offnum))
1877  {
1878  itemid = PageGetItemId(page, offnum);
1879  item = (IndexTuple) PageGetItem(page, itemid);
1880  if (BTEntrySame(item, &stack->bts_btentry))
1881  {
1882  /* Return accurate pointer to where link is now */
1883  stack->bts_blkno = blkno;
1884  stack->bts_offset = offnum;
1885  return buf;
1886  }
1887  }
1888  }
1889 
1890  /*
1891  * The item we're looking for moved right at least one page.
1892  */
1893  if (P_RIGHTMOST(opaque))
1894  {
1895  _bt_relbuf(rel, buf);
1896  return InvalidBuffer;
1897  }
1898  blkno = opaque->btpo_next;
1899  start = InvalidOffsetNumber;
1900  _bt_relbuf(rel, buf);
1901  }
1902 }
BlockNumber btpo_next
Definition: nbtree.h:57
#define P_IGNORE(opaque)
Definition: nbtree.h:180
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:570
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:204
#define InvalidBuffer
Definition: buf.h:25
#define BTEntrySame(i1, i2)
Definition: nbtree.h:156
uint32 BlockNumber
Definition: block.h:31
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:182
IndexTupleData bts_btentry
Definition: nbtree.h:254
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:354
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
uint16 OffsetNumber
Definition: off.h:24
void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:1745
OffsetNumber bts_offset
Definition: nbtree.h:253
static char * buf
Definition: pg_test_fsync.c:65
IndexTupleData * IndexTuple
Definition: itup.h:53
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:232
#define InvalidOffsetNumber
Definition: off.h:26
BlockNumber bts_blkno
Definition: nbtree.h:252
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:720
#define OffsetNumberNext(offsetNumber)
Definition: off.h:53
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
#define OffsetNumberPrev(offsetNumber)
Definition: off.h:55
struct BTStackData * bts_parent
Definition: nbtree.h:255
#define BT_WRITE
Definition: nbtree.h:238
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:175
#define PageGetItem(page, itemId)
Definition: bufpage.h:337
Pointer Page
Definition: bufpage.h:74
Buffer _bt_gettrueroot ( Relation  rel)

Definition at line 341 of file nbtpage.c.

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_META, BTPageGetMeta, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTREE_MAGIC, BTREE_METAPAGE, BTREE_VERSION, BufferGetPage, elog, ereport, errcode(), errmsg(), ERROR, InvalidBuffer, BTPageOpaqueData::level, NULL, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetSpecialPointer, pfree(), RelationData::rd_amcache, and RelationGetRelationName.

Referenced by _bt_get_endpoint().

342 {
343  Buffer metabuf;
344  Page metapg;
345  BTPageOpaque metaopaque;
346  Buffer rootbuf;
347  Page rootpage;
348  BTPageOpaque rootopaque;
349  BlockNumber rootblkno;
350  uint32 rootlevel;
351  BTMetaPageData *metad;
352 
353  /*
354  * We don't try to use cached metapage data here, since (a) this path is
355  * not performance-critical, and (b) if we are here it suggests our cache
356  * is out-of-date anyway. In light of point (b), it's probably safest to
357  * actively flush any cached metapage info.
358  */
359  if (rel->rd_amcache)
360  pfree(rel->rd_amcache);
361  rel->rd_amcache = NULL;
362 
363  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
364  metapg = BufferGetPage(metabuf);
365  metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
366  metad = BTPageGetMeta(metapg);
367 
368  if (!(metaopaque->btpo_flags & BTP_META) ||
369  metad->btm_magic != BTREE_MAGIC)
370  ereport(ERROR,
371  (errcode(ERRCODE_INDEX_CORRUPTED),
372  errmsg("index \"%s\" is not a btree",
373  RelationGetRelationName(rel))));
374 
375  if (metad->btm_version != BTREE_VERSION)
376  ereport(ERROR,
377  (errcode(ERRCODE_INDEX_CORRUPTED),
378  errmsg("version mismatch in index \"%s\": file version %d, code version %d",
380  metad->btm_version, BTREE_VERSION)));
381 
382  /* if no root page initialized yet, fail */
383  if (metad->btm_root == P_NONE)
384  {
385  _bt_relbuf(rel, metabuf);
386  return InvalidBuffer;
387  }
388 
389  rootblkno = metad->btm_root;
390  rootlevel = metad->btm_level;
391 
392  /*
393  * We are done with the metapage; arrange to release it via first
394  * _bt_relandgetbuf call
395  */
396  rootbuf = metabuf;
397 
398  for (;;)
399  {
400  rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
401  rootpage = BufferGetPage(rootbuf);
402  rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
403 
404  if (!P_IGNORE(rootopaque))
405  break;
406 
407  /* it's dead, Jim. step right one page */
408  if (P_RIGHTMOST(rootopaque))
409  elog(ERROR, "no live root page found in index \"%s\"",
411  rootblkno = rootopaque->btpo_next;
412  }
413 
414  /* Note: can't check btpo.level on deleted pages */
415  if (rootopaque->btpo.level != rootlevel)
416  elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
417  rootblkno, RelationGetRelationName(rel),
418  rootopaque->btpo.level, rootlevel);
419 
420  return rootbuf;
421 }
BlockNumber btpo_next
Definition: nbtree.h:57
#define P_IGNORE(opaque)
Definition: nbtree.h:180
uint32 btm_version
Definition: nbtree.h:99
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:701
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:570
#define BTREE_VERSION
Definition: nbtree.h:111
uint32 btm_magic
Definition: nbtree.h:98
#define P_NONE
Definition: nbtree.h:168
#define InvalidBuffer
Definition: buf.h:25
int errcode(int sqlerrcode)
Definition: elog.c:575
union BTPageOpaqueData::@40 btpo
uint32 BlockNumber
Definition: block.h:31
#define BTP_META
Definition: nbtree.h:73
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
#define BT_READ
Definition: nbtree.h:237
void pfree(void *pointer)
Definition: mcxt.c:950
#define BTREE_MAGIC
Definition: nbtree.h:110
#define ERROR
Definition: elog.h:43
#define BTPageGetMeta(p)
Definition: nbtree.h:106
#define RelationGetRelationName(relation)
Definition: rel.h:437
unsigned int uint32
Definition: c.h:268
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define ereport(elevel, rest)
Definition: elog.h:122
#define BTREE_METAPAGE
Definition: nbtree.h:109
uint32 level
Definition: nbtree.h:60
BlockNumber btm_root
Definition: nbtree.h:100
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:720
#define NULL
Definition: c.h:229
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
int errmsg(const char *fmt,...)
Definition: elog.c:797
uint32 btm_level
Definition: nbtree.h:101
#define elog
Definition: elog.h:219
void * rd_amcache
Definition: rel.h:192
uint16 btpo_flags
Definition: nbtree.h:63
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:175
Pointer Page
Definition: bufpage.h:74
void _bt_initmetapage ( Page  page,
BlockNumber  rootbknum,
uint32  level 
)

Definition at line 49 of file nbtpage.c.

References _bt_pageinit(), BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_META, BTPageGetMeta, BTPageOpaqueData::btpo_flags, BTREE_MAGIC, BTREE_VERSION, and PageGetSpecialPointer.

Referenced by _bt_uppershutdown(), and btbuildempty().

50 {
51  BTMetaPageData *metad;
52  BTPageOpaque metaopaque;
53 
54  _bt_pageinit(page, BLCKSZ);
55 
56  metad = BTPageGetMeta(page);
57  metad->btm_magic = BTREE_MAGIC;
58  metad->btm_version = BTREE_VERSION;
59  metad->btm_root = rootbknum;
60  metad->btm_level = level;
61  metad->btm_fastroot = rootbknum;
62  metad->btm_fastlevel = level;
63 
64  metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
65  metaopaque->btpo_flags = BTP_META;
66 
67  /*
68  * Set pd_lower just past the end of the metadata. This is not essential
69  * but it makes the page look compressible to xlog.c.
70  */
71  ((PageHeader) page)->pd_lower =
72  ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
73 }
uint32 btm_version
Definition: nbtree.h:99
#define BTREE_VERSION
Definition: nbtree.h:111
uint32 btm_magic
Definition: nbtree.h:98
#define BTP_META
Definition: nbtree.h:73
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
BlockNumber btm_fastroot
Definition: nbtree.h:102
#define BTREE_MAGIC
Definition: nbtree.h:110
#define BTPageGetMeta(p)
Definition: nbtree.h:106
uint32 btm_fastlevel
Definition: nbtree.h:103
BlockNumber btm_root
Definition: nbtree.h:100
PageHeaderData * PageHeader
Definition: bufpage.h:162
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
uint32 btm_level
Definition: nbtree.h:101
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:732
uint16 btpo_flags
Definition: nbtree.h:63
void _bt_killitems ( IndexScanDesc  scan)

Definition at line 1732 of file nbtutils.c.

References _bt_getbuf(), _bt_relbuf(), Assert, BT_READ, BTP_HAS_GARBAGE, BTScanPosIsPinned, BTScanPosIsValid, buf, BTScanPosData::buf, BUFFER_LOCK_UNLOCK, BufferGetPage, BufferIsValid, BTScanPosData::currPage, BTScanOpaqueData::currPos, BTScanPosData::firstItem, BTScanPosItem::heapTid, i, BTScanPosItem::indexOffset, IndexScanDescData::indexRelation, ItemIdMarkDead, ItemPointerEquals(), BTScanPosData::items, BTScanOpaqueData::killedItems, LockBuffer(), BTScanPosData::lsn, MarkBufferDirtyHint(), BTScanOpaqueData::numKilled, OffsetNumberNext, IndexScanDescData::opaque, P_FIRSTDATAKEY, PageGetItem, PageGetItemId, PageGetLSN, PageGetMaxOffsetNumber, PageGetSpecialPointer, and IndexTupleData::t_tid.

Referenced by _bt_steppage(), btendscan(), btrescan(), and btrestrpos().

1733 {
1734  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1735  Page page;
1736  BTPageOpaque opaque;
1737  OffsetNumber minoff;
1738  OffsetNumber maxoff;
1739  int i;
1740  int numKilled = so->numKilled;
1741  bool killedsomething = false;
1742 
1744 
1745  /*
1746  * Always reset the scan state, so we don't look for same items on other
1747  * pages.
1748  */
1749  so->numKilled = 0;
1750 
1751  if (BTScanPosIsPinned(so->currPos))
1752  {
1753  /*
1754  * We have held the pin on this page since we read the index tuples,
1755  * so all we need to do is lock it. The pin will have prevented
1756  * re-use of any TID on the page, so there is no need to check the
1757  * LSN.
1758  */
1759  LockBuffer(so->currPos.buf, BT_READ);
1760 
1761  page = BufferGetPage(so->currPos.buf);
1762  }
1763  else
1764  {
1765  Buffer buf;
1766 
1767  /* Attempt to re-read the buffer, getting pin and lock. */
1768  buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ);
1769 
1770  /* It might not exist anymore; in which case we can't hint it. */
1771  if (!BufferIsValid(buf))
1772  return;
1773 
1774  page = BufferGetPage(buf);
1775  if (PageGetLSN(page) == so->currPos.lsn)
1776  so->currPos.buf = buf;
1777  else
1778  {
1779  /* Modified while not pinned means hinting is not safe. */
1780  _bt_relbuf(scan->indexRelation, buf);
1781  return;
1782  }
1783  }
1784 
1785  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1786  minoff = P_FIRSTDATAKEY(opaque);
1787  maxoff = PageGetMaxOffsetNumber(page);
1788 
1789  for (i = 0; i < numKilled; i++)
1790  {
1791  int itemIndex = so->killedItems[i];
1792  BTScanPosItem *kitem = &so->currPos.items[itemIndex];
1793  OffsetNumber offnum = kitem->indexOffset;
1794 
1795  Assert(itemIndex >= so->currPos.firstItem &&
1796  itemIndex <= so->currPos.lastItem);
1797  if (offnum < minoff)
1798  continue; /* pure paranoia */
1799  while (offnum <= maxoff)
1800  {
1801  ItemId iid = PageGetItemId(page, offnum);
1802  IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
1803 
1804  if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
1805  {
1806  /* found the item */
1807  ItemIdMarkDead(iid);
1808  killedsomething = true;
1809  break; /* out of inner search loop */
1810  }
1811  offnum = OffsetNumberNext(offnum);
1812  }
1813  }
1814 
1815  /*
1816  * Since this can be redone later if needed, mark as dirty hint.
1817  *
1818  * Whenever we mark anything LP_DEAD, we also set the page's
1819  * BTP_HAS_GARBAGE flag, which is likewise just a hint.
1820  */
1821  if (killedsomething)
1822  {
1823  opaque->btpo_flags |= BTP_HAS_GARBAGE;
1824  MarkBufferDirtyHint(so->currPos.buf, true);
1825  }
1826 
1828 }
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:87
BTScanPosItem items[MaxIndexTuplesPerPage]
Definition: nbtree.h:327
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:3379
#define ItemIdMarkDead(itemId)
Definition: itemid.h:178
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:570
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:204
ItemPointerData t_tid
Definition: itup.h:37
BlockNumber currPage
Definition: nbtree.h:298
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:349
OffsetNumber indexOffset
Definition: nbtree.h:289
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:354
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
Relation indexRelation
Definition: relscan.h:89
uint16 OffsetNumber
Definition: off.h:24
#define BT_READ
Definition: nbtree.h:237
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:416
static char * buf
Definition: pg_test_fsync.c:65
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:332
IndexTupleData * IndexTuple
Definition: itup.h:53
int firstItem
Definition: nbtree.h:323
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:232
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3546
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:720
int * killedItems
Definition: nbtree.h:391
#define Assert(condition)
Definition: c.h:675
#define OffsetNumberNext(offsetNumber)
Definition: off.h:53
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
#define BufferIsValid(bufnum)
Definition: bufmgr.h:114
bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2)
Definition: itemptr.c:29
ItemPointerData heapTid
Definition: nbtree.h:288
#define PageGetLSN(page)
Definition: bufpage.h:363
BTScanPosData currPos
Definition: nbtree.h:412
int i
XLogRecPtr lsn
Definition: nbtree.h:297
Buffer buf
Definition: nbtree.h:295
int Buffer
Definition: buf.h:23
#define BTP_HAS_GARBAGE
Definition: nbtree.h:76
#define PageGetItem(page, itemId)
Definition: bufpage.h:337
Pointer Page
Definition: bufpage.h:74
void _bt_leafbuild ( BTSpool btspool,
BTSpool spool2 
)

Definition at line 201 of file nbtsort.c.

References _bt_load(), BTREE_METAPAGE, BTWriteState::btws_pages_alloced, BTWriteState::btws_pages_written, BTWriteState::btws_use_wal, BTWriteState::btws_zeropage, BTSpool::heap, BTWriteState::heap, BTSpool::index, BTWriteState::index, log_btree_build_stats, NULL, RelationNeedsWAL, ResetUsage(), ShowUsage(), BTSpool::sortstate, tuplesort_performsort(), and XLogIsNeeded.

Referenced by btbuild().

202 {
203  BTWriteState wstate;
204 
205 #ifdef BTREE_BUILD_STATS
207  {
208  ShowUsage("BTREE BUILD (Spool) STATISTICS");
209  ResetUsage();
210  }
211 #endif /* BTREE_BUILD_STATS */
212 
214  if (btspool2)
215  tuplesort_performsort(btspool2->sortstate);
216 
217  wstate.heap = btspool->heap;
218  wstate.index = btspool->index;
219 
220  /*
221  * We need to log index creation in WAL iff WAL archiving/streaming is
222  * enabled UNLESS the index isn't WAL-logged anyway.
223  */
224  wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index);
225 
226  /* reserve the metapage */
227  wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
228  wstate.btws_pages_written = 0;
229  wstate.btws_zeropage = NULL; /* until needed */
230 
231  _bt_load(&wstate, btspool, btspool2);
232 }
void tuplesort_performsort(Tuplesortstate *state)
Definition: tuplesort.c:1763
Page btws_zeropage
Definition: nbtsort.c:127
static void _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
Definition: nbtsort.c:677
void ShowUsage(const char *title)
Definition: postgres.c:4388
BlockNumber btws_pages_alloced
Definition: nbtsort.c:125
#define XLogIsNeeded()
Definition: xlog.h:145
Tuplesortstate * sortstate
Definition: nbtsort.c:87
bool btws_use_wal
Definition: nbtsort.c:124
Relation heap
Definition: nbtsort.c:88
void ResetUsage(void)
Definition: postgres.c:4381
#define BTREE_METAPAGE
Definition: nbtree.h:109
Relation index
Definition: nbtsort.c:89
Relation index
Definition: nbtsort.c:123
BlockNumber btws_pages_written
Definition: nbtsort.c:126
bool log_btree_build_stats
Definition: guc.c:446
#define NULL
Definition: c.h:229
#define RelationNeedsWAL(relation)
Definition: rel.h:506
Relation heap
Definition: nbtsort.c:122
void _bt_mark_array_keys ( IndexScanDesc  scan)

Definition at line 606 of file nbtutils.c.

References BTScanOpaqueData::arrayKeys, BTArrayKeyInfo::cur_elem, i, BTArrayKeyInfo::mark_elem, BTScanOpaqueData::numArrayKeys, and IndexScanDescData::opaque.

607 {
608  BTScanOpaque so = (BTScanOpaque) scan->opaque;
609  int i;
610 
611  for (i = 0; i < so->numArrayKeys; i++)
612  {
613  BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
614 
615  curArrayKey->mark_elem = curArrayKey->cur_elem;
616  }
617 }
int mark_elem
Definition: nbtree.h:369
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:416
int cur_elem
Definition: nbtree.h:368
int numArrayKeys
Definition: nbtree.h:383
BTArrayKeyInfo * arrayKeys
Definition: nbtree.h:387
int i
ScanKey _bt_mkscankey ( Relation  rel,
IndexTuple  itup 
)

Definition at line 62 of file nbtutils.c.

References arg, BTORDER_PROC, i, index_getattr, index_getprocinfo(), InvalidOid, InvalidStrategy, palloc(), RelationData::rd_indcollation, RelationData::rd_indoption, RelationGetDescr, RelationGetNumberOfAttributes, ScanKeyEntryInitializeWithInfo(), SK_BT_INDOPTION_SHIFT, and SK_ISNULL.

Referenced by _bt_doinsert(), _bt_pagedel(), bt_right_page_check_scankey(), and bt_target_page_check().

63 {
64  ScanKey skey;
65  TupleDesc itupdesc;
66  int natts;
67  int16 *indoption;
68  int i;
69 
70  itupdesc = RelationGetDescr(rel);
71  natts = RelationGetNumberOfAttributes(rel);
72  indoption = rel->rd_indoption;
73 
74  skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
75 
76  for (i = 0; i < natts; i++)
77  {
78  FmgrInfo *procinfo;
79  Datum arg;
80  bool null;
81  int flags;
82 
83  /*
84  * We can use the cached (default) support procs since no cross-type
85  * comparison can be needed.
86  */
87  procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
88  arg = index_getattr(itup, i + 1, itupdesc, &null);
89  flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
91  flags,
92  (AttrNumber) (i + 1),
94  InvalidOid,
95  rel->rd_indcollation[i],
96  procinfo,
97  arg);
98  }
99 
100  return skey;
101 }
#define SK_BT_INDOPTION_SHIFT
Definition: nbtree.h:425
signed short int16
Definition: c.h:255
#define InvalidStrategy
Definition: stratnum.h:24
Definition: fmgr.h:56
#define BTORDER_PROC
Definition: nbtree.h:228
int16 * rd_indoption
Definition: rel.h:186
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:855
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
#define RelationGetDescr(relation)
Definition: rel.h:429
#define RelationGetNumberOfAttributes(relation)
Definition: rel.h:423
Oid * rd_indcollation
Definition: rel.h:193
ScanKeyData * ScanKey
Definition: skey.h:75
#define SK_ISNULL
Definition: skey.h:115
uintptr_t Datum
Definition: postgres.h:372
#define InvalidOid
Definition: postgres_ext.h:36
#define index_getattr(tup, attnum, tupleDesc, isnull)
Definition: itup.h:100
void * palloc(Size size)
Definition: mcxt.c:849
int i
void * arg
int16 AttrNumber
Definition: attnum.h:21
ScanKey _bt_mkscankey_nodata ( Relation  rel)

Definition at line 115 of file nbtutils.c.

References BTORDER_PROC, i, index_getprocinfo(), InvalidOid, InvalidStrategy, palloc(), RelationData::rd_indcollation, RelationData::rd_indoption, RelationGetNumberOfAttributes, ScanKeyEntryInitializeWithInfo(), SK_BT_INDOPTION_SHIFT, and SK_ISNULL.

Referenced by _bt_load(), tuplesort_begin_cluster(), and tuplesort_begin_index_btree().

116 {
117  ScanKey skey;
118  int natts;
119  int16 *indoption;
120  int i;
121 
122  natts = RelationGetNumberOfAttributes(rel);
123  indoption = rel->rd_indoption;
124 
125  skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
126 
127  for (i = 0; i < natts; i++)
128  {
129  FmgrInfo *procinfo;
130  int flags;
131 
132  /*
133  * We can use the cached (default) support procs since no cross-type
134  * comparison can be needed.
135  */
136  procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
137  flags = SK_ISNULL | (indoption[i] << SK_BT_INDOPTION_SHIFT);
139  flags,
140  (AttrNumber) (i + 1),
142  InvalidOid,
143  rel->rd_indcollation[i],
144  procinfo,
145  (Datum) 0);
146  }
147 
148  return skey;
149 }
#define SK_BT_INDOPTION_SHIFT
Definition: nbtree.h:425
signed short int16
Definition: c.h:255
#define InvalidStrategy
Definition: stratnum.h:24
Definition: fmgr.h:56
#define BTORDER_PROC
Definition: nbtree.h:228
int16 * rd_indoption
Definition: rel.h:186
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:855
void ScanKeyEntryInitializeWithInfo(ScanKey entry, int flags, AttrNumber attributeNumber, StrategyNumber strategy, Oid subtype, Oid collation, FmgrInfo *finfo, Datum argument)
Definition: scankey.c:101
#define RelationGetNumberOfAttributes(relation)
Definition: rel.h:423
Oid * rd_indcollation
Definition: rel.h:193
ScanKeyData * ScanKey
Definition: skey.h:75
#define SK_ISNULL
Definition: skey.h:115
uintptr_t Datum
Definition: postgres.h:372
#define InvalidOid
Definition: postgres_ext.h:36
void * palloc(Size size)
Definition: mcxt.c:849
int i
int16 AttrNumber
Definition: attnum.h:21
Buffer _bt_moveright ( Relation  rel,
Buffer  buf,
int  keysz,
ScanKey  scankey,
bool  nextkey,
bool  forupdate,
BTStack  stack,
int  access,
Snapshot  snapshot 
)

Definition at line 214 of file nbtsearch.c.

References _bt_compare(), _bt_finish_split(), _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BT_WRITE, BTPageOpaqueData::btpo_next, buf, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage, elog, ERROR, LockBuffer(), P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_RIGHTMOST, PageGetSpecialPointer, RelationGetRelationName, and TestForOldSnapshot().

Referenced by _bt_doinsert(), and _bt_search().

223 {
224  Page page;
225  BTPageOpaque opaque;
226  int32 cmpval;
227 
228  /*
229  * When nextkey = false (normal case): if the scan key that brought us to
230  * this page is > the high key stored on the page, then the page has split
231  * and we need to move right. (If the scan key is equal to the high key,
232  * we might or might not need to move right; have to scan the page first
233  * anyway.)
234  *
235  * When nextkey = true: move right if the scan key is >= page's high key.
236  *
237  * The page could even have split more than once, so scan as far as
238  * needed.
239  *
240  * We also have to move right if we followed a link that brought us to a
241  * dead page.
242  */
243  cmpval = nextkey ? 0 : 1;
244 
245  for (;;)
246  {
247  page = BufferGetPage(buf);
248  TestForOldSnapshot(snapshot, rel, page);
249  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
250 
251  if (P_RIGHTMOST(opaque))
252  break;
253 
254  /*
255  * Finish any incomplete splits we encounter along the way.
256  */
257  if (forupdate && P_INCOMPLETE_SPLIT(opaque))
258  {
260 
261  /* upgrade our lock if necessary */
262  if (access == BT_READ)
263  {
266  }
267 
268  if (P_INCOMPLETE_SPLIT(opaque))
269  _bt_finish_split(rel, buf, stack);
270  else
271  _bt_relbuf(rel, buf);
272 
273  /* re-acquire the lock in the right mode, and re-check */
274  buf = _bt_getbuf(rel, blkno, access);
275  continue;
276  }
277 
278  if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval)
279  {
280  /* step right one page */
281  buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
282  continue;
283  }
284  else
285  break;
286  }
287 
288  if (P_IGNORE(opaque))
289  elog(ERROR, "fell off the end of index \"%s\"",
291 
292  return buf;
293 }
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:87
BlockNumber btpo_next
Definition: nbtree.h:57
static void TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
Definition: bufmgr.h:265
#define P_IGNORE(opaque)
Definition: nbtree.h:180
Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Definition: nbtpage.c:701
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:570
uint32 BlockNumber
Definition: block.h:31
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:182
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
signed int int32
Definition: c.h:256
#define BT_READ
Definition: nbtree.h:237
void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
Definition: nbtinsert.c:1745
#define ERROR
Definition: elog.h:43
static char * buf
Definition: pg_test_fsync.c:65
#define RelationGetRelationName(relation)
Definition: rel.h:437
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3546
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:720
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
#define P_HIKEY
Definition: nbtree.h:202
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2605
#define elog
Definition: elog.h:219
int32 _bt_compare(Relation rel, int keysz, ScanKey scankey, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:428
#define BT_WRITE
Definition: nbtree.h:238
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:175
Pointer Page
Definition: bufpage.h:74
bool _bt_next ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 1129 of file nbtsearch.c.

References _bt_steppage(), BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::firstItem, BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, IndexScanDescData::opaque, ScanDirectionIsForward, HeapTupleData::t_self, IndexScanDescData::xs_ctup, IndexScanDescData::xs_itup, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

1130 {
1131  BTScanOpaque so = (BTScanOpaque) scan->opaque;
1132  BTScanPosItem *currItem;
1133 
1134  /*
1135  * Advance to next tuple on current page; or if there's no more, try to
1136  * step to the next page with data.
1137  */
1138  if (ScanDirectionIsForward(dir))
1139  {
1140  if (++so->currPos.itemIndex > so->currPos.lastItem)
1141  {
1142  if (!_bt_steppage(scan, dir))
1143  return false;
1144  }
1145  }
1146  else
1147  {
1148  if (--so->currPos.itemIndex < so->currPos.firstItem)
1149  {
1150  if (!_bt_steppage(scan, dir))
1151  return false;
1152  }
1153  }
1154 
1155  /* OK, itemIndex says what to return */
1156  currItem = &so->currPos.items[so->currPos.itemIndex];
1157  scan->xs_ctup.t_self = currItem->heapTid;
1158  if (scan->xs_want_itup)
1159  scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1160 
1161  return true;
1162 }
#define ScanDirectionIsForward(direction)
Definition: sdir.h:55
IndexTuple xs_itup
Definition: relscan.h:113
BTScanPosItem items[MaxIndexTuplesPerPage]
Definition: nbtree.h:327
int itemIndex
Definition: nbtree.h:325
char * currTuples
Definition: nbtree.h:399
int lastItem
Definition: nbtree.h:324
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:416
ItemPointerData t_self
Definition: htup.h:65
IndexTupleData * IndexTuple
Definition: itup.h:53
int firstItem
Definition: nbtree.h:323
bool xs_want_itup
Definition: relscan.h:95
HeapTupleData xs_ctup
Definition: relscan.h:119
BTScanPosData currPos
Definition: nbtree.h:412
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1342
bool _bt_page_recyclable ( Page  page)

Definition at line 744 of file nbtpage.c.

References BTPageOpaqueData::btpo, P_ISDELETED, PageGetSpecialPointer, PageIsNew, RecentGlobalXmin, TransactionIdPrecedes(), and BTPageOpaqueData::xact.

Referenced by _bt_getbuf(), and btvacuumpage().

745 {
746  BTPageOpaque opaque;
747 
748  /*
749  * It's possible to find an all-zeroes page in an index --- for example, a
750  * backend might successfully extend the relation one page and then crash
751  * before it is able to make a WAL entry for adding the page. If we find a
752  * zeroed page then reclaim it.
753  */
754  if (PageIsNew(page))
755  return true;
756 
757  /*
758  * Otherwise, recycle if deleted and too old to have any processes
759  * interested in it.
760  */
761  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
762  if (P_ISDELETED(opaque) &&
764  return true;
765  return false;
766 }
union BTPageOpaqueData::@40 btpo
TransactionId xact
Definition: nbtree.h:61
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
TransactionId RecentGlobalXmin
Definition: snapmgr.c:166
#define P_ISDELETED(opaque)
Definition: nbtree.h:178
bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition: transam.c:300
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
#define PageIsNew(page)
Definition: bufpage.h:226
int _bt_pagedel ( Relation  rel,
Buffer  buf 
)

Definition at line 1109 of file nbtpage.c.

References _bt_getbuf(), _bt_mark_page_halfdead(), _bt_mkscankey(), _bt_relbuf(), _bt_search(), _bt_unlink_halfdead_page(), Assert, BT_READ, BT_WRITE, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage, CopyIndexTuple(), ereport, errcode(), errhint(), errmsg(), LockBuffer(), LOG, NULL, P_FIRSTDATAKEY, P_HIKEY, P_INCOMPLETE_SPLIT, P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, P_ISROOT, P_LEFTMOST, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, RelationData::rd_rel, RelationGetRelationName, and ReleaseBuffer().

Referenced by btvacuumpage().

1110 {
1111  int ndeleted = 0;
1112  BlockNumber rightsib;
1113  bool rightsib_empty;
1114  Page page;
1115  BTPageOpaque opaque;
1116 
1117  /*
1118  * "stack" is a search stack leading (approximately) to the target page.
1119  * It is initially NULL, but when iterating, we keep it to avoid
1120  * duplicated search effort.
1121  *
1122  * Also, when "stack" is not NULL, we have already checked that the
1123  * current page is not the right half of an incomplete split, i.e. the
1124  * left sibling does not have its INCOMPLETE_SPLIT flag set.
1125  */
1126  BTStack stack = NULL;
1127 
1128  for (;;)
1129  {
1130  page = BufferGetPage(buf);
1131  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
1132 
1133  /*
1134  * Internal pages are never deleted directly, only as part of deleting
1135  * the whole branch all the way down to leaf level.
1136  */
1137  if (!P_ISLEAF(opaque))
1138  {
1139  /*
1140  * Pre-9.4 page deletion only marked internal pages as half-dead,
1141  * but now we only use that flag on leaf pages. The old algorithm
1142  * was never supposed to leave half-dead pages in the tree, it was
1143  * just a transient state, but it was nevertheless possible in
1144  * error scenarios. We don't know how to deal with them here. They
1145  * are harmless as far as searches are considered, but inserts
1146  * into the deleted keyspace could add out-of-order downlinks in
1147  * the upper levels. Log a notice, hopefully the admin will notice
1148  * and reindex.
1149  */
1150  if (P_ISHALFDEAD(opaque))
1151  ereport(LOG,
1152  (errcode(ERRCODE_INDEX_CORRUPTED),
1153  errmsg("index \"%s\" contains a half-dead internal page",
1155  errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
1156  _bt_relbuf(rel, buf);
1157  return ndeleted;
1158  }
1159 
1160  /*
1161  * We can never delete rightmost pages nor root pages. While at it,
1162  * check that page is not already deleted and is empty.
1163  *
1164  * To keep the algorithm simple, we also never delete an incompletely
1165  * split page (they should be rare enough that this doesn't make any
1166  * meaningful difference to disk usage):
1167  *
1168  * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
1169  * left half of an incomplete split, but ensuring that it's not the
1170  * right half is more complicated. For that, we have to check that
1171  * the left sibling doesn't have its INCOMPLETE_SPLIT flag set. On
1172  * the first iteration, we temporarily release the lock on the current
1173  * page, and check the left sibling and also construct a search stack
1174  * to. On subsequent iterations, we know we stepped right from a page
1175  * that passed these tests, so it's OK.
1176  */
1177  if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
1178  P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
1179  P_INCOMPLETE_SPLIT(opaque))
1180  {
1181  /* Should never fail to delete a half-dead page */
1182  Assert(!P_ISHALFDEAD(opaque));
1183 
1184  _bt_relbuf(rel, buf);
1185  return ndeleted;
1186  }
1187 
1188  /*
1189  * First, remove downlink pointing to the page (or a parent of the
1190  * page, if we are going to delete a taller branch), and mark the page
1191  * as half-dead.
1192  */
1193  if (!P_ISHALFDEAD(opaque))
1194  {
1195  /*
1196  * We need an approximate pointer to the page's parent page. We
1197  * use the standard search mechanism to search for the page's high
1198  * key; this will give us a link to either the current parent or
1199  * someplace to its left (if there are multiple equal high keys).
1200  *
1201  * Also check if this is the right-half of an incomplete split
1202  * (see comment above).
1203  */
1204  if (!stack)
1205  {
1206  ScanKey itup_scankey;
1207  ItemId itemid;
1208  IndexTuple targetkey;
1209  Buffer lbuf;
1210  BlockNumber leftsib;
1211 
1212  itemid = PageGetItemId(page, P_HIKEY);
1213  targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
1214 
1215  leftsib = opaque->btpo_prev;
1216 
1217  /*
1218  * To avoid deadlocks, we'd better drop the leaf page lock
1219  * before going further.
1220  */
1222 
1223  /*
1224  * Fetch the left sibling, to check that it's not marked with
1225  * INCOMPLETE_SPLIT flag. That would mean that the page
1226  * to-be-deleted doesn't have a downlink, and the page
1227  * deletion algorithm isn't prepared to handle that.
1228  */
1229  if (!P_LEFTMOST(opaque))
1230  {
1231  BTPageOpaque lopaque;
1232  Page lpage;
1233 
1234  lbuf = _bt_getbuf(rel, leftsib, BT_READ);
1235  lpage = BufferGetPage(lbuf);
1236  lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
1237 
1238  /*
1239  * If the left sibling is split again by another backend,
1240  * after we released the lock, we know that the first
1241  * split must have finished, because we don't allow an
1242  * incompletely-split page to be split again. So we don't
1243  * need to walk right here.
1244  */
1245  if (lopaque->btpo_next == BufferGetBlockNumber(buf) &&
1246  P_INCOMPLETE_SPLIT(lopaque))
1247  {
1248  ReleaseBuffer(buf);
1249  _bt_relbuf(rel, lbuf);
1250  return ndeleted;
1251  }
1252  _bt_relbuf(rel, lbuf);
1253  }
1254 
1255  /* we need an insertion scan key for the search, so build one */
1256  itup_scankey = _bt_mkscankey(rel, targetkey);
1257  /* find the leftmost leaf page containing this key */
1258  stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey,
1259  false, &lbuf, BT_READ, NULL);
1260  /* don't need a pin on the page */
1261  _bt_relbuf(rel, lbuf);
1262 
1263  /*
1264  * Re-lock the leaf page, and start over, to re-check that the
1265  * page can still be deleted.
1266  */
1268  continue;
1269  }
1270 
1271  if (!_bt_mark_page_halfdead(rel, buf, stack))
1272  {
1273  _bt_relbuf(rel, buf);
1274  return ndeleted;
1275  }
1276  }
1277 
1278  /*
1279  * Then unlink it from its siblings. Each call to
1280  * _bt_unlink_halfdead_page unlinks the topmost page from the branch,
1281  * making it shallower. Iterate until the leaf page is gone.
1282  */
1283  rightsib_empty = false;
1284  while (P_ISHALFDEAD(opaque))
1285  {
1286  if (!_bt_unlink_halfdead_page(rel, buf, &rightsib_empty))
1287  {
1288  /* _bt_unlink_halfdead_page already released buffer */
1289  return ndeleted;
1290  }
1291  ndeleted++;
1292  }
1293 
1294  rightsib = opaque->btpo_next;
1295 
1296  _bt_relbuf(rel, buf);
1297 
1298  /*
1299  * The page has now been deleted. If its right sibling is completely
1300  * empty, it's possible that the reason we haven't deleted it earlier
1301  * is that it was the rightmost child of the parent. Now that we
1302  * removed the downlink for this page, the right sibling might now be
1303  * the only child of the parent, and could be removed. It would be
1304  * picked up by the next vacuum anyway, but might as well try to
1305  * remove it now, so loop back to process the right sibling.
1306  */
1307  if (!rightsib_empty)
1308  break;
1309 
1310  buf = _bt_getbuf(rel, rightsib, BT_WRITE);
1311  }
1312 
1313  return ndeleted;
1314 }
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:87
BlockNumber btpo_next
Definition: nbtree.h:57
int errhint(const char *fmt,...)
Definition: elog.c:987
Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)
Definition: nbtpage.c:570
static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack)
Definition: nbtpage.c:1321
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:204
int errcode(int sqlerrcode)
Definition: elog.c:575
uint32 BlockNumber
Definition: block.h:31
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3309
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:182
#define LOG
Definition: elog.h:26
Form_pg_class rd_rel
Definition: rel.h:114
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:354
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:67
#define BT_READ
Definition: nbtree.h:237
#define P_ISHALFDEAD(opaque)
Definition: nbtree.h:179
BlockNumber btpo_prev
Definition: nbtree.h:56
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:434
static char * buf
Definition: pg_test_fsync.c:65
#define RelationGetRelationName(relation)
Definition: rel.h:437
#define P_LEFTMOST(opaque)
Definition: nbtree.h:174
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
#define ereport(elevel, rest)
Definition: elog.h:122
#define P_ISDELETED(opaque)
Definition: nbtree.h:178
#define P_ISROOT(opaque)
Definition: nbtree.h:177
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:232
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3546
BTStack _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, Buffer *bufP, int access, Snapshot snapshot)
Definition: nbtsearch.c:97
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:720
#define NULL
Definition: c.h:229
#define Assert(condition)
Definition: c.h:675
#define PageGetSpecialPointer(page)
Definition: bufpage.h:323
#define P_HIKEY
Definition: nbtree.h:202
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2605
int errmsg(const char *fmt,...)
Definition: elog.c:797
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
Definition: nbtpage.c:1512
ScanKey _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:62
#define BT_WRITE
Definition: nbtree.h:238
int Buffer
Definition: buf.h:23
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:175
#define PageGetItem(page, itemId)
Definition: bufpage.h:337
Pointer Page
Definition: bufpage.h:74
#define P_ISLEAF(opaque)
Definition: nbtree.h:176
void _bt_pageinit ( Page  page,
Size  size 
)

Definition at line 732 of file nbtpage.c.

References PageInit().

Referenced by _bt_blnewpage(), _bt_getbuf(), _bt_initmetapage(), _bt_restore_meta(), _bt_split(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), and btree_xlog_unlink_page().

733 {
734  PageInit(page, size, sizeof(BTPageOpaqueData));
735 }
void PageInit(Page page, Size pageSize, Size specialSize)
Definition: bufpage.c:41
void _bt_parallel_advance_array_keys ( IndexScanDesc  scan)

Definition at line 889 of file nbtree.c.

References BTScanOpaqueData::arrayKeyCount, BTPARALLEL_DONE, BTPARALLEL_NOT_INITIALIZED, InvalidBlockNumber, OffsetToPointer, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, SpinLockAcquire, and SpinLockRelease.

Referenced by _bt_advance_array_keys().

890 {
891  BTScanOpaque so = (BTScanOpaque) scan->opaque;
892  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
893  BTParallelScanDesc btscan;
894 
895  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
896  parallel_scan->ps_offset);
897 
898  so->arrayKeyCount++;
899  SpinLockAcquire(&btscan->btps_mutex);
900  if (btscan->btps_pageStatus == BTPARALLEL_DONE)
901  {
902  btscan->btps_scanPage = InvalidBlockNumber;
903  btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
904  btscan->btps_arrayKeyCount++;
905  }
906  SpinLockRelease(&btscan->btps_mutex);
907 }
ParallelIndexScanDesc parallel_scan
Definition: relscan.h:139
#define SpinLockAcquire(lock)
Definition: spin.h:62
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:416
int arrayKeyCount
Definition: nbtree.h:385
#define OffsetToPointer(base, offset)
Definition: c.h:535
#define SpinLockRelease(lock)
Definition: spin.h:64
#define InvalidBlockNumber
Definition: block.h:33
void _bt_parallel_done ( IndexScanDesc  scan)

Definition at line 848 of file nbtree.c.

References BTScanOpaqueData::arrayKeyCount, BTPARALLEL_DONE, ConditionVariableBroadcast(), NULL, OffsetToPointer, IndexScanDescData::opaque, IndexScanDescData::parallel_scan, SpinLockAcquire, and SpinLockRelease.

Referenced by _bt_first(), and _bt_readnextpage().

849 {
850  BTScanOpaque so = (BTScanOpaque) scan->opaque;
851  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
852  BTParallelScanDesc btscan;
853  bool status_changed = false;
854 
855  /* Do nothing, for non-parallel scans */
856  if (parallel_scan == NULL)
857  return;
858 
859  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
860  parallel_scan->ps_offset);
861 
862  /*
863  * Mark the parallel scan as done for this combination of scan keys,
864  * unless some other process already did so. See also
865  * _bt_advance_array_keys.
866  */
867  SpinLockAcquire(&btscan->btps_mutex);
868  if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
869  btscan->btps_pageStatus != BTPARALLEL_DONE)
870  {
871  btscan->btps_pageStatus = BTPARALLEL_DONE;
872  status_changed = true;
873  }
874  SpinLockRelease(&btscan->btps_mutex);
875 
876  /* wake up all the workers associated with this parallel scan */
877  if (status_changed)
878  ConditionVariableBroadcast(&btscan->btps_cv);
879 }
ParallelIndexScanDesc parallel_scan
Definition: relscan.h:139
int ConditionVariableBroadcast(ConditionVariable *cv)
struct BTParallelScanDescData * BTParallelScanDesc
Definition: nbtree.c:105
#define SpinLockAcquire(lock)
Definition: spin.h:62
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:416
int arrayKeyCount
Definition: nbtree.h:385
#define OffsetToPointer(base, offset)
Definition: c.h:535
#define SpinLockRelease(lock)
Definition: spin.h:64
#define NULL
Definition: c.h:229
void _bt_parallel_release ( IndexScanDesc  scan,
BlockNumber  scan_page 
)

Definition at line 825 of file nbtree.c.

References BTPARALLEL_IDLE, BTParallelScanDescData::btps_cv, BTParallelScanDescData::btps_mutex, BTParallelScanDescData::btps_pageStatus, BTParallelScanDescData::btps_scanPage, ConditionVariableSignal(), OffsetToPointer, IndexScanDescData::parallel_scan, ParallelIndexScanDescData::ps_offset, SpinLockAcquire, and SpinLockRelease.

Referenced by _bt_readpage().

826 {
827  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
828  BTParallelScanDesc btscan;
829 
830  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
831  parallel_scan->ps_offset);
832 
833  SpinLockAcquire(&btscan->btps_mutex);
834  btscan->btps_scanPage = scan_page;
836  SpinLockRelease(&btscan->btps_mutex);
838 }
ParallelIndexScanDesc parallel_scan
Definition: relscan.h:139
bool ConditionVariableSignal(ConditionVariable *cv)
struct BTParallelScanDescData * BTParallelScanDesc
Definition: nbtree.c:105
#define SpinLockAcquire(lock)
Definition: spin.h:62
ConditionVariable btps_cv
Definition: nbtree.c:102
#define OffsetToPointer(base, offset)
Definition: c.h:535
#define SpinLockRelease(lock)
Definition: spin.h:64
BlockNumber btps_scanPage
Definition: nbtree.c:94
BTPS_State btps_pageStatus
Definition: nbtree.c:95
bool _bt_parallel_seize ( IndexScanDesc  scan,
BlockNumber pageno 
)

Definition at line 767 of file nbtree.c.

References BTScanOpaqueData::arrayKeyCount, BTPARALLEL_ADVANCING, BTPARALLEL_DONE, ConditionVariableCancelSleep(), ConditionVariableSleep(), OffsetToPointer, IndexScanDescData::opaque, P_NONE, IndexScanDescData::parallel_scan, SpinLockAcquire, SpinLockRelease, status(), and WAIT_EVENT_BTREE_PAGE.

Referenced by _bt_first(), _bt_readnextpage(), and _bt_steppage().

768 {
769  BTScanOpaque so = (BTScanOpaque) scan->opaque;
770  BTPS_State pageStatus;
771  bool exit_loop = false;
772  bool status = true;
773  ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
774  BTParallelScanDesc btscan;
775 
776  *pageno = P_NONE;
777 
778  btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
779  parallel_scan->ps_offset);
780 
781  while (1)
782  {
783  SpinLockAcquire(&btscan->btps_mutex);
784  pageStatus = btscan->btps_pageStatus;
785 
786  if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
787  {
788  /* Parallel scan has already advanced to a new set of scankeys. */
789  status = false;
790  }
791  else if (pageStatus == BTPARALLEL_DONE)
792  {
793  /*
794  * We're done with this set of scankeys. This may be the end, or
795  * there could be more sets to try.
796  */
797  status = false;
798  }
799  else if (pageStatus != BTPARALLEL_ADVANCING)
800  {
801  /*
802  * We have successfully seized control of the scan for the purpose
803  * of advancing it to a new page!
804  */
805  btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
806  *pageno = btscan->btps_scanPage;
807  exit_loop = true;
808  }
809  SpinLockRelease(&btscan->btps_mutex);
810  if (exit_loop || !status)
811  break;
813  }
815 
816  return status;
817 }
ParallelIndexScanDesc parallel_scan
Definition: relscan.h:139
#define P_NONE
Definition: nbtree.h:168
#define SpinLockAcquire(lock)
Definition: spin.h:62
void ConditionVariableCancelSleep(void)
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:416
BTPS_State
Definition: nbtree.c:80
int arrayKeyCount
Definition: nbtree.h:385
#define OffsetToPointer(base, offset)
Definition: c.h:535
#define SpinLockRelease(lock)
Definition: spin.h:64
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
static void static void status(const char *fmt,...) pg_attribute_printf(1
Definition: pg_regress.c:224
void _bt_preprocess_array_keys ( IndexScanDesc  scan)

Definition at line 192 of file nbtutils.c.

References _bt_find_extreme_element(), _bt_sort_array_elements(), ALLOCSET_SMALL_SIZES, AllocSetContextCreate(), ARR_ELEMTYPE, BTScanOpaqueData::arrayContext, BTScanOpaqueData::arrayKeyData, BTScanOpaqueData::arrayKeys, Assert, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, cur, CurrentMemoryContext, DatumGetArrayTypeP, deconstruct_array(), BTArrayKeyInfo::elem_values, elog, ERROR, get_typlenbyvalalign(), i, IndexScanDescData::indexRelation, INDOPTION_DESC, IndexScanDescData::keyData, MemoryContextReset(), MemoryContextSwitchTo(), NULL, BTArrayKeyInfo::num_elems, BTScanOpaqueData::numArrayKeys, IndexScanDescData::numberOfKeys, IndexScanDescData::opaque, palloc(), palloc0(), RelationData::rd_indoption, BTArrayKeyInfo::scan_key, SK_ISNULL, SK_ROW_HEADER, SK_SEARCHARRAY, SK_SEARCHNOTNULL, and SK_SEARCHNULL.

Referenced by btrescan().

193 {
194  BTScanOpaque so = (BTScanOpaque) scan->opaque;
195  int numberOfKeys = scan->numberOfKeys;
196  int16 *indoption = scan->indexRelation->rd_indoption;
197  int numArrayKeys;
198  ScanKey cur;
199  int i;
200  MemoryContext oldContext;
201 
202  /* Quick check to see if there are any array keys */
203  numArrayKeys = 0;
204  for (i = 0; i < numberOfKeys; i++)
205  {
206  cur = &scan->keyData[i];
207  if (cur->sk_flags & SK_SEARCHARRAY)
208  {
209  numArrayKeys++;
210  Assert(!(cur->sk_flags & (SK_ROW_HEADER | SK_SEARCHNULL | SK_SEARCHNOTNULL)));
211  /* If any arrays are null as a whole, we can quit right now. */
212  if (cur->sk_flags & SK_ISNULL)
213  {
214  so->numArrayKeys = -1;
215  so->arrayKeyData = NULL;
216  return;
217  }
218  }
219  }
220 
221  /* Quit if nothing to do. */
222  if (numArrayKeys == 0)
223  {
224  so->numArrayKeys = 0;
225  so->arrayKeyData = NULL;
226  return;
227  }
228 
229  /*
230  * Make a scan-lifespan context to hold array-associated data, or reset it
231  * if we already have one from a previous rescan cycle.
232  */
233  if (so->arrayContext == NULL)
235  "BTree array context",
237  else
239 
240  oldContext = MemoryContextSwitchTo(so->arrayContext);
241 
242  /* Create modifiable copy of scan->keyData in the workspace context */
243  so->arrayKeyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
244  memcpy(so->arrayKeyData,
245  scan->keyData,
246  scan->numberOfKeys * sizeof(ScanKeyData));
247 
248  /* Allocate space for per-array data in the workspace context */
249  so->arrayKeys = (BTArrayKeyInfo *) palloc0(numArrayKeys * sizeof(BTArrayKeyInfo));
250 
251  /* Now process each array key */
252  numArrayKeys = 0;
253  for (i = 0; i < numberOfKeys; i++)
254  {
255  ArrayType *arrayval;
256  int16 elmlen;
257  bool elmbyval;
258  char elmalign;
259  int num_elems;
260  Datum *elem_values;
261  bool *elem_nulls;
262  int num_nonnulls;
263  int j;
264 
265  cur = &so->arrayKeyData[i];
266  if (!(cur->sk_flags & SK_SEARCHARRAY))
267  continue;
268 
269  /*
270  * First, deconstruct the array into elements. Anything allocated
271  * here (including a possibly detoasted array value) is in the
272  * workspace context.
273  */
274  arrayval = DatumGetArrayTypeP(cur->sk_argument);
275  /* We could cache this data, but not clear it's worth it */
277  &elmlen, &elmbyval, &elmalign);
278  deconstruct_array(arrayval,
279  ARR_ELEMTYPE(arrayval),
280  elmlen, elmbyval, elmalign,
281  &elem_values, &elem_nulls, &num_elems);
282 
283  /*
284  * Compress out any null elements. We can ignore them since we assume
285  * all btree operators are strict.
286  */
287  num_nonnulls = 0;
288  for (j = 0; j < num_e