PostgreSQL Source Code (git master)
nbtree.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "access/stratnum.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "nodes/execnodes.h"
#include "pgstat.h"
#include "storage/bulk_write.h"
#include "storage/condition_variable.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/read_stream.h"
#include "utils/datum.h"
#include "utils/fmgrprotos.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"

Data Structures

struct  BTParallelScanDescData
 

Typedefs

typedef struct BTParallelScanDescData BTParallelScanDescData
 
typedef struct BTParallelScanDescData * BTParallelScanDesc
 

Enumerations

enum  BTPS_State {
  BTPARALLEL_NOT_INITIALIZED , BTPARALLEL_NEED_PRIMSCAN , BTPARALLEL_ADVANCING , BTPARALLEL_IDLE ,
  BTPARALLEL_DONE
}
 

Functions

static void _bt_parallel_serialize_arrays (Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
 
static void _bt_parallel_restore_arrays (Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
 
static void btvacuumscan (IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid)
 
static BlockNumber btvacuumpage (BTVacState *vstate, Buffer buf)
 
static BTVacuumPosting btreevacuumposting (BTVacState *vstate, IndexTuple posting, OffsetNumber updatedoffset, int *nremaining)
 
Datum bthandler (PG_FUNCTION_ARGS)
 
void btbuildempty (Relation index)
 
bool btinsert (Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo)
 
bool btgettuple (IndexScanDesc scan, ScanDirection dir)
 
int64 btgetbitmap (IndexScanDesc scan, TIDBitmap *tbm)
 
IndexScanDesc btbeginscan (Relation rel, int nkeys, int norderbys)
 
void btrescan (IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
 
void btendscan (IndexScanDesc scan)
 
void btmarkpos (IndexScanDesc scan)
 
void btrestrpos (IndexScanDesc scan)
 
Size btestimateparallelscan (Relation rel, int nkeys, int norderbys)
 
void btinitparallelscan (void *target)
 
void btparallelrescan (IndexScanDesc scan)
 
bool _bt_parallel_seize (IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
 
void _bt_parallel_release (IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
 
void _bt_parallel_done (IndexScanDesc scan)
 
void _bt_parallel_primscan_schedule (IndexScanDesc scan, BlockNumber curr_page)
 
IndexBulkDeleteResult * btbulkdelete (IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
 
IndexBulkDeleteResult * btvacuumcleanup (IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
bool btcanreturn (Relation index, int attno)
 
int btgettreeheight (Relation rel)
 
CompareType bttranslatestrategy (StrategyNumber strategy, Oid opfamily)
 
StrategyNumber bttranslatecmptype (CompareType cmptype, Oid opfamily)
 

Typedef Documentation

◆ BTParallelScanDesc

Definition at line 93 of file nbtree.c.

◆ BTParallelScanDescData

Enumeration Type Documentation

◆ BTPS_State

enum BTPS_State
Enumerator
BTPARALLEL_NOT_INITIALIZED 
BTPARALLEL_NEED_PRIMSCAN 
BTPARALLEL_ADVANCING 
BTPARALLEL_IDLE 
BTPARALLEL_DONE 

Definition at line 54 of file nbtree.c.

55{
56 BTPARALLEL_NOT_INITIALIZED,
57 BTPARALLEL_NEED_PRIMSCAN,
58 BTPARALLEL_ADVANCING,
59 BTPARALLEL_IDLE,
60 BTPARALLEL_DONE,
61} BTPS_State;

Function Documentation

◆ _bt_parallel_done()

void _bt_parallel_done ( IndexScanDesc  scan)

Definition at line 949 of file nbtree.c.

950{
951 BTScanOpaque so = (BTScanOpaque) scan->opaque;
952 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
953 BTParallelScanDesc btscan;
954 bool status_changed = false;
955
956 Assert(!BTScanPosIsValid(so->currPos));
957
958 /* Do nothing, for non-parallel scans */
959 if (parallel_scan == NULL)
960 return;
961
962 /*
963 * Should not mark parallel scan done when there's still a pending
964 * primitive index scan
965 */
966 if (so->needPrimScan)
967 return;
968
969 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
970 parallel_scan->ps_offset_am);
971
972 /*
973 * Mark the parallel scan as done, unless some other process did so
974 * already
975 */
976 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
977 Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
978 if (btscan->btps_pageStatus != BTPARALLEL_DONE)
979 {
980 btscan->btps_pageStatus = BTPARALLEL_DONE;
981 status_changed = true;
982 }
983 LWLockRelease(&btscan->btps_lock);
984
985 /* wake up all the workers associated with this parallel scan */
986 if (status_changed)
987 ConditionVariableBroadcast(&btscan->btps_cv);
988}
#define OffsetToPointer(base, offset)
Definition: c.h:743
void ConditionVariableBroadcast(ConditionVariable *cv)
Assert(PointerIsAligned(start, uint64))
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1182
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1902
@ LW_EXCLUSIVE
Definition: lwlock.h:114
struct BTParallelScanDescData * BTParallelScanDesc
Definition: nbtree.c:93
#define BTScanPosIsValid(scanpos)
Definition: nbtree.h:1021
BTScanOpaqueData * BTScanOpaque
Definition: nbtree.h:1096
bool needPrimScan
Definition: nbtree.h:1063
BTScanPosData currPos
Definition: nbtree.h:1092
struct ParallelIndexScanDescData * parallel_scan
Definition: relscan.h:191

References Assert(), BTPARALLEL_DONE, BTPARALLEL_NEED_PRIMSCAN, BTScanPosIsValid, ConditionVariableBroadcast(), BTScanOpaqueData::currPos, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTScanOpaqueData::needPrimScan, OffsetToPointer, IndexScanDescData::opaque, and IndexScanDescData::parallel_scan.

Referenced by _bt_endpoint(), _bt_first(), _bt_parallel_seize(), _bt_readnextpage(), and _bt_start_prim_scan().

◆ _bt_parallel_primscan_schedule()

void _bt_parallel_primscan_schedule ( IndexScanDesc  scan,
BlockNumber  curr_page 
)

Definition at line 999 of file nbtree.c.

1000{
1001 Relation rel = scan->indexRelation;
1002 BTScanOpaque so = (BTScanOpaque) scan->opaque;
1003 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
1004 BTParallelScanDesc btscan;
1005
1006 Assert(so->numArrayKeys);
1007
1008 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
1009 parallel_scan->ps_offset_am);
1010
1011 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
1012 if (btscan->btps_lastCurrPage == curr_page &&
1013 btscan->btps_pageStatus == BTPARALLEL_IDLE)
1014 {
1015 btscan->btps_nextScanPage = InvalidBlockNumber;
1016 btscan->btps_lastCurrPage = InvalidBlockNumber;
1017 btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
1018
1019 /* Serialize scan's current array keys */
1020 _bt_parallel_serialize_arrays(rel, btscan, so);
1021 }
1022 LWLockRelease(&btscan->btps_lock);
1023}
#define InvalidBlockNumber
Definition: block.h:33
static void _bt_parallel_serialize_arrays(Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
Definition: nbtree.c:631
Relation indexRelation
Definition: relscan.h:137

References _bt_parallel_serialize_arrays(), Assert(), BTPARALLEL_IDLE, BTPARALLEL_NEED_PRIMSCAN, IndexScanDescData::indexRelation, InvalidBlockNumber, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTScanOpaqueData::numArrayKeys, OffsetToPointer, IndexScanDescData::opaque, and IndexScanDescData::parallel_scan.

Referenced by _bt_advance_array_keys(), and _bt_readpage().

◆ _bt_parallel_release()

void _bt_parallel_release ( IndexScanDesc  scan,
BlockNumber  next_scan_page,
BlockNumber  curr_page 
)

Definition at line 922 of file nbtree.c.

924{
925 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
926 BTParallelScanDesc btscan;
927
928 Assert(BlockNumberIsValid(next_scan_page));
929
930 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
931 parallel_scan->ps_offset_am);
932
933 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
934 btscan->btps_nextScanPage = next_scan_page;
935 btscan->btps_lastCurrPage = curr_page;
936 btscan->btps_pageStatus = BTPARALLEL_IDLE;
937 LWLockRelease(&btscan->btps_lock);
938 ConditionVariableSignal(&btscan->btps_cv);
939}
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
void ConditionVariableSignal(ConditionVariable *cv)
BTPS_State btps_pageStatus
Definition: nbtree.c:72
BlockNumber btps_lastCurrPage
Definition: nbtree.c:70
ConditionVariable btps_cv
Definition: nbtree.c:76
BlockNumber btps_nextScanPage
Definition: nbtree.c:69

References Assert(), BlockNumberIsValid(), BTPARALLEL_IDLE, BTParallelScanDescData::btps_cv, BTParallelScanDescData::btps_lastCurrPage, BTParallelScanDescData::btps_lock, BTParallelScanDescData::btps_nextScanPage, BTParallelScanDescData::btps_pageStatus, ConditionVariableSignal(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), OffsetToPointer, IndexScanDescData::parallel_scan, and ParallelIndexScanDescData::ps_offset_am.

Referenced by _bt_readnextpage(), and _bt_readpage().

◆ _bt_parallel_restore_arrays()

static void _bt_parallel_restore_arrays ( Relation  rel,
BTParallelScanDesc  btscan,
BTScanOpaque  so 
)
static

Definition at line 674 of file nbtree.c.

676{
677 char *datumshared;
678
679 /* Space for serialized datums begins immediately after btps_arrElems[] */
680 datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
681 for (int i = 0; i < so->numArrayKeys; i++)
682 {
683 BTArrayKeyInfo *array = &so->arrayKeys[i];
684 ScanKey skey = &so->keyData[array->scan_key];
685 bool isnull;
686
687 if (array->num_elems != -1)
688 {
689 /* Restore SAOP array using its saved cur_elem */
690 Assert(!(skey->sk_flags & SK_BT_SKIP));
691 array->cur_elem = btscan->btps_arrElems[i];
692 skey->sk_argument = array->elem_values[array->cur_elem];
693 continue;
694 }
695
696 /* Restore skip array by restoring its key directly */
697 if (!array->attbyval && skey->sk_argument)
698 pfree(DatumGetPointer(skey->sk_argument));
699 skey->sk_argument = (Datum) 0;
700 memcpy(&skey->sk_flags, datumshared, sizeof(int));
701 datumshared += sizeof(int);
702
703 Assert(skey->sk_flags & SK_BT_SKIP);
704
705 if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
706 {
707 /* No sk_argument datum to restore */
708 continue;
709 }
710
711 skey->sk_argument = datumRestore(&datumshared, &isnull);
712 if (isnull)
713 {
714 Assert(skey->sk_argument == 0);
715 Assert(skey->sk_flags & SK_SEARCHNULL);
716 Assert(skey->sk_flags & SK_ISNULL);
717 }
718 }
719}
Datum datumRestore(char **start_address, bool *isnull)
Definition: datum.c:521
void pfree(void *pointer)
Definition: mcxt.c:2150
#define SK_BT_SKIP
Definition: nbtree.h:1136
#define SK_BT_MAXVAL
Definition: nbtree.h:1140
#define SK_BT_MINVAL
Definition: nbtree.h:1139
uintptr_t Datum
Definition: postgres.h:69
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:317
#define SK_SEARCHNULL
Definition: skey.h:121
#define SK_ISNULL
Definition: skey.h:115
bool attbyval
Definition: nbtree.h:1046
Datum * elem_values
Definition: nbtree.h:1041
int btps_arrElems[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.c:83
BTArrayKeyInfo * arrayKeys
Definition: nbtree.h:1066
ScanKey keyData
Definition: nbtree.h:1058
int sk_flags
Definition: skey.h:66
Datum sk_argument
Definition: skey.h:72

References BTScanOpaqueData::arrayKeys, Assert(), BTArrayKeyInfo::attbyval, BTParallelScanDescData::btps_arrElems, BTArrayKeyInfo::cur_elem, DatumGetPointer(), datumRestore(), BTArrayKeyInfo::elem_values, i, BTScanOpaqueData::keyData, BTArrayKeyInfo::num_elems, BTScanOpaqueData::numArrayKeys, pfree(), BTArrayKeyInfo::scan_key, ScanKeyData::sk_argument, SK_BT_MAXVAL, SK_BT_MINVAL, SK_BT_SKIP, ScanKeyData::sk_flags, SK_ISNULL, and SK_SEARCHNULL.

Referenced by _bt_parallel_seize().

◆ _bt_parallel_seize()

bool _bt_parallel_seize ( IndexScanDesc  scan,
BlockNumber next_scan_page,
BlockNumber last_curr_page,
bool  first 
)

Definition at line 784 of file nbtree.c.

786{
787 Relation rel = scan->indexRelation;
788 BTScanOpaque so = (BTScanOpaque) scan->opaque;
789 bool exit_loop = false,
790 status = true,
791 endscan = false;
792 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
793 BTParallelScanDesc btscan;
794
795 *next_scan_page = InvalidBlockNumber;
796 *last_curr_page = InvalidBlockNumber;
797
798 /*
799 * Reset so->currPos, and initialize moreLeft/moreRight such that the next
800 * call to _bt_readnextpage treats this backend similarly to a serial
801 * backend that steps from *last_curr_page to *next_scan_page (unless this
802 * backend's so->currPos is initialized by _bt_readfirstpage before then).
803 */
804 BTScanPosInvalidate(so->currPos);
805 so->currPos.moreLeft = so->currPos.moreRight = true;
806
807 if (first)
808 {
809 /*
810 * Initialize array related state when called from _bt_first, assuming
811 * that this will be the first primitive index scan for the scan
812 */
813 so->needPrimScan = false;
814 so->scanBehind = false;
815 so->oppositeDirCheck = false;
816 }
817 else
818 {
819 /*
820 * Don't attempt to seize the scan when it requires another primitive
821 * index scan, since caller's backend cannot start it right now
822 */
823 if (so->needPrimScan)
824 return false;
825 }
826
827 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
828 parallel_scan->ps_offset_am);
829
830 while (1)
831 {
832 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
833
834 if (btscan->btps_pageStatus == BTPARALLEL_DONE)
835 {
836 /* We're done with this parallel index scan */
837 status = false;
838 }
839 else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
840 btscan->btps_nextScanPage == P_NONE)
841 {
842 /* End this parallel index scan */
843 status = false;
844 endscan = true;
845 }
846 else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
847 {
848 Assert(so->numArrayKeys);
849
850 if (first)
851 {
852 /* Can start scheduled primitive scan right away, so do so */
853 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
854
855 /* Restore scan's array keys from serialized values */
856 _bt_parallel_restore_arrays(rel, btscan, so);
857 exit_loop = true;
858 }
859 else
860 {
861 /*
862 * Don't attempt to seize the scan when it requires another
863 * primitive index scan, since caller's backend cannot start
864 * it right now
865 */
866 status = false;
867 }
868
869 /*
870 * Either way, update backend local state to indicate that a
871 * pending primitive scan is required
872 */
873 so->needPrimScan = true;
874 so->scanBehind = false;
875 so->oppositeDirCheck = false;
876 }
877 else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
878 {
879 /*
880 * We have successfully seized control of the scan for the purpose
881 * of advancing it to a new page!
882 */
883 btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
884 Assert(btscan->btps_nextScanPage != P_NONE);
885 *next_scan_page = btscan->btps_nextScanPage;
886 *last_curr_page = btscan->btps_lastCurrPage;
887 exit_loop = true;
888 }
889 LWLockRelease(&btscan->btps_lock);
890 if (exit_loop || !status)
891 break;
892 ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
893 }
894 ConditionVariableCancelSleep();
895
896 /* When the scan has reached the rightmost (or leftmost) page, end it */
897 if (endscan)
898 _bt_parallel_done(scan);
899
900 return status;
901}
bool ConditionVariableCancelSleep(void)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
void _bt_parallel_done(IndexScanDesc scan)
Definition: nbtree.c:949
static void _bt_parallel_restore_arrays(Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
Definition: nbtree.c:674
#define P_NONE
Definition: nbtree.h:213
#define BTScanPosInvalidate(scanpos)
Definition: nbtree.h:1027
bool oppositeDirCheck
Definition: nbtree.h:1065
bool moreRight
Definition: nbtree.h:986
bool moreLeft
Definition: nbtree.h:985

References _bt_parallel_done(), _bt_parallel_restore_arrays(), Assert(), BTPARALLEL_ADVANCING, BTPARALLEL_DONE, BTPARALLEL_IDLE, BTPARALLEL_NEED_PRIMSCAN, BTScanPosInvalidate, ConditionVariableCancelSleep(), ConditionVariableSleep(), BTScanOpaqueData::currPos, IndexScanDescData::indexRelation, InvalidBlockNumber, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, OffsetToPointer, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, P_NONE, IndexScanDescData::parallel_scan, and BTScanOpaqueData::scanBehind.

Referenced by _bt_first(), and _bt_readnextpage().
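
Together with _bt_parallel_release() and _bt_parallel_done(), this yields a simple seize/read/release loop per worker. A minimal sketch of the calling pattern (the real callers are _bt_first() and _bt_readnextpage(); the initial tree descent, array keys, and error handling are omitted, and next_to_publish is a placeholder):

    static void
    parallel_scan_sketch(IndexScanDesc scan)
    {
        BlockNumber next_scan_page;
        BlockNumber last_curr_page;

        /* first=true: this backend may also start a scheduled primitive scan */
        if (!_bt_parallel_seize(scan, &next_scan_page, &last_curr_page, true))
            return;                 /* another backend already finished the scan */

        while (BlockNumberIsValid(next_scan_page))
        {
            /* read tuples from next_scan_page; its right sibling comes next */
            BlockNumber next_to_publish = next_scan_page + 1;   /* placeholder */

            /* publish the successor and wake one waiting worker */
            _bt_parallel_release(scan, next_to_publish, next_scan_page);

            /* compete for the next unit of work (first=false this time) */
            if (!_bt_parallel_seize(scan, &next_scan_page, &last_curr_page, false))
                break;              /* scan done, or a primitive scan is pending */
        }
    }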

◆ _bt_parallel_serialize_arrays()

static void _bt_parallel_serialize_arrays ( Relation  rel,
BTParallelScanDesc  btscan,
BTScanOpaque  so 
)
static

Definition at line 631 of file nbtree.c.

633{
634 char *datumshared;
635
636 /* Space for serialized datums begins immediately after btps_arrElems[] */
637 datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
638 for (int i = 0; i < so->numArrayKeys; i++)
639 {
640 BTArrayKeyInfo *array = &so->arrayKeys[i];
641 ScanKey skey = &so->keyData[array->scan_key];
642
643 if (array->num_elems != -1)
644 {
645 /* Save SAOP array's cur_elem (no need to copy key/datum) */
646 Assert(!(skey->sk_flags & SK_BT_SKIP));
647 btscan->btps_arrElems[i] = array->cur_elem;
648 continue;
649 }
650
651 /* Save all mutable state associated with skip array's key */
652 Assert(skey->sk_flags & SK_BT_SKIP);
653 memcpy(datumshared, &skey->sk_flags, sizeof(int));
654 datumshared += sizeof(int);
655
656 if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
657 {
658 /* No sk_argument datum to serialize */
659 Assert(skey->sk_argument == 0);
660 continue;
661 }
662
663 datumSerialize(skey->sk_argument, (skey->sk_flags & SK_ISNULL) != 0,
664 array->attbyval, array->attlen, &datumshared);
665 }
666}
void datumSerialize(Datum value, bool isnull, bool typByVal, int typLen, char **start_address)
Definition: datum.c:459
int16 attlen
Definition: nbtree.h:1045

References BTScanOpaqueData::arrayKeys, Assert(), BTArrayKeyInfo::attbyval, BTArrayKeyInfo::attlen, BTParallelScanDescData::btps_arrElems, BTArrayKeyInfo::cur_elem, datumSerialize(), i, BTScanOpaqueData::keyData, BTArrayKeyInfo::num_elems, BTScanOpaqueData::numArrayKeys, BTArrayKeyInfo::scan_key, ScanKeyData::sk_argument, SK_BT_MAXVAL, SK_BT_MINVAL, SK_BT_SKIP, ScanKeyData::sk_flags, and SK_ISNULL.

Referenced by _bt_parallel_primscan_schedule().
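
The serialize and restore functions must agree on the layout of the shared array-key area. As implied by the pair (illustrative summary):

    /*
     * btps_arrElems[0 .. numArrayKeys-1]   int: saved cur_elem per SAOP array
     *                                      (slot unused for skip arrays)
     * then, for each skip array in scan-key order:
     *   int    sk_flags                    copied verbatim
     *   datum  sk_argument                 appended with datumSerialize(), and
     *                                      omitted entirely when SK_BT_MINVAL
     *                                      or SK_BT_MAXVAL is set
     */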

◆ btbeginscan()

IndexScanDesc btbeginscan ( Relation  rel,
int  nkeys,
int  norderbys 
)

Definition at line 332 of file nbtree.c.

333{
334 IndexScanDesc scan;
335 BTScanOpaque so;
336
337 /* no order by operators allowed */
338 Assert(norderbys == 0);
339
340 /* get the scan */
341 scan = RelationGetIndexScan(rel, nkeys, norderbys);
342
343 /* allocate private workspace */
344 so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
347 if (scan->numberOfKeys > 0)
348 so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
349 else
350 so->keyData = NULL;
351
352 so->skipScan = false;
353 so->needPrimScan = false;
354 so->scanBehind = false;
355 so->oppositeDirCheck = false;
356 so->arrayKeys = NULL;
357 so->orderProcs = NULL;
358 so->arrayContext = NULL;
359
360 so->killedItems = NULL; /* until needed */
361 so->numKilled = 0;
362
363 /*
364 * We don't know yet whether the scan will be index-only, so we do not
365 * allocate the tuple workspace arrays until btrescan. However, we set up
366 * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
367 */
368 so->currTuples = so->markTuples = NULL;
369
370 scan->xs_itupdesc = RelationGetDescr(rel);
371
372 scan->opaque = so;
373
374 return scan;
375}
IndexScanDesc RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
Definition: genam.c:80
void * palloc(Size size)
Definition: mcxt.c:1943
#define RelationGetDescr(relation)
Definition: rel.h:542
ScanKeyData * ScanKey
Definition: skey.h:75
char * markTuples
Definition: nbtree.h:1080
FmgrInfo * orderProcs
Definition: nbtree.h:1067
int * killedItems
Definition: nbtree.h:1071
char * currTuples
Definition: nbtree.h:1079
BTScanPosData markPos
Definition: nbtree.h:1093
MemoryContext arrayContext
Definition: nbtree.h:1068
struct TupleDescData * xs_itupdesc
Definition: relscan.h:168

References BTScanOpaqueData::arrayContext, BTScanOpaqueData::arrayKeys, Assert(), BTScanPosInvalidate, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanOpaqueData::keyData, BTScanOpaqueData::killedItems, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanOpaqueData::needPrimScan, IndexScanDescData::numberOfKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, BTScanOpaqueData::orderProcs, palloc(), RelationGetDescr, RelationGetIndexScan(), BTScanOpaqueData::scanBehind, BTScanOpaqueData::skipScan, and IndexScanDescData::xs_itupdesc.

Referenced by bthandler().

◆ btbuildempty()

void btbuildempty ( Relation  index)

Definition at line 179 of file nbtree.c.

180{
181 bool allequalimage = _bt_allequalimage(index, false);
182 BulkWriteState *bulkstate;
183 BulkWriteBuffer metabuf;
184
185 bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
186
187 /* Construct metapage. */
188 metabuf = smgr_bulk_get_buf(bulkstate);
189 _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
190 smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
191
192 smgr_bulk_finish(bulkstate);
193}
PageData * Page
Definition: bufpage.h:82
BulkWriteState * smgr_bulk_start_rel(Relation rel, ForkNumber forknum)
Definition: bulk_write.c:87
void smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std)
Definition: bulk_write.c:323
BulkWriteBuffer smgr_bulk_get_buf(BulkWriteState *bulkstate)
Definition: bulk_write.c:347
void smgr_bulk_finish(BulkWriteState *bulkstate)
Definition: bulk_write.c:130
void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)
Definition: nbtpage.c:67
#define BTREE_METAPAGE
Definition: nbtree.h:149
bool _bt_allequalimage(Relation rel, bool debugmessage)
Definition: nbtutils.c:4273
@ INIT_FORKNUM
Definition: relpath.h:61

References _bt_allequalimage(), _bt_initmetapage(), BTREE_METAPAGE, INIT_FORKNUM, P_NONE, smgr_bulk_finish(), smgr_bulk_get_buf(), smgr_bulk_start_rel(), and smgr_bulk_write().

Referenced by bthandler().

◆ btbulkdelete()

IndexBulkDeleteResult * btbulkdelete ( IndexVacuumInfo info,
IndexBulkDeleteResult stats,
IndexBulkDeleteCallback  callback,
void *  callback_state 
)

Definition at line 1033 of file nbtree.c.

1035{
1036 Relation rel = info->index;
1037 BTCycleId cycleid;
1038
1039 /* allocate stats if first time through, else re-use existing struct */
1040 if (stats == NULL)
1041 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
1042
1043 /* Establish the vacuum cycle ID to use for this scan */
1044 /* The ENSURE stuff ensures we clean up shared memory on failure */
1045 PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
1046 {
1047 cycleid = _bt_start_vacuum(rel);
1048
1049 btvacuumscan(info, stats, callback, callback_state, cycleid);
1050 }
1051 PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
1052 _bt_end_vacuum(rel);
1053
1054 return stats;
1055}
#define PG_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition: ipc.h:47
#define PG_END_ENSURE_ERROR_CLEANUP(cleanup_function, arg)
Definition: ipc.h:52
void * palloc0(Size size)
Definition: mcxt.c:1973
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid)
Definition: nbtree.c:1151
uint16 BTCycleId
Definition: nbtree.h:30
void _bt_end_vacuum(Relation rel)
Definition: nbtutils.c:3618
void _bt_end_vacuum_callback(int code, Datum arg)
Definition: nbtutils.c:3646
BTCycleId _bt_start_vacuum(Relation rel)
Definition: nbtutils.c:3561
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:327
Relation index
Definition: genam.h:69

References _bt_end_vacuum(), _bt_end_vacuum_callback(), _bt_start_vacuum(), btvacuumscan(), callback(), IndexVacuumInfo::index, palloc0(), PG_END_ENSURE_ERROR_CLEANUP, PG_ENSURE_ERROR_CLEANUP, and PointerGetDatum().

Referenced by bthandler().
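
The PG_ENSURE_ERROR_CLEANUP/PG_END_ENSURE_ERROR_CLEANUP pair guarantees that _bt_end_vacuum_callback() runs even if btvacuumscan() throws an error, so the shared vacuum-cycle-ID state is always released. The generic shape of the pattern (cleanup_fn and arg are illustrative names):

    PG_ENSURE_ERROR_CLEANUP(cleanup_fn, PointerGetDatum(arg));
    {
        /* work that must not leave shared state behind on error */
    }
    PG_END_ENSURE_ERROR_CLEANUP(cleanup_fn, PointerGetDatum(arg));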

◆ btcanreturn()

bool btcanreturn ( Relation  index,
int  attno 
)

Definition at line 1712 of file nbtree.c.

1713{
1714 return true;
1715}

Referenced by bthandler().

◆ btendscan()

void btendscan ( IndexScanDesc  scan)

Definition at line 438 of file nbtree.c.

439{
440 BTScanOpaque so = (BTScanOpaque) scan->opaque;
441
442 /* we aren't holding any read locks, but gotta drop the pins */
443 if (BTScanPosIsValid(so->currPos))
444 {
445 /* Before leaving current page, deal with any killed items */
446 if (so->numKilled > 0)
447 _bt_killitems(scan);
448 BTScanPosUnpinIfPinned(so->currPos);
449 }
450
451 so->markItemIndex = -1;
452 BTScanPosUnpinIfPinned(so->markPos);
453
454 /* No need to invalidate positions, the RAM is about to be freed. */
455
456 /* Release storage */
457 if (so->keyData != NULL)
458 pfree(so->keyData);
459 /* so->arrayKeys and so->orderProcs are in arrayContext */
460 if (so->arrayContext != NULL)
461 MemoryContextDelete(so->arrayContext);
462 if (so->killedItems != NULL)
463 pfree(so->killedItems);
464 if (so->currTuples != NULL)
465 pfree(so->currTuples);
466 /* so->markTuples should not be pfree'd, see btrescan */
467 pfree(so);
468}
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:485
#define BTScanPosUnpinIfPinned(scanpos)
Definition: nbtree.h:1015
void _bt_killitems(IndexScanDesc scan)
Definition: nbtutils.c:3310

References _bt_killitems(), BTScanOpaqueData::arrayContext, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, if(), BTScanOpaqueData::keyData, BTScanOpaqueData::killedItems, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, MemoryContextDelete(), BTScanOpaqueData::numKilled, IndexScanDescData::opaque, and pfree().

Referenced by bthandler().

◆ btestimateparallelscan()

Size btestimateparallelscan ( Relation  rel,
int  nkeys,
int  norderbys 
)

Definition at line 558 of file nbtree.c.

559{
560 int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
561 Size estnbtreeshared,
562 genericattrspace;
563
564 /*
565 * Pessimistically assume that every input scan key will be output with
566 * its own SAOP array
567 */
568 estnbtreeshared = offsetof(BTParallelScanDescData, btps_arrElems) +
569 sizeof(int) * nkeys;
570
571 /* Single column indexes cannot possibly use a skip array */
572 if (nkeyatts == 1)
573 return estnbtreeshared;
574
575 /*
576 * Pessimistically assume that all attributes prior to the least
577 * significant attribute require a skip array (and an associated key)
578 */
579 genericattrspace = datumEstimateSpace((Datum) 0, false, true,
580 sizeof(Datum));
581 for (int attnum = 1; attnum < nkeyatts; attnum++)
582 {
583 CompactAttribute *attr;
584
585 /*
586 * We make the conservative assumption that every index column will
587 * also require a skip array.
588 *
589 * Every skip array must have space to store its scan key's sk_flags.
590 */
591 estnbtreeshared = add_size(estnbtreeshared, sizeof(int));
592
593 /* Consider space required to store a datum of opclass input type */
594 attr = TupleDescCompactAttr(rel->rd_att, attnum - 1);
595 if (attr->attbyval)
596 {
597 /* This index attribute stores pass-by-value datums */
598 Size estfixed = datumEstimateSpace((Datum) 0, false,
599 true, attr->attlen);
600
601 estnbtreeshared = add_size(estnbtreeshared, estfixed);
602 continue;
603 }
604
605 /*
606 * This index attribute stores pass-by-reference datums.
607 *
608 * Assume that serializing this array will use just as much space as a
609 * pass-by-value datum, in addition to space for the largest possible
610 * whole index tuple (this is not just a per-datum portion of the
611 * largest possible tuple because that'd be almost as large anyway).
612 *
613 * This is quite conservative, but it's not clear how we could do much
614 * better. The executor requires an up-front storage request size
615 * that reliably covers the scan's high watermark memory usage. We
616 * can't be sure of the real high watermark until the scan is over.
617 */
618 estnbtreeshared = add_size(estnbtreeshared, genericattrspace);
619 estnbtreeshared = add_size(estnbtreeshared, BTMaxItemSize);
620 }
621
622 return estnbtreeshared;
623}
int16_t int16
Definition: c.h:497
size_t Size
Definition: c.h:576
Size datumEstimateSpace(Datum value, bool isnull, bool typByVal, int typLen)
Definition: datum.c:412
#define BTMaxItemSize
Definition: nbtree.h:165
int16 attnum
Definition: pg_attribute.h:74
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:535
Size add_size(Size s1, Size s2)
Definition: shmem.c:493
int16 attlen
Definition: tupdesc.h:71
TupleDesc rd_att
Definition: rel.h:112
static CompactAttribute * TupleDescCompactAttr(TupleDesc tupdesc, int i)
Definition: tupdesc.h:175

References add_size(), CompactAttribute::attbyval, CompactAttribute::attlen, attnum, BTMaxItemSize, BTParallelScanDescData::btps_arrElems, datumEstimateSpace(), IndexRelationGetNumberOfKeyAttributes, RelationData::rd_att, and TupleDescCompactAttr().

Referenced by bthandler().
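
The returned estimate therefore has this shape (distilled from the code above; illustrative):

    /*
     *   offsetof(BTParallelScanDescData, btps_arrElems)
     * + sizeof(int) * nkeys                   one SAOP slot per input scan key
     * + for each non-trailing key attribute:
     *     sizeof(int)                         the skip array's serialized sk_flags
     *   + fixed datum space                   pass-by-value attributes, or
     *     generic datum space + BTMaxItemSize pass-by-reference attributes
     */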

◆ btgetbitmap()

int64 btgetbitmap ( IndexScanDesc  scan,
TIDBitmap tbm 
)

Definition at line 286 of file nbtree.c.

287{
288 BTScanOpaque so = (BTScanOpaque) scan->opaque;
289 int64 ntids = 0;
290 ItemPointer heapTid;
291
292 /* Each loop iteration performs another primitive index scan */
293 do
294 {
295 /* Fetch the first page & tuple */
296 if (_bt_first(scan, ForwardScanDirection))
297 {
298 /* Save tuple ID, and continue scanning */
299 heapTid = &scan->xs_heaptid;
300 tbm_add_tuples(tbm, heapTid, 1, false);
301 ntids++;
302
303 for (;;)
304 {
305 /*
306 * Advance to next tuple within page. This is the same as the
307 * easy case in _bt_next().
308 */
309 if (++so->currPos.itemIndex > so->currPos.lastItem)
310 {
311 /* let _bt_next do the heavy lifting */
312 if (!_bt_next(scan, ForwardScanDirection))
313 break;
314 }
315
316 /* Save tuple ID, and continue scanning */
317 heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
318 tbm_add_tuples(tbm, heapTid, 1, false);
319 ntids++;
320 }
321 }
322 /* Now see if we need another primitive index scan */
323 } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));
324
325 return ntids;
326}
int64_t int64
Definition: c.h:499
bool _bt_first(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:882
bool _bt_next(IndexScanDesc scan, ScanDirection dir)
Definition: nbtsearch.c:1541
bool _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir)
Definition: nbtutils.c:1339
@ ForwardScanDirection
Definition: sdir.h:28
int lastItem
Definition: nbtree.h:996
BTScanPosItem items[MaxTIDsPerBTreePage]
Definition: nbtree.h:999
int itemIndex
Definition: nbtree.h:997
ItemPointerData heapTid
Definition: nbtree.h:957
ItemPointerData xs_heaptid
Definition: relscan.h:172
void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, bool recheck)
Definition: tidbitmap.c:366

References _bt_first(), _bt_next(), BTScanOpaqueData::currPos, ForwardScanDirection, BTScanPosItem::heapTid, if(), BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, IndexScanDescData::opaque, tbm_add_tuples(), and IndexScanDescData::xs_heaptid.

Referenced by bthandler().

◆ btgettreeheight()

int btgettreeheight ( Relation  rel)

Definition at line 1721 of file nbtree.c.

1722{
1723 return _bt_getrootheight(rel);
1724}
int _bt_getrootheight(Relation rel)
Definition: nbtpage.c:675

References _bt_getrootheight().

Referenced by bthandler().

◆ btgettuple()

bool btgettuple ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 226 of file nbtree.c.

227{
228 BTScanOpaque so = (BTScanOpaque) scan->opaque;
229 bool res;
230
231 /* btree indexes are never lossy */
232 scan->xs_recheck = false;
233
234 /* Each loop iteration performs another primitive index scan */
235 do
236 {
237 /*
238 * If we've already initialized this scan, we can just advance it in
239 * the appropriate direction. If we haven't done so yet, we call
240 * _bt_first() to get the first item in the scan.
241 */
242 if (!BTScanPosIsValid(so->currPos))
243 res = _bt_first(scan, dir);
244 else
245 {
246 /*
247 * Check to see if we should kill the previously-fetched tuple.
248 */
249 if (scan->kill_prior_tuple)
250 {
251 /*
252 * Yes, remember it for later. (We'll deal with all such
253 * tuples at once right before leaving the index page.) The
254 * test for numKilled overrun is not just paranoia: if the
255 * caller reverses direction in the indexscan then the same
256 * item might get entered multiple times. It's not worth
257 * trying to optimize that, so we don't detect it, but instead
258 * just forget any excess entries.
259 */
260 if (so->killedItems == NULL)
261 so->killedItems = (int *)
262 palloc(MaxTIDsPerBTreePage * sizeof(int));
263 if (so->numKilled < MaxTIDsPerBTreePage)
264 so->killedItems[so->numKilled++] = so->currPos.itemIndex;
265 }
266
267 /*
268 * Now continue the scan.
269 */
270 res = _bt_next(scan, dir);
271 }
272
273 /* If we have a tuple, return it ... */
274 if (res)
275 break;
276 /* ... otherwise see if we need another primitive index scan */
277 } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
278
279 return res;
280}
#define MaxTIDsPerBTreePage
Definition: nbtree.h:186
bool kill_prior_tuple
Definition: relscan.h:147

References _bt_first(), _bt_next(), BTScanPosIsValid, BTScanOpaqueData::currPos, if(), BTScanPosData::itemIndex, IndexScanDescData::kill_prior_tuple, BTScanOpaqueData::killedItems, MaxTIDsPerBTreePage, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, palloc(), and IndexScanDescData::xs_recheck.

Referenced by bthandler().
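
btgettuple() is never called directly; it is reached through the index AM interface (amgettuple). A minimal sketch of one such path, using the catalog-scan wrappers from genam.h (rel is assumed to be an already-opened pg_class relation; the key and index are illustrative):

    ScanKeyData skey;
    SysScanDesc scan;
    HeapTuple   tup;

    ScanKeyInit(&skey, Anum_pg_class_relname,
                BTEqualStrategyNumber, F_NAMEEQ,
                CStringGetDatum("pg_am"));
    scan = systable_beginscan(rel, ClassNameNspIndexId, true, NULL, 1, &skey);
    while ((tup = systable_getnext(scan)) != NULL)
    {
        /* each tuple was located via the btree AM's btgettuple() */
    }
    systable_endscan(scan);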

◆ bthandler()

Datum bthandler ( PG_FUNCTION_ARGS  )

Definition at line 115 of file nbtree.c.

116{
117 IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
118
119 amroutine->amstrategies = BTMaxStrategyNumber;
120 amroutine->amsupport = BTNProcs;
121 amroutine->amoptsprocnum = BTOPTIONS_PROC;
122 amroutine->amcanorder = true;
123 amroutine->amcanorderbyop = false;
124 amroutine->amcanhash = false;
125 amroutine->amconsistentequality = true;
126 amroutine->amconsistentordering = true;
127 amroutine->amcanbackward = true;
128 amroutine->amcanunique = true;
129 amroutine->amcanmulticol = true;
130 amroutine->amoptionalkey = true;
131 amroutine->amsearcharray = true;
132 amroutine->amsearchnulls = true;
133 amroutine->amstorage = false;
134 amroutine->amclusterable = true;
135 amroutine->ampredlocks = true;
136 amroutine->amcanparallel = true;
137 amroutine->amcanbuildparallel = true;
138 amroutine->amcaninclude = true;
139 amroutine->amusemaintenanceworkmem = false;
140 amroutine->amsummarizing = false;
141 amroutine->amparallelvacuumoptions =
142 VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
143 amroutine->amkeytype = InvalidOid;
144
145 amroutine->ambuild = btbuild;
146 amroutine->ambuildempty = btbuildempty;
147 amroutine->aminsert = btinsert;
148 amroutine->aminsertcleanup = NULL;
149 amroutine->ambulkdelete = btbulkdelete;
150 amroutine->amvacuumcleanup = btvacuumcleanup;
151 amroutine->amcanreturn = btcanreturn;
152 amroutine->amcostestimate = btcostestimate;
153 amroutine->amgettreeheight = btgettreeheight;
154 amroutine->amoptions = btoptions;
155 amroutine->amproperty = btproperty;
156 amroutine->ambuildphasename = btbuildphasename;
157 amroutine->amvalidate = btvalidate;
158 amroutine->amadjustmembers = btadjustmembers;
159 amroutine->ambeginscan = btbeginscan;
160 amroutine->amrescan = btrescan;
161 amroutine->amgettuple = btgettuple;
162 amroutine->amgetbitmap = btgetbitmap;
163 amroutine->amendscan = btendscan;
164 amroutine->ammarkpos = btmarkpos;
165 amroutine->amrestrpos = btrestrpos;
166 amroutine->amestimateparallelscan = btestimateparallelscan;
167 amroutine->aminitparallelscan = btinitparallelscan;
168 amroutine->amparallelrescan = btparallelrescan;
169 amroutine->amtranslatestrategy = bttranslatestrategy;
170 amroutine->amtranslatecmptype = bttranslatecmptype;
171
172 PG_RETURN_POINTER(amroutine);
173}
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:361
bool btcanreturn(Relation index, int attno)
Definition: nbtree.c:1712
StrategyNumber bttranslatecmptype(CompareType cmptype, Oid opfamily)
Definition: nbtree.c:1747
IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys)
Definition: nbtree.c:332
Size btestimateparallelscan(Relation rel, int nkeys, int norderbys)
Definition: nbtree.c:558
IndexBulkDeleteResult * btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
Definition: nbtree.c:1063
CompareType bttranslatestrategy(StrategyNumber strategy, Oid opfamily)
Definition: nbtree.c:1727
bool btgettuple(IndexScanDesc scan, ScanDirection dir)
Definition: nbtree.c:226
void btparallelrescan(IndexScanDesc scan)
Definition: nbtree.c:741
bool btinsert(Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo)
Definition: nbtree.c:202
void btbuildempty(Relation index)
Definition: nbtree.c:179
int btgettreeheight(Relation rel)
Definition: nbtree.c:1721
void btinitparallelscan(void *target)
Definition: nbtree.c:725
IndexBulkDeleteResult * btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
Definition: nbtree.c:1033
int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
Definition: nbtree.c:286
void btmarkpos(IndexScanDesc scan)
Definition: nbtree.c:474
void btendscan(IndexScanDesc scan)
Definition: nbtree.c:438
void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
Definition: nbtree.c:381
void btrestrpos(IndexScanDesc scan)
Definition: nbtree.c:500
#define BTNProcs
Definition: nbtree.h:723
#define BTOPTIONS_PROC
Definition: nbtree.h:721
IndexBuildResult * btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition: nbtsort.c:295
char * btbuildphasename(int64 phasenum)
Definition: nbtutils.c:3742
bytea * btoptions(Datum reloptions, bool validate)
Definition: nbtutils.c:3696
bool btproperty(Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull)
Definition: nbtutils.c:3719
bool btvalidate(Oid opclassoid)
Definition: nbtvalidate.c:40
void btadjustmembers(Oid opfamilyoid, Oid opclassoid, List *operators, List *functions)
Definition: nbtvalidate.c:288
#define makeNode(_type_)
Definition: nodes.h:161
#define InvalidOid
Definition: postgres_ext.h:35
void btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, Cost *indexStartupCost, Cost *indexTotalCost, Selectivity *indexSelectivity, double *indexCorrelation, double *indexPages)
Definition: selfuncs.c:7226
#define BTMaxStrategyNumber
Definition: stratnum.h:35
ambuildphasename_function ambuildphasename
Definition: amapi.h:304
ambuildempty_function ambuildempty
Definition: amapi.h:294
amvacuumcleanup_function amvacuumcleanup
Definition: amapi.h:298
bool amclusterable
Definition: amapi.h:268
amoptions_function amoptions
Definition: amapi.h:302
amestimateparallelscan_function amestimateparallelscan
Definition: amapi.h:316
amrestrpos_function amrestrpos
Definition: amapi.h:313
aminsert_function aminsert
Definition: amapi.h:295
amendscan_function amendscan
Definition: amapi.h:311
amtranslate_strategy_function amtranslatestrategy
Definition: amapi.h:321
uint16 amoptsprocnum
Definition: amapi.h:242
amparallelrescan_function amparallelrescan
Definition: amapi.h:318
Oid amkeytype
Definition: amapi.h:284
bool amconsistentordering
Definition: amapi.h:252
bool ampredlocks
Definition: amapi.h:270
uint16 amsupport
Definition: amapi.h:240
amtranslate_cmptype_function amtranslatecmptype
Definition: amapi.h:322
amcostestimate_function amcostestimate
Definition: amapi.h:300
bool amcanorderbyop
Definition: amapi.h:246
amadjustmembers_function amadjustmembers
Definition: amapi.h:306
ambuild_function ambuild
Definition: amapi.h:293
bool amstorage
Definition: amapi.h:266
uint16 amstrategies
Definition: amapi.h:238
bool amoptionalkey
Definition: amapi.h:260
amgettuple_function amgettuple
Definition: amapi.h:309
amcanreturn_function amcanreturn
Definition: amapi.h:299
bool amcanunique
Definition: amapi.h:256
amgetbitmap_function amgetbitmap
Definition: amapi.h:310
amproperty_function amproperty
Definition: amapi.h:303
ambulkdelete_function ambulkdelete
Definition: amapi.h:297
bool amsearcharray
Definition: amapi.h:262
bool amsummarizing
Definition: amapi.h:280
amvalidate_function amvalidate
Definition: amapi.h:305
ammarkpos_function ammarkpos
Definition: amapi.h:312
bool amcanmulticol
Definition: amapi.h:258
bool amusemaintenanceworkmem
Definition: amapi.h:278
ambeginscan_function ambeginscan
Definition: amapi.h:307
bool amcanparallel
Definition: amapi.h:272
amrescan_function amrescan
Definition: amapi.h:308
bool amcanorder
Definition: amapi.h:244
bool amcanbuildparallel
Definition: amapi.h:274
aminitparallelscan_function aminitparallelscan
Definition: amapi.h:317
uint8 amparallelvacuumoptions
Definition: amapi.h:282
aminsertcleanup_function aminsertcleanup
Definition: amapi.h:296
bool amcanbackward
Definition: amapi.h:254
amgettreeheight_function amgettreeheight
Definition: amapi.h:301
bool amcaninclude
Definition: amapi.h:276
bool amsearchnulls
Definition: amapi.h:264
bool amconsistentequality
Definition: amapi.h:250
bool amcanhash
Definition: amapi.h:248
#define VACUUM_OPTION_PARALLEL_BULKDEL
Definition: vacuum.h:48
#define VACUUM_OPTION_PARALLEL_COND_CLEANUP
Definition: vacuum.h:55

References IndexAmRoutine::amadjustmembers, IndexAmRoutine::ambeginscan, IndexAmRoutine::ambuild, IndexAmRoutine::ambuildempty, IndexAmRoutine::ambuildphasename, IndexAmRoutine::ambulkdelete, IndexAmRoutine::amcanbackward, IndexAmRoutine::amcanbuildparallel, IndexAmRoutine::amcanhash, IndexAmRoutine::amcaninclude, IndexAmRoutine::amcanmulticol, IndexAmRoutine::amcanorder, IndexAmRoutine::amcanorderbyop, IndexAmRoutine::amcanparallel, IndexAmRoutine::amcanreturn, IndexAmRoutine::amcanunique, IndexAmRoutine::amclusterable, IndexAmRoutine::amconsistentequality, IndexAmRoutine::amconsistentordering, IndexAmRoutine::amcostestimate, IndexAmRoutine::amendscan, IndexAmRoutine::amestimateparallelscan, IndexAmRoutine::amgetbitmap, IndexAmRoutine::amgettreeheight, IndexAmRoutine::amgettuple, IndexAmRoutine::aminitparallelscan, IndexAmRoutine::aminsert, IndexAmRoutine::aminsertcleanup, IndexAmRoutine::amkeytype, IndexAmRoutine::ammarkpos, IndexAmRoutine::amoptionalkey, IndexAmRoutine::amoptions, IndexAmRoutine::amoptsprocnum, IndexAmRoutine::amparallelrescan, IndexAmRoutine::amparallelvacuumoptions, IndexAmRoutine::ampredlocks, IndexAmRoutine::amproperty, IndexAmRoutine::amrescan, IndexAmRoutine::amrestrpos, IndexAmRoutine::amsearcharray, IndexAmRoutine::amsearchnulls, IndexAmRoutine::amstorage, IndexAmRoutine::amstrategies, IndexAmRoutine::amsummarizing, IndexAmRoutine::amsupport, IndexAmRoutine::amtranslatecmptype, IndexAmRoutine::amtranslatestrategy, IndexAmRoutine::amusemaintenanceworkmem, IndexAmRoutine::amvacuumcleanup, IndexAmRoutine::amvalidate, btadjustmembers(), btbeginscan(), btbuild(), btbuildempty(), btbuildphasename(), btbulkdelete(), btcanreturn(), btcostestimate(), btendscan(), btestimateparallelscan(), btgetbitmap(), btgettreeheight(), btgettuple(), btinitparallelscan(), btinsert(), btmarkpos(), BTMaxStrategyNumber, BTNProcs, btoptions(), BTOPTIONS_PROC, btparallelrescan(), btproperty(), btrescan(), btrestrpos(), bttranslatecmptype(), bttranslatestrategy(), btvacuumcleanup(), btvalidate(), InvalidOid, makeNode, PG_RETURN_POINTER, VACUUM_OPTION_PARALLEL_BULKDEL, and VACUUM_OPTION_PARALLEL_COND_CLEANUP.
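
For context: the server reaches bthandler() through pg_am.amhandler. Given an open index relation, callers obtain the filled-in struct roughly like this (a sketch using the amapi.h accessor; indexrel is assumed to be an open Relation):

    IndexAmRoutine *routine = GetIndexAmRoutine(indexrel->rd_amhandler);

    if (routine->amgettuple == NULL)
        elog(ERROR, "index AM does not support ordered scans");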

◆ btinitparallelscan()

void btinitparallelscan ( void *  target)

◆ btinsert()

bool btinsert ( Relation  rel,
Datum values,
bool *  isnull,
ItemPointer  ht_ctid,
Relation  heapRel,
IndexUniqueCheck  checkUnique,
bool  indexUnchanged,
IndexInfo indexInfo 
)

Definition at line 202 of file nbtree.c.

207{
208 bool result;
209 IndexTuple itup;
210
211 /* generate an index tuple */
212 itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
213 itup->t_tid = *ht_ctid;
214
215 result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
216
217 pfree(itup);
218
219 return result;
220}
IndexTuple index_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
Definition: indextuple.c:44
bool _bt_doinsert(Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, bool indexUnchanged, Relation heapRel)
Definition: nbtinsert.c:102
ItemPointerData t_tid
Definition: itup.h:37

References _bt_doinsert(), index_form_tuple(), pfree(), RelationGetDescr, IndexTupleData::t_tid, and values.

Referenced by bthandler().

◆ btmarkpos()

void btmarkpos ( IndexScanDesc  scan)

Definition at line 474 of file nbtree.c.

475{
476 BTScanOpaque so = (BTScanOpaque) scan->opaque;
477
478 /* There may be an old mark with a pin (but no lock). */
479 BTScanPosUnpinIfPinned(so->markPos);
480
481 /*
482 * Just record the current itemIndex. If we later step to next page
483 * before releasing the marked position, _bt_steppage makes a full copy of
484 * the currPos struct in markPos. If (as often happens) the mark is moved
485 * before we leave the page, we don't have to do that work.
486 */
487 if (BTScanPosIsValid(so->currPos))
488 so->markItemIndex = so->currPos.itemIndex;
489 else
490 {
491 BTScanPosInvalidate(so->markPos);
492 so->markItemIndex = -1;
493 }
494}

References BTScanPosInvalidate, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanOpaqueData::currPos, BTScanPosData::itemIndex, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, and IndexScanDescData::opaque.

Referenced by bthandler().
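
Mark/restore is driven through the indexam wrappers, primarily by merge join. Schematically (scan is an active IndexScanDesc; illustrative):

    index_markpos(scan);        /* btmarkpos: remember the current itemIndex */
    /* ... advance the scan, e.g. with index_getnext_tid() ... */
    index_restrpos(scan);       /* btrestrpos: return to the marked position */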

◆ btparallelrescan()

void btparallelrescan ( IndexScanDesc  scan)

Definition at line 741 of file nbtree.c.

742{
743 BTParallelScanDesc btscan;
744 ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
745
746 Assert(parallel_scan);
747
748 btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
749 parallel_scan->ps_offset_am);
750
751 /*
752 * In theory, we don't need to acquire the LWLock here, because there
753 * shouldn't be any other workers running at this point, but we do so for
754 * consistency.
755 */
756 LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
757 btscan->btps_nextScanPage = InvalidBlockNumber;
758 btscan->btps_lastCurrPage = InvalidBlockNumber;
759 btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
760 LWLockRelease(&btscan->btps_lock);
761}

References Assert(), BTPARALLEL_NOT_INITIALIZED, BTParallelScanDescData::btps_lastCurrPage, BTParallelScanDescData::btps_lock, BTParallelScanDescData::btps_nextScanPage, BTParallelScanDescData::btps_pageStatus, InvalidBlockNumber, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), OffsetToPointer, IndexScanDescData::parallel_scan, and ParallelIndexScanDescData::ps_offset_am.

Referenced by bthandler().

◆ btreevacuumposting()

static BTVacuumPosting btreevacuumposting ( BTVacState vstate,
IndexTuple  posting,
OffsetNumber  updatedoffset,
int *  nremaining 
)
static

Definition at line 1663 of file nbtree.c.

1665{
1666 int live = 0;
1667 int nitem = BTreeTupleGetNPosting(posting);
1668 ItemPointer items = BTreeTupleGetPosting(posting);
1669 BTVacuumPosting vacposting = NULL;
1670
1671 for (int i = 0; i < nitem; i++)
1672 {
1673 if (!vstate->callback(items + i, vstate->callback_state))
1674 {
1675 /* Live table TID */
1676 live++;
1677 }
1678 else if (vacposting == NULL)
1679 {
1680 /*
1681 * First dead table TID encountered.
1682 *
1683 * It's now clear that we need to delete one or more dead table
1684 * TIDs, so start maintaining metadata describing how to update
1685 * existing posting list tuple.
1686 */
1687 vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1688 nitem * sizeof(uint16));
1689
1690 vacposting->itup = posting;
1691 vacposting->updatedoffset = updatedoffset;
1692 vacposting->ndeletedtids = 0;
1693 vacposting->deletetids[vacposting->ndeletedtids++] = i;
1694 }
1695 else
1696 {
1697 /* Second or subsequent dead table TID */
1698 vacposting->deletetids[vacposting->ndeletedtids++] = i;
1699 }
1700 }
1701
1702 *nremaining = live;
1703 return vacposting;
1704}
uint16_t uint16
Definition: c.h:501
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:519
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition: nbtree.h:538
IndexBulkDeleteCallback callback
Definition: nbtree.h:335
void * callback_state
Definition: nbtree.h:336
uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.h:922
uint16 ndeletedtids
Definition: nbtree.h:921
IndexTuple itup
Definition: nbtree.h:917
OffsetNumber updatedoffset
Definition: nbtree.h:918

References BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTVacState::callback, BTVacState::callback_state, BTVacuumPostingData::deletetids, i, items, BTVacuumPostingData::itup, BTVacuumPostingData::ndeletedtids, palloc(), and BTVacuumPostingData::updatedoffset.

Referenced by btvacuumpage().
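
A worked example of the contract (illustrative):

    /*
     * For a posting list tuple holding 4 TIDs where the callback reports the
     * TIDs at offsets 1 and 3 dead:
     *     vacposting->ndeletedtids == 2
     *     vacposting->deletetids[]  holds {1, 3}
     *     *nremaining == 2
     * A fully live posting list instead returns NULL with *nremaining == 4.
     */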

◆ btrescan()

void btrescan ( IndexScanDesc  scan,
ScanKey  scankey,
int  nscankeys,
ScanKey  orderbys,
int  norderbys 
)

Definition at line 381 of file nbtree.c.

383{
384 BTScanOpaque so = (BTScanOpaque) scan->opaque;
385
386 /* we aren't holding any read locks, but gotta drop the pins */
387 if (BTScanPosIsValid(so->currPos))
388 {
389 /* Before leaving current page, deal with any killed items */
390 if (so->numKilled > 0)
391 _bt_killitems(scan);
392 BTScanPosUnpinIfPinned(so->currPos);
393 BTScanPosInvalidate(so->currPos);
394 }
395
396 so->markItemIndex = -1;
397 so->needPrimScan = false;
398 so->scanBehind = false;
399 so->oppositeDirCheck = false;
400 BTScanPosUnpinIfPinned(so->markPos);
401 BTScanPosInvalidate(so->markPos);
402
403 /*
404 * Allocate tuple workspace arrays, if needed for an index-only scan and
405 * not already done in a previous rescan call. To save on palloc
406 * overhead, both workspaces are allocated as one palloc block; only this
407 * function and btendscan know that.
408 *
409 * NOTE: this data structure also makes it safe to return data from a
410 * "name" column, even though btree name_ops uses an underlying storage
411 * datatype of cstring. The risk there is that "name" is supposed to be
412 * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
413 * However, since we only return data out of tuples sitting in the
414 * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
415 * data out of the markTuples array --- running off the end of memory for
416 * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
417 * adding special-case treatment for name_ops elsewhere.
418 */
419 if (scan->xs_want_itup && so->currTuples == NULL)
420 {
421 so->currTuples = (char *) palloc(BLCKSZ * 2);
422 so->markTuples = so->currTuples + BLCKSZ;
423 }
424
425 /*
426 * Reset the scan keys
427 */
428 if (scankey && scan->numberOfKeys > 0)
429 memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
430 so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
431 so->numArrayKeys = 0; /* ditto */
432}
struct ScanKeyData * keyData
Definition: relscan.h:141

References _bt_killitems(), BTScanPosInvalidate, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, if(), IndexScanDescData::keyData, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numberOfKeys, IndexScanDescData::numberOfKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, palloc(), BTScanOpaqueData::scanBehind, and IndexScanDescData::xs_want_itup.

Referenced by bthandler().

◆ btrestrpos()

void btrestrpos ( IndexScanDesc  scan)

Definition at line 500 of file nbtree.c.

501{
502 BTScanOpaque so = (BTScanOpaque) scan->opaque;
503
504 if (so->markItemIndex >= 0)
505 {
506 /*
507 * The scan has never moved to a new page since the last mark. Just
508 * restore the itemIndex.
509 *
510 * NB: In this case we can't count on anything in so->markPos to be
511 * accurate.
512 */
513 so->currPos.itemIndex = so->markItemIndex;
514 }
515 else
516 {
517 /*
518 * The scan moved to a new page after last mark or restore, and we are
519 * now restoring to the marked page. We aren't holding any read
520 * locks, but if we're still holding the pin for the current position,
521 * we must drop it.
522 */
523 if (BTScanPosIsValid(so->currPos))
524 {
525 /* Before leaving current page, deal with any killed items */
526 if (so->numKilled > 0)
527 _bt_killitems(scan);
528 BTScanPosUnpinIfPinned(so->currPos);
529 }
530
531 if (BTScanPosIsValid(so->markPos))
532 {
533 /* bump pin on mark buffer for assignment to current buffer */
534 if (BTScanPosIsPinned(so->markPos))
535 IncrBufferRefCount(so->markPos.buf);
536 memcpy(&so->currPos, &so->markPos,
537 offsetof(BTScanPosData, items[1]) +
538 so->markPos.lastItem * sizeof(BTScanPosItem));
539 if (so->currTuples)
540 memcpy(so->currTuples, so->markTuples,
541 so->markPos.nextTupleOffset);
542 /* Reset the scan's array keys (see _bt_steppage for why) */
543 if (so->numArrayKeys)
544 {
545 _bt_start_array_keys(scan, so->currPos.dir);
546 so->needPrimScan = false;
547 }
548 }
549 else
550 BTScanPosInvalidate(so->currPos);
551 }
552}
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:5405
#define BTScanPosIsPinned(scanpos)
Definition: nbtree.h:1004
void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
Definition: nbtutils.c:611
Buffer buf
Definition: nbtree.h:964
int nextTupleOffset
Definition: nbtree.h:979
ScanDirection dir
Definition: nbtree.h:973

References _bt_killitems(), _bt_start_array_keys(), BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanPosData::buf, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::dir, if(), IncrBufferRefCount(), BTScanPosData::itemIndex, items, BTScanPosData::lastItem, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanOpaqueData::needPrimScan, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numKilled, and IndexScanDescData::opaque.

Referenced by bthandler().

◆ bttranslatecmptype()

StrategyNumber bttranslatecmptype ( CompareType  cmptype,
Oid  opfamily 
)

Definition at line 1747 of file nbtree.c.

1748{
1749 switch (cmptype)
1750 {
1751 case COMPARE_LT:
1752 return BTLessStrategyNumber;
1753 case COMPARE_LE:
1754 return BTLessEqualStrategyNumber;
1755 case COMPARE_EQ:
1756 return BTEqualStrategyNumber;
1757 case COMPARE_GE:
1758 return BTGreaterEqualStrategyNumber;
1759 case COMPARE_GT:
1760 return BTGreaterStrategyNumber;
1761 default:
1762 return InvalidStrategy;
1763 }
1764}
@ COMPARE_LE
Definition: cmptype.h:35
@ COMPARE_GT
Definition: cmptype.h:38
@ COMPARE_EQ
Definition: cmptype.h:36
@ COMPARE_GE
Definition: cmptype.h:37
@ COMPARE_LT
Definition: cmptype.h:34
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
#define InvalidStrategy
Definition: stratnum.h:24
#define BTLessStrategyNumber
Definition: stratnum.h:29
#define BTEqualStrategyNumber
Definition: stratnum.h:31
#define BTLessEqualStrategyNumber
Definition: stratnum.h:30
#define BTGreaterEqualStrategyNumber
Definition: stratnum.h:32

References BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, COMPARE_EQ, COMPARE_GE, COMPARE_GT, COMPARE_LE, COMPARE_LT, and InvalidStrategy.

Referenced by bthandler().

◆ bttranslatestrategy()

CompareType bttranslatestrategy ( StrategyNumber  strategy,
Oid  opfamily 
)

Definition at line 1727 of file nbtree.c.

1728{
1729 switch (strategy)
1730 {
1731 case BTLessStrategyNumber:
1732 return COMPARE_LT;
1733 case BTLessEqualStrategyNumber:
1734 return COMPARE_LE;
1735 case BTEqualStrategyNumber:
1736 return COMPARE_EQ;
1737 case BTGreaterEqualStrategyNumber:
1738 return COMPARE_GE;
1739 case BTGreaterStrategyNumber:
1740 return COMPARE_GT;
1741 default:
1742 return COMPARE_INVALID;
1743 }
1744}
@ COMPARE_INVALID
Definition: cmptype.h:33

References BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, COMPARE_EQ, COMPARE_GE, COMPARE_GT, COMPARE_INVALID, COMPARE_LE, and COMPARE_LT.

Referenced by bthandler().
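
The two translators invert each other across btree's five strategies, and both ignore their opfamily argument. For example (illustrative):

    Assert(bttranslatestrategy(BTLessStrategyNumber, InvalidOid) == COMPARE_LT);
    Assert(bttranslatecmptype(COMPARE_LT, InvalidOid) == BTLessStrategyNumber);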

◆ btvacuumcleanup()

IndexBulkDeleteResult * btvacuumcleanup ( IndexVacuumInfo info,
IndexBulkDeleteResult stats 
)

Definition at line 1063 of file nbtree.c.

1064{
1065 BlockNumber num_delpages;
1066
1067 /* No-op in ANALYZE ONLY mode */
1068 if (info->analyze_only)
1069 return stats;
1070
1071 /*
1072 * If btbulkdelete was called, we need not do anything (we just maintain
1073 * the information used within _bt_vacuum_needs_cleanup() by calling
1074 * _bt_set_cleanup_info() below).
1075 *
1076 * If btbulkdelete was _not_ called, then we have a choice to make: we
1077 * must decide whether or not a btvacuumscan() call is needed now (i.e.
1078 * whether the ongoing VACUUM operation can entirely avoid a physical scan
1079 * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
1080 * now.
1081 */
1082 if (stats == NULL)
1083 {
1084 /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
1085 if (!_bt_vacuum_needs_cleanup(info->index))
1086 return NULL;
1087
1088 /*
1089 * Since we aren't going to actually delete any leaf items, there's no
1090 * need to go through all the vacuum-cycle-ID pushups here.
1091 *
1092 * Posting list tuples are a source of inaccuracy for cleanup-only
1093 * scans. btvacuumscan() will assume that the number of index tuples
1094 * from each page can be used as num_index_tuples, even though
1095 * num_index_tuples is supposed to represent the number of TIDs in the
1096 * index. This naive approach can underestimate the number of tuples
1097 * in the index significantly.
1098 *
1099 * We handle the problem by making num_index_tuples an estimate in
1100 * cleanup-only case.
1101 */
1102 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
1103 btvacuumscan(info, stats, NULL, NULL, 0);
1104 stats->estimated_count = true;
1105 }
1106
1107 /*
1108 * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
1109 *
1110 * num_delpages is the number of deleted pages now in the index that were
1111 * not safe to place in the FSM to be recycled just yet. num_delpages is
1112 * greater than 0 only when _bt_pagedel() actually deleted pages during
1113 * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
1114 * have failed to place any newly deleted pages in the FSM just moments
1115 * ago. (Actually, there are edge cases where recycling of the current
1116 * VACUUM's newly deleted pages does not even become safe by the time the
1117 * next VACUUM comes around. See nbtree/README.)
1118 */
1119 Assert(stats->pages_deleted >= stats->pages_free);
1120 num_delpages = stats->pages_deleted - stats->pages_free;
1121 _bt_set_cleanup_info(info->index, num_delpages);
1122
1123 /*
1124 * It's quite possible for us to be fooled by concurrent page splits into
1125 * double-counting some index tuples, so disbelieve any total that exceeds
1126 * the underlying heap's count ... if we know that accurately. Otherwise
1127 * this might just make matters worse.
1128 */
1129 if (!info->estimated_count)
1130 {
1131 if (stats->num_index_tuples > info->num_heap_tuples)
1132 stats->num_index_tuples = info->num_heap_tuples;
1133 }
1134
1135 return stats;
1136}
uint32 BlockNumber
Definition: block.h:31
void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
Definition: nbtpage.c:232
bool _bt_vacuum_needs_cleanup(Relation rel)
Definition: nbtpage.c:179
BlockNumber pages_deleted
Definition: genam.h:105
BlockNumber pages_free
Definition: genam.h:106
double num_index_tuples
Definition: genam.h:102
double num_heap_tuples
Definition: genam.h:75
bool analyze_only
Definition: genam.h:71
bool estimated_count
Definition: genam.h:73

References _bt_set_cleanup_info(), _bt_vacuum_needs_cleanup(), IndexVacuumInfo::analyze_only, Assert(), btvacuumscan(), IndexVacuumInfo::estimated_count, IndexBulkDeleteResult::estimated_count, IndexVacuumInfo::index, IndexVacuumInfo::num_heap_tuples, IndexBulkDeleteResult::num_index_tuples, IndexBulkDeleteResult::pages_deleted, IndexBulkDeleteResult::pages_free, and palloc0().

Referenced by bthandler().
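
The interesting part of btvacuumcleanup() is its control flow: do nothing under ANALYZE ONLY, scan the index only when btbulkdelete() never ran and the metapage says cleanup is due, then record num_delpages and clamp the tuple count against the heap. The sketch below distills that branching with hypothetical stand-ins (CleanupStats, index_needs_cleanup(), scan_whole_index(), save_num_delpages()) for the PostgreSQL types and helpers; it illustrates the decision structure, not the real function.

    #include <stdbool.h>
    #include <stdlib.h>

    /* Hypothetical reduction of IndexBulkDeleteResult */
    typedef struct CleanupStats
    {
        bool        estimated_count;
        double      num_index_tuples;
        unsigned    pages_deleted;
        unsigned    pages_free;
    } CleanupStats;

    /* Trivial stubs standing in for _bt_vacuum_needs_cleanup(),
     * btvacuumscan(), and _bt_set_cleanup_info() */
    static bool index_needs_cleanup(void) { return true; }
    static void scan_whole_index(CleanupStats *stats) { stats->num_index_tuples = 100.0; }
    static void save_num_delpages(unsigned num_delpages) { (void) num_delpages; }

    static CleanupStats *
    vacuum_cleanup(CleanupStats *stats, bool analyze_only,
                   bool heap_count_accurate, double num_heap_tuples)
    {
        if (analyze_only)
            return stats;           /* no-op in ANALYZE ONLY mode */

        if (stats == NULL)
        {
            /* btbulkdelete never ran: scan only if the metapage says we must */
            if (!index_needs_cleanup())
                return NULL;

            stats = calloc(1, sizeof(CleanupStats));
            scan_whole_index(stats);
            stats->estimated_count = true;  /* cleanup-only counts are estimates */
        }

        /* deleted-but-not-yet-recyclable pages drive the next cleanup decision */
        save_num_delpages(stats->pages_deleted - stats->pages_free);

        /* concurrent splits can double-count tuples: clamp to the heap's count */
        if (heap_count_accurate && stats->num_index_tuples > num_heap_tuples)
            stats->num_index_tuples = num_heap_tuples;

        return stats;
    }

    int
    main(void)
    {
        /* NULL stats: the cleanup-only path scans, then clamps 100.0 to 50.0 */
        CleanupStats *stats = vacuum_cleanup(NULL, false, true, 50.0);

        free(stats);
        return 0;
    }

The clamp mirrors the comment in the listing: a total inflated by concurrent splits is only corrected when the heap count is known accurately, since clamping against an estimate could make matters worse.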

◆ btvacuumpage()

static BlockNumber btvacuumpage ( BTVacState vstate,
Buffer  buf 
)
static

Definition at line 1325 of file nbtree.c.

1326{
1327 IndexVacuumInfo *info = vstate->info;
1328 IndexBulkDeleteResult *stats = vstate->stats;
1329 IndexBulkDeleteCallback callback = vstate->callback;
1330 void *callback_state = vstate->callback_state;
1331 Relation rel = info->index;
1332 Relation heaprel = info->heaprel;
1333 bool attempt_pagedel;
1334 BlockNumber blkno,
1335 backtrack_to;
1336 BlockNumber scanblkno = BufferGetBlockNumber(buf);
1337 Page page;
1338 BTPageOpaque opaque;
1339
1340 blkno = scanblkno;
1341
1342backtrack:
1343
1344 attempt_pagedel = false;
1345 backtrack_to = P_NONE;
1346
1347 _bt_lockbuf(rel, buf, BT_READ);
1348 page = BufferGetPage(buf);
1349 opaque = NULL;
1350 if (!PageIsNew(page))
1351 {
1352 _bt_checkpage(rel, buf);
1353 opaque = BTPageGetOpaque(page);
1354 }
1355
1356 Assert(blkno <= scanblkno);
1357 if (blkno != scanblkno)
1358 {
1359 /*
1360 * We're backtracking.
1361 *
1362 * We followed a right link to a sibling leaf page (a page that
1363 * happens to be from a block located before scanblkno). The only
1364 * case we want to do anything with is a live leaf page having the
1365 * current vacuum cycle ID.
1366 *
1367 * The page had better be in a state that's consistent with what we
1368 * expect. Check for conditions that imply corruption in passing. It
1369 * can't be half-dead because only an interrupted VACUUM process can
1370 * leave pages in that state, so we'd definitely have dealt with it
1371 * back when the page was the scanblkno page (half-dead pages are
1372 * always marked fully deleted by _bt_pagedel(), barring corruption).
1373 */
1374 if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1375 {
1376 Assert(false);
1377 ereport(LOG,
1378 (errcode(ERRCODE_INDEX_CORRUPTED),
1379 errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1380 blkno, scanblkno, RelationGetRelationName(rel))));
1381 _bt_relbuf(rel, buf);
1382 return scanblkno;
1383 }
1384
1385 /*
1386 * We may have already processed the page in an earlier call, when the
1387 * page was scanblkno. This happens when the leaf page split occurred
1388 * after the scan began, but before the right sibling page became the
1389 * scanblkno.
1390 *
1391 * Page may also have been deleted by current btvacuumpage() call,
1392 * since _bt_pagedel() sometimes deletes the right sibling page of
1393 * scanblkno in passing (it does so after we decided where to
1394 * backtrack to). We don't need to process this page as a deleted
1395 * page a second time now (in fact, it would be wrong to count it as a
1396 * deleted page in the bulk delete statistics a second time).
1397 */
1398 if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1399 {
1400 /* Done with current scanblkno (and all lower split pages) */
1401 _bt_relbuf(rel, buf);
1402 return scanblkno;
1403 }
1404 }
1405
1406 if (!opaque || BTPageIsRecyclable(page, heaprel))
1407 {
1408 /* Okay to recycle this page (which could be leaf or internal) */
1409 RecordFreeIndexPage(rel, blkno);
1410 stats->pages_deleted++;
1411 stats->pages_free++;
1412 }
1413 else if (P_ISDELETED(opaque))
1414 {
1415 /*
1416 * Already deleted page (which could be leaf or internal). Can't
1417 * recycle yet.
1418 */
1419 stats->pages_deleted++;
1420 }
1421 else if (P_ISHALFDEAD(opaque))
1422 {
1423 /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1424 attempt_pagedel = true;
1425
1426 /*
1427 * _bt_pagedel() will increment both pages_newly_deleted and
1428 * pages_deleted stats in all cases (barring corruption)
1429 */
1430 }
1431 else if (P_ISLEAF(opaque))
1432 {
1433 OffsetNumber deletable[MaxIndexTuplesPerPage];
1434 int ndeletable;
1435 BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1436 int nupdatable;
1437 OffsetNumber offnum,
1438 minoff,
1439 maxoff;
1440 int nhtidsdead,
1441 nhtidslive;
1442
1443 /*
1444 * Trade in the initial read lock for a full cleanup lock on this
1445 * page. We must get such a lock on every leaf page over the course
1446 * of the vacuum scan, whether or not it actually contains any
1447 * deletable tuples --- see nbtree/README.
1448 */
1449 _bt_upgradelockbufcleanup(rel, buf);
1450
1451 /*
1452 * Check whether we need to backtrack to earlier pages. What we are
1453 * concerned about is a page split that happened since we started the
1454 * vacuum scan. If the split moved tuples on the right half of the
1455 * split (i.e. the tuples that sort high) to a block that we already
1456 * passed over, then we might have missed the tuples. We need to
1457 * backtrack now. (Must do this before possibly clearing btpo_cycleid
1458 * or deleting scanblkno page below!)
1459 */
1460 if (vstate->cycleid != 0 &&
1461 opaque->btpo_cycleid == vstate->cycleid &&
1462 !(opaque->btpo_flags & BTP_SPLIT_END) &&
1463 !P_RIGHTMOST(opaque) &&
1464 opaque->btpo_next < scanblkno)
1465 backtrack_to = opaque->btpo_next;
1466
1467 ndeletable = 0;
1468 nupdatable = 0;
1469 minoff = P_FIRSTDATAKEY(opaque);
1470 maxoff = PageGetMaxOffsetNumber(page);
1471 nhtidsdead = 0;
1472 nhtidslive = 0;
1473 if (callback)
1474 {
1475 /* btbulkdelete callback tells us what to delete (or update) */
1476 for (offnum = minoff;
1477 offnum <= maxoff;
1478 offnum = OffsetNumberNext(offnum))
1479 {
1480 IndexTuple itup;
1481
1482 itup = (IndexTuple) PageGetItem(page,
1483 PageGetItemId(page, offnum));
1484
1485 Assert(!BTreeTupleIsPivot(itup));
1486 if (!BTreeTupleIsPosting(itup))
1487 {
1488 /* Regular tuple, standard table TID representation */
1489 if (callback(&itup->t_tid, callback_state))
1490 {
1491 deletable[ndeletable++] = offnum;
1492 nhtidsdead++;
1493 }
1494 else
1495 nhtidslive++;
1496 }
1497 else
1498 {
1499 BTVacuumPosting vacposting;
1500 int nremaining;
1501
1502 /* Posting list tuple */
1503 vacposting = btreevacuumposting(vstate, itup, offnum,
1504 &nremaining);
1505 if (vacposting == NULL)
1506 {
1507 /*
1508 * All table TIDs from the posting tuple remain, so no
1509 * delete or update required
1510 */
1511 Assert(nremaining == BTreeTupleGetNPosting(itup));
1512 }
1513 else if (nremaining > 0)
1514 {
1515
1516 /*
1517 * Store metadata about posting list tuple in
1518 * updatable array for entire page. Existing tuple
1519 * will be updated during the later call to
1520 * _bt_delitems_vacuum().
1521 */
1522 Assert(nremaining < BTreeTupleGetNPosting(itup));
1523 updatable[nupdatable++] = vacposting;
1524 nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1525 }
1526 else
1527 {
1528 /*
1529 * All table TIDs from the posting list must be
1530 * deleted. We'll delete the index tuple completely
1531 * (no update required).
1532 */
1533 Assert(nremaining == 0);
1534 deletable[ndeletable++] = offnum;
1535 nhtidsdead += BTreeTupleGetNPosting(itup);
1536 pfree(vacposting);
1537 }
1538
1539 nhtidslive += nremaining;
1540 }
1541 }
1542 }
1543
1544 /*
1545 * Apply any needed deletes or updates. We issue just one
1546 * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1547 */
1548 if (ndeletable > 0 || nupdatable > 0)
1549 {
1550 Assert(nhtidsdead >= ndeletable + nupdatable);
1551 _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1552 nupdatable);
1553
1554 stats->tuples_removed += nhtidsdead;
1555 /* must recompute maxoff */
1556 maxoff = PageGetMaxOffsetNumber(page);
1557
1558 /* can't leak memory here */
1559 for (int i = 0; i < nupdatable; i++)
1560 pfree(updatable[i]);
1561 }
1562 else
1563 {
1564 /*
1565 * If the leaf page has been split during this vacuum cycle, it
1566 * seems worth expending a write to clear btpo_cycleid even if we
1567 * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1568 * takes care of this.) This ensures we won't process the page
1569 * again.
1570 *
1571 * We treat this like a hint-bit update because there's no need to
1572 * WAL-log it.
1573 */
1574 Assert(nhtidsdead == 0);
1575 if (vstate->cycleid != 0 &&
1576 opaque->btpo_cycleid == vstate->cycleid)
1577 {
1578 opaque->btpo_cycleid = 0;
1579 MarkBufferDirtyHint(buf, true);
1580 }
1581 }
1582
1583 /*
1584 * If the leaf page is now empty, try to delete it; else count the
1585 * live tuples (live table TIDs in posting lists are counted as
1586 * separate live tuples). We don't delete when backtracking, though,
1587 * since that would require teaching _bt_pagedel() about backtracking
1588 * (doesn't seem worth adding more complexity to deal with that).
1589 *
1590 * We don't count the number of live TIDs during cleanup-only calls to
1591 * btvacuumscan (i.e. when callback is not set). We count the number
1592 * of index tuples directly instead. This avoids the expense of
1593 * directly examining all of the tuples on each page. VACUUM will
1594 * treat num_index_tuples as an estimate in cleanup-only case, so it
1595 * doesn't matter that this underestimates num_index_tuples
1596 * significantly in some cases.
1597 */
1598 if (minoff > maxoff)
1599 attempt_pagedel = (blkno == scanblkno);
1600 else if (callback)
1601 stats->num_index_tuples += nhtidslive;
1602 else
1603 stats->num_index_tuples += maxoff - minoff + 1;
1604
1605 Assert(!attempt_pagedel || nhtidslive == 0);
1606 }
1607
1608 if (attempt_pagedel)
1609 {
1610 MemoryContext oldcontext;
1611
1612 /* Run pagedel in a temp context to avoid memory leakage */
1613 MemoryContextReset(vstate->pagedelcontext);
1614 oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1615
1616 /*
1617 * _bt_pagedel maintains the bulk delete stats on our behalf;
1618 * pages_newly_deleted and pages_deleted are likely to be incremented
1619 * during call
1620 */
1621 Assert(blkno == scanblkno);
1622 _bt_pagedel(rel, buf, vstate);
1623
1624 MemoryContextSwitchTo(oldcontext);
1625 /* pagedel released buffer, so we shouldn't */
1626 }
1627 else
1628 _bt_relbuf(rel, buf);
1629
1630 if (backtrack_to != P_NONE)
1631 {
1632 blkno = backtrack_to;
1633
1634 /* check for vacuum delay while not holding any buffer lock */
1635 vacuum_delay_point(false);
1636
1637 /*
1638 * We can't use _bt_getbuf() here because it always applies
1639 * _bt_checkpage(), which will barf on an all-zero page. We want to
1640 * recycle all-zero pages, not fail. Also, we want to use a
1641 * nondefault buffer access strategy.
1642 */
1643 buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1644 info->strategy);
1645 goto backtrack;
1646 }
1647
1648 return scanblkno;
1649}
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:4231
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:5437
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:805
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:417
@ RBM_NORMAL
Definition: bufmgr.h:46
static Item PageGetItem(const PageData *page, const ItemIdData *itemId)
Definition: bufpage.h:354
static bool PageIsNew(const PageData *page)
Definition: bufpage.h:234
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:244
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition: bufpage.h:372
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1158
int errcode(int sqlerrcode)
Definition: elog.c:854
#define LOG
Definition: elog.h:31
#define ereport(elevel,...)
Definition: elog.h:149
bool(* IndexBulkDeleteCallback)(ItemPointer itemptr, void *state)
Definition: genam.h:110
void RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
Definition: indexfsm.c:52
IndexTupleData * IndexTuple
Definition: itup.h:53
#define MaxIndexTuplesPerPage
Definition: itup.h:181
void MemoryContextReset(MemoryContext context)
Definition: mcxt.c:414
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1023
void _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
Definition: nbtpage.c:1802
void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
Definition: nbtpage.c:1154
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:797
void _bt_upgradelockbufcleanup(Relation rel, Buffer buf)
Definition: nbtpage.c:1109
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1039
static BTVacuumPosting btreevacuumposting(BTVacState *vstate, IndexTuple posting, OffsetNumber updatedoffset, int *nremaining)
Definition: nbtree.c:1663
#define P_ISHALFDEAD(opaque)
Definition: nbtree.h:225
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:481
#define P_ISLEAF(opaque)
Definition: nbtree.h:221
#define BTPageGetOpaque(page)
Definition: nbtree.h:74
#define P_ISDELETED(opaque)
Definition: nbtree.h:223
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:370
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:220
#define BT_READ
Definition: nbtree.h:730
static bool BTPageIsRecyclable(Page page, Relation heaprel)
Definition: nbtree.h:292
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:493
#define BTP_SPLIT_END
Definition: nbtree.h:82
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:124
static char * buf
Definition: pg_test_fsync.c:72
#define RelationGetRelationName(relation)
Definition: rel.h:550
@ MAIN_FORKNUM
Definition: relpath.h:58
BlockNumber btpo_next
Definition: nbtree.h:66
uint16 btpo_flags
Definition: nbtree.h:68
BTCycleId btpo_cycleid
Definition: nbtree.h:69
IndexBulkDeleteResult * stats
Definition: nbtree.h:334
BTCycleId cycleid
Definition: nbtree.h:337
MemoryContext pagedelcontext
Definition: nbtree.h:338
IndexVacuumInfo * info
Definition: nbtree.h:333
double tuples_removed
Definition: genam.h:103
BufferAccessStrategy strategy
Definition: genam.h:76
Relation heaprel
Definition: genam.h:70
void vacuum_delay_point(bool is_analyze)
Definition: vacuum.c:2404

References _bt_checkpage(), _bt_delitems_vacuum(), _bt_lockbuf(), _bt_pagedel(), _bt_relbuf(), _bt_upgradelockbufcleanup(), Assert(), BT_READ, BTP_SPLIT_END, BTPageGetOpaque, BTPageIsRecyclable(), BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTreeTupleGetNPosting(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), btreevacuumposting(), buf, BufferGetBlockNumber(), BufferGetPage(), BTVacState::callback, callback(), BTVacState::callback_state, BTVacState::cycleid, ereport, errcode(), errmsg_internal(), IndexVacuumInfo::heaprel, i, IndexVacuumInfo::index, BTVacState::info, LOG, MAIN_FORKNUM, MarkBufferDirtyHint(), MaxIndexTuplesPerPage, MemoryContextReset(), MemoryContextSwitchTo(), IndexBulkDeleteResult::num_index_tuples, OffsetNumberNext, P_FIRSTDATAKEY, P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, P_NONE, P_RIGHTMOST, BTVacState::pagedelcontext, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), PageIsNew(), IndexBulkDeleteResult::pages_deleted, IndexBulkDeleteResult::pages_free, pfree(), RBM_NORMAL, ReadBufferExtended(), RecordFreeIndexPage(), RelationGetRelationName, BTVacState::stats, IndexVacuumInfo::strategy, IndexTupleData::t_tid, IndexBulkDeleteResult::tuples_removed, and vacuum_delay_point().

Referenced by btvacuumscan().
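
Most of the leaf-page loop in btvacuumpage() is bookkeeping around a three-way decision per posting-list tuple, driven by btreevacuumposting(): keep the tuple untouched when every heap TID survives, rewrite it smaller when only some survive, or delete it outright when none do (a plain tuple is, in effect, the one-TID case). A minimal sketch of that classification follows, with plain arrays standing in for posting lists and a callback standing in for IndexBulkDeleteCallback; all names here are illustrative, not PostgreSQL's.

    #include <stdbool.h>
    #include <stdio.h>

    typedef bool (*DeadTidCallback) (unsigned tid);

    typedef enum PostingAction
    {
        POSTING_KEEP,           /* all TIDs live: no WAL-logged change */
        POSTING_UPDATE,         /* some TIDs dead: rewrite a smaller tuple */
        POSTING_DELETE          /* every TID dead: remove the whole tuple */
    } PostingAction;

    static PostingAction
    classify_posting(const unsigned *tids, int ntids,
                     DeadTidCallback dead, int *nremaining)
    {
        int live = 0;

        for (int i = 0; i < ntids; i++)
            if (!dead(tids[i]))
                live++;

        *nremaining = live;
        if (live == ntids)
            return POSTING_KEEP;
        return live > 0 ? POSTING_UPDATE : POSTING_DELETE;
    }

    static bool
    even_tids_are_dead(unsigned tid)
    {
        return tid % 2 == 0;
    }

    int
    main(void)
    {
        unsigned tids[] = {1, 2, 3, 4, 5};
        int nremaining;
        PostingAction act = classify_posting(tids, 5, even_tids_are_dead,
                                             &nremaining);

        /* 3 of 5 TIDs survive, so the tuple would be updated in place */
        printf("action=%d nremaining=%d\n", act, nremaining);
        return 0;
    }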

◆ btvacuumscan()

static void btvacuumscan ( IndexVacuumInfo info,
IndexBulkDeleteResult stats,
IndexBulkDeleteCallback  callback,
void *  callback_state,
BTCycleId  cycleid 
)
static

Definition at line 1151 of file nbtree.c.

1154{
1155 Relation rel = info->index;
1156 BTVacState vstate;
1157 BlockNumber num_pages;
1158 bool needLock;
1159 BlockRangeReadStreamPrivate p;
1160 ReadStream *stream = NULL;
1161
1162 /*
1163 * Reset fields that track information about the entire index now. This
1164 * avoids double-counting in the case where a single VACUUM command
1165 * requires multiple scans of the index.
1166 *
1167 * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
1168 * since they track information about the VACUUM command, and so must last
1169 * across each call to btvacuumscan().
1170 *
1171 * (Note that pages_free is treated as state about the whole index, not
1172 * the current VACUUM. This is appropriate because RecordFreeIndexPage()
1173 * calls are idempotent, and get repeated for the same deleted pages in
1174 * some scenarios. The point for us is to track the number of recyclable
1175 * pages in the index at the end of the VACUUM command.)
1176 */
1177 stats->num_pages = 0;
1178 stats->num_index_tuples = 0;
1179 stats->pages_deleted = 0;
1180 stats->pages_free = 0;
1181
1182 /* Set up info to pass down to btvacuumpage */
1183 vstate.info = info;
1184 vstate.stats = stats;
1185 vstate.callback = callback;
1186 vstate.callback_state = callback_state;
1187 vstate.cycleid = cycleid;
1188
1189 /* Create a temporary memory context to run _bt_pagedel in */
1190 vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
1191 "_bt_pagedel",
1192 ALLOCSET_DEFAULT_SIZES);
1193
1194 /* Initialize vstate fields used by _bt_pendingfsm_finalize */
1195 vstate.bufsize = 0;
1196 vstate.maxbufsize = 0;
1197 vstate.pendingpages = NULL;
1198 vstate.npendingpages = 0;
1199 /* Consider applying _bt_pendingfsm_finalize optimization */
1200 _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
1201
1202 /*
1203 * The outer loop iterates over all index pages except the metapage, in
1204 * physical order (we hope the kernel will cooperate in providing
1205 * read-ahead for speed). It is critical that we visit all leaf pages,
1206 * including ones added after we start the scan, else we might fail to
1207 * delete some deletable tuples. Hence, we must repeatedly check the
1208 * relation length. We must acquire the relation-extension lock while
1209 * doing so to avoid a race condition: if someone else is extending the
1210 * relation, there is a window where bufmgr/smgr have created a new
1211 * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1212 * we manage to scan such a page here, we'll improperly assume it can be
1213 * recycled. Taking the lock synchronizes things enough to prevent a
1214 * problem: either num_pages won't include the new page, or _bt_getbuf
1215 * already has write lock on the buffer and it will be fully initialized
1216 * before we can examine it. Also, we need not worry if a page is added
1217 * immediately after we look; the page splitting code already has
1218 * write-lock on the left page before it adds a right page, so we must
1219 * already have processed any tuples due to be moved into such a page.
1220 *
1221 * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
1222 * think the use of the extension lock is still required.
1223 *
1224 * We can skip locking for new or temp relations, however, since no one
1225 * else could be accessing them.
1226 */
1227 needLock = !RELATION_IS_LOCAL(rel);
1228
1229 p.current_blocknum = BTREE_METAPAGE + 1;
1230
1231 /*
1232 * It is safe to use batchmode as block_range_read_stream_cb takes no
1233 * locks.
1234 */
1235 stream = read_stream_begin_relation(READ_STREAM_FULL |
1236 READ_STREAM_USE_BATCHING,
1237 info->strategy,
1238 rel,
1239 MAIN_FORKNUM,
1240 block_range_read_stream_cb,
1241 &p,
1242 0);
1243 for (;;)
1244 {
1245 /* Get the current relation length */
1246 if (needLock)
1247 LockRelationForExtension(rel, ExclusiveLock);
1248 num_pages = RelationGetNumberOfBlocks(rel);
1249 if (needLock)
1250 UnlockRelationForExtension(rel, ExclusiveLock);
1251
1252 if (info->report_progress)
1253 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1254 num_pages);
1255
1256 /* Quit if we've scanned the whole relation */
1257 if (p.current_blocknum >= num_pages)
1258 break;
1259
1260 p.last_exclusive = num_pages;
1261
1262 /* Iterate over pages, then loop back to recheck relation length */
1263 while (true)
1264 {
1265 BlockNumber current_block;
1266 Buffer buf;
1267
1268 /* call vacuum_delay_point while not holding any buffer lock */
1269 vacuum_delay_point(false);
1270
1271 buf = read_stream_next_buffer(stream, NULL);
1272
1273 if (!BufferIsValid(buf))
1274 break;
1275
1276 current_block = btvacuumpage(&vstate, buf);
1277
1278 if (info->report_progress)
1279 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1280 current_block);
1281 }
1282
1283 /*
1284 * We have to reset the read stream to use it again. After returning
1285 * InvalidBuffer, the read stream API won't invoke our callback again
1286 * until the stream has been reset.
1287 */
1288 read_stream_reset(stream);
1289 }
1290
1291 read_stream_end(stream);
1292
1293 /* Set statistics num_pages field to final size of index */
1294 stats->num_pages = num_pages;
1295
1296 MemoryContextDelete(vstate.pagedelcontext);
1297
1298 /*
1299 * If there were any calls to _bt_pagedel() during scan of the index then
1300 * see if any of the resulting pages can be placed in the FSM now. When
1301 * it's not safe we'll have to leave it up to a future VACUUM operation.
1302 *
1303 * Finally, if we placed any pages in the FSM (either just now or during
1304 * the scan), forcibly update the upper-level FSM pages to ensure that
1305 * searchers can find them.
1306 */
1307 _bt_pendingfsm_finalize(rel, &vstate);
1308 if (stats->pages_free > 0)
1309 IndexFreeSpaceMapVacuum(rel);
1310}
void pgstat_progress_update_param(int index, int64 val)
int Buffer
Definition: buf.h:23
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:283
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:368
void IndexFreeSpaceMapVacuum(Relation rel)
Definition: indexfsm.c:71
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:474
#define ExclusiveLock
Definition: lockdefs.h:42
MemoryContext CurrentMemoryContext
Definition: mcxt.c:159
#define AllocSetContextCreate
Definition: memutils.h:149
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:180
void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
Definition: nbtpage.c:2996
void _bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
Definition: nbtpage.c:2954
static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf)
Definition: nbtree.c:1325
#define PROGRESS_SCAN_BLOCKS_DONE
Definition: progress.h:125
#define PROGRESS_SCAN_BLOCKS_TOTAL
Definition: progress.h:124
void read_stream_reset(ReadStream *stream)
Definition: read_stream.c:1010
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
Definition: read_stream.c:770
ReadStream * read_stream_begin_relation(int flags, BufferAccessStrategy strategy, Relation rel, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
Definition: read_stream.c:716
void read_stream_end(ReadStream *stream)
Definition: read_stream.c:1055
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition: read_stream.c:162
#define READ_STREAM_USE_BATCHING
Definition: read_stream.h:64
#define READ_STREAM_FULL
Definition: read_stream.h:43
#define RELATION_IS_LOCAL(relation)
Definition: rel.h:659
BTPendingFSM * pendingpages
Definition: nbtree.h:345
int npendingpages
Definition: nbtree.h:346
int bufsize
Definition: nbtree.h:343
int maxbufsize
Definition: nbtree.h:344
BlockNumber num_pages
Definition: genam.h:100
bool report_progress
Definition: genam.h:72

References _bt_pendingfsm_finalize(), _bt_pendingfsm_init(), ALLOCSET_DEFAULT_SIZES, AllocSetContextCreate, block_range_read_stream_cb(), BTREE_METAPAGE, btvacuumpage(), buf, BufferIsValid(), BTVacState::bufsize, BTVacState::callback, callback(), BTVacState::callback_state, BlockRangeReadStreamPrivate::current_blocknum, CurrentMemoryContext, BTVacState::cycleid, ExclusiveLock, IndexVacuumInfo::index, IndexFreeSpaceMapVacuum(), BTVacState::info, BlockRangeReadStreamPrivate::last_exclusive, LockRelationForExtension(), MAIN_FORKNUM, BTVacState::maxbufsize, MemoryContextDelete(), BTVacState::npendingpages, IndexBulkDeleteResult::num_index_tuples, IndexBulkDeleteResult::num_pages, BTVacState::pagedelcontext, IndexBulkDeleteResult::pages_deleted, IndexBulkDeleteResult::pages_free, BTVacState::pendingpages, pgstat_progress_update_param(), PROGRESS_SCAN_BLOCKS_DONE, PROGRESS_SCAN_BLOCKS_TOTAL, read_stream_begin_relation(), read_stream_end(), READ_STREAM_FULL, read_stream_next_buffer(), read_stream_reset(), READ_STREAM_USE_BATCHING, RELATION_IS_LOCAL, RelationGetNumberOfBlocks, IndexVacuumInfo::report_progress, BTVacState::stats, IndexVacuumInfo::strategy, UnlockRelationForExtension(), and vacuum_delay_point().

Referenced by btbulkdelete(), and btvacuumcleanup().
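
The loop structure in btvacuumscan() is worth seeing in isolation: an inner pass streams blocks up to a recorded relation length, and an outer loop rechecks the length and restarts the stream, so that leaf pages added after the scan began are still visited. The standalone sketch below reduces the read stream to a simple block cursor and fakes one relation extension mid-scan; all names are illustrative, and the locking, buffer, and progress-reporting machinery is omitted.

    #include <stdio.h>

    /* Stand-in for the block-range read stream: hands out blocks in
     * [current, last_exclusive), then reports exhaustion until the
     * caller raises last_exclusive again (the analogue of a reset). */
    typedef struct BlockCursor
    {
        unsigned    current;
        unsigned    last_exclusive;
    } BlockCursor;

    static int
    next_block(BlockCursor *c)
    {
        if (c->current >= c->last_exclusive)
            return -1;          /* stream exhausted; outer loop must reset */
        return (int) c->current++;
    }

    /* Fake relation that is extended once while the scan is underway */
    static unsigned
    relation_length(void)
    {
        static int calls = 0;

        return ++calls == 1 ? 4 : 6;
    }

    int
    main(void)
    {
        BlockCursor c = {1, 0}; /* block 0 is the metapage: skip it */

        for (;;)
        {
            unsigned num_pages = relation_length();  /* recheck every pass */
            int blk;

            if (c.current >= num_pages)
                break;          /* scanned everything, even late additions */
            c.last_exclusive = num_pages;

            while ((blk = next_block(&c)) != -1)
                printf("vacuuming block %d\n", blk);
        }
        return 0;
    }

Raising last_exclusive plays the role of read_stream_reset() in the real function: after the stream returns InvalidBuffer, the outer loop takes a fresh relation length and resumes from where the previous pass stopped.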