PostgreSQL Source Code git master
nbtree.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "access/stratnum.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "nodes/execnodes.h"
#include "pgstat.h"
#include "storage/bulk_write.h"
#include "storage/condition_variable.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/read_stream.h"
#include "utils/datum.h"
#include "utils/fmgrprotos.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"

Data Structures

struct  BTParallelScanDescData
 

Typedefs

typedef struct BTParallelScanDescData BTParallelScanDescData
 
typedef struct BTParallelScanDescData * BTParallelScanDesc
 

Enumerations

enum  BTPS_State {
  BTPARALLEL_NOT_INITIALIZED , BTPARALLEL_NEED_PRIMSCAN , BTPARALLEL_ADVANCING , BTPARALLEL_IDLE ,
  BTPARALLEL_DONE
}
 

Functions

static bool _bt_start_prim_scan (IndexScanDesc scan)
 
static void _bt_parallel_serialize_arrays (Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
 
static void _bt_parallel_restore_arrays (Relation rel, BTParallelScanDesc btscan, BTScanOpaque so)
 
static void btvacuumscan (IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid)
 
static BlockNumber btvacuumpage (BTVacState *vstate, Buffer buf)
 
static BTVacuumPosting btreevacuumposting (BTVacState *vstate, IndexTuple posting, OffsetNumber updatedoffset, int *nremaining)
 
Datum bthandler (PG_FUNCTION_ARGS)
 
void btbuildempty (Relation index)
 
bool btinsert (Relation rel, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo)
 
bool btgettuple (IndexScanDesc scan, ScanDirection dir)
 
int64 btgetbitmap (IndexScanDesc scan, TIDBitmap *tbm)
 
IndexScanDesc btbeginscan (Relation rel, int nkeys, int norderbys)
 
void btrescan (IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)
 
void btendscan (IndexScanDesc scan)
 
void btmarkpos (IndexScanDesc scan)
 
void btrestrpos (IndexScanDesc scan)
 
Size btestimateparallelscan (Relation rel, int nkeys, int norderbys)
 
void btinitparallelscan (void *target)
 
void btparallelrescan (IndexScanDesc scan)
 
bool _bt_parallel_seize (IndexScanDesc scan, BlockNumber *next_scan_page, BlockNumber *last_curr_page, bool first)
 
void _bt_parallel_release (IndexScanDesc scan, BlockNumber next_scan_page, BlockNumber curr_page)
 
void _bt_parallel_done (IndexScanDesc scan)
 
void _bt_parallel_primscan_schedule (IndexScanDesc scan, BlockNumber curr_page)
 
IndexBulkDeleteResult * btbulkdelete (IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
 
IndexBulkDeleteResult * btvacuumcleanup (IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
bool btcanreturn (Relation index, int attno)
 
int btgettreeheight (Relation rel)
 
CompareType bttranslatestrategy (StrategyNumber strategy, Oid opfamily)
 
StrategyNumber bttranslatecmptype (CompareType cmptype, Oid opfamily)
 

Typedef Documentation

◆ BTParallelScanDesc

Definition at line 93 of file nbtree.c.

◆ BTParallelScanDescData

Enumeration Type Documentation

◆ BTPS_State

enum BTPS_State
Enumerator
BTPARALLEL_NOT_INITIALIZED 
BTPARALLEL_NEED_PRIMSCAN 
BTPARALLEL_ADVANCING 
BTPARALLEL_IDLE 
BTPARALLEL_DONE 

Definition at line 54 of file nbtree.c.

typedef enum
{
    BTPARALLEL_NOT_INITIALIZED,
    BTPARALLEL_NEED_PRIMSCAN,
    BTPARALLEL_ADVANCING,
    BTPARALLEL_IDLE,
    BTPARALLEL_DONE,
} BTPS_State;
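
These values implement the state machine that coordinates a parallel B-tree scan. BTPARALLEL_NOT_INITIALIZED means the scan has not yet started; BTPARALLEL_NEED_PRIMSCAN means some backend must seize the scan and start another primitive index scan via _bt_first; BTPARALLEL_ADVANCING means a backend is currently advancing the scan to a new page, so others must wait; BTPARALLEL_IDLE means any backend may seize the scan; BTPARALLEL_DONE means the scan is complete. A minimal sketch of the wait-then-seize pattern, modeled on _bt_parallel_seize() below (illustrative only, not code from this file):

/* Sketch: seize the scan, waiting out any backend that is advancing it */
LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
while (btscan->btps_pageStatus == BTPARALLEL_ADVANCING)
{
    /* another backend is turning the page; sleep until it signals us */
    LWLockRelease(&btscan->btps_lock);
    ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
    LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
}
if (btscan->btps_pageStatus == BTPARALLEL_IDLE)
    btscan->btps_pageStatus = BTPARALLEL_ADVANCING; /* scan is now ours */
LWLockRelease(&btscan->btps_lock);
ConditionVariableCancelSleep();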

Function Documentation

◆ _bt_parallel_done()

void _bt_parallel_done ( IndexScanDesc  scan)

Definition at line 1041 of file nbtree.c.

{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
    BTParallelScanDesc btscan;
    bool        status_changed = false;

    Assert(!BTScanPosIsValid(so->currPos));

    /* Do nothing, for non-parallel scans */
    if (parallel_scan == NULL)
        return;

    /*
     * Should not mark parallel scan done when there's still a pending
     * primitive index scan
     */
    if (so->needPrimScan)
        return;

    btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
                                                  parallel_scan->ps_offset_am);

    /*
     * Mark the parallel scan as done, unless some other process did so
     * already
     */
    LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
    Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN);
    if (btscan->btps_pageStatus != BTPARALLEL_DONE)
    {
        btscan->btps_pageStatus = BTPARALLEL_DONE;
        status_changed = true;
    }
    LWLockRelease(&btscan->btps_lock);

    /* wake up all the workers associated with this parallel scan */
    if (status_changed)
        ConditionVariableBroadcast(&btscan->btps_cv);
}

References Assert(), BTPARALLEL_DONE, BTPARALLEL_NEED_PRIMSCAN, BTScanPosIsValid, ConditionVariableBroadcast(), BTScanOpaqueData::currPos, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTScanOpaqueData::needPrimScan, OffsetToPointer, IndexScanDescData::opaque, and IndexScanDescData::parallel_scan.

Referenced by _bt_endpoint(), _bt_first(), _bt_parallel_seize(), _bt_readnextpage(), and _bt_start_prim_scan().

◆ _bt_parallel_primscan_schedule()

void _bt_parallel_primscan_schedule ( IndexScanDesc  scan,
BlockNumber  curr_page 
)

Definition at line 1091 of file nbtree.c.

{
    Relation    rel = scan->indexRelation;
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
    BTParallelScanDesc btscan;

    Assert(so->numArrayKeys);

    btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
                                                  parallel_scan->ps_offset_am);

    LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
    if (btscan->btps_lastCurrPage == curr_page &&
        btscan->btps_pageStatus == BTPARALLEL_IDLE)
    {
        btscan->btps_nextScanPage = InvalidBlockNumber;
        btscan->btps_lastCurrPage = InvalidBlockNumber;
        btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;

        /* Serialize scan's current array keys */
        _bt_parallel_serialize_arrays(rel, btscan, so);
    }
    LWLockRelease(&btscan->btps_lock);
}

References _bt_parallel_serialize_arrays(), Assert(), BTPARALLEL_IDLE, BTPARALLEL_NEED_PRIMSCAN, IndexScanDescData::indexRelation, InvalidBlockNumber, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTScanOpaqueData::numArrayKeys, OffsetToPointer, IndexScanDescData::opaque, and IndexScanDescData::parallel_scan.

Referenced by _bt_advance_array_keys(), and _bt_readpage().

◆ _bt_parallel_release()

void _bt_parallel_release ( IndexScanDesc  scan,
BlockNumber  next_scan_page,
BlockNumber  curr_page 
)

Definition at line 1014 of file nbtree.c.

{
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
    BTParallelScanDesc btscan;

    Assert(BlockNumberIsValid(next_scan_page));

    btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
                                                  parallel_scan->ps_offset_am);

    LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
    btscan->btps_nextScanPage = next_scan_page;
    btscan->btps_lastCurrPage = curr_page;
    btscan->btps_pageStatus = BTPARALLEL_IDLE;
    LWLockRelease(&btscan->btps_lock);
    ConditionVariableSignal(&btscan->btps_cv);
}

References Assert(), BlockNumberIsValid(), BTPARALLEL_IDLE, BTParallelScanDescData::btps_cv, BTParallelScanDescData::btps_lastCurrPage, BTParallelScanDescData::btps_lock, BTParallelScanDescData::btps_nextScanPage, BTParallelScanDescData::btps_pageStatus, ConditionVariableSignal(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), OffsetToPointer, IndexScanDescData::parallel_scan, and ParallelIndexScanDescData::ps_offset_am.

Referenced by _bt_readnextpage(), and _bt_readpage().

◆ _bt_parallel_restore_arrays()

static void _bt_parallel_restore_arrays ( Relation  rel,
BTParallelScanDesc  btscan,
BTScanOpaque  so 
)
static

Definition at line 766 of file nbtree.c.

{
    char       *datumshared;

    /* Space for serialized datums begins immediately after btps_arrElems[] */
    datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
    for (int i = 0; i < so->numArrayKeys; i++)
    {
        BTArrayKeyInfo *array = &so->arrayKeys[i];
        ScanKey     skey = &so->keyData[array->scan_key];
        bool        isnull;

        if (array->num_elems != -1)
        {
            /* Restore SAOP array using its saved cur_elem */
            Assert(!(skey->sk_flags & SK_BT_SKIP));
            array->cur_elem = btscan->btps_arrElems[i];
            skey->sk_argument = array->elem_values[array->cur_elem];
            continue;
        }

        /* Restore skip array by restoring its key directly */
        if (!array->attbyval && skey->sk_argument)
            pfree(DatumGetPointer(skey->sk_argument));
        skey->sk_argument = (Datum) 0;
        memcpy(&skey->sk_flags, datumshared, sizeof(int));
        datumshared += sizeof(int);

        Assert(skey->sk_flags & SK_BT_SKIP);

        if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
        {
            /* No sk_argument datum to restore */
            continue;
        }

        skey->sk_argument = datumRestore(&datumshared, &isnull);
        if (isnull)
        {
            Assert(skey->sk_argument == 0);
            Assert(skey->sk_flags & SK_SEARCHNULL);
            Assert(skey->sk_flags & SK_ISNULL);
        }
    }
}

References BTScanOpaqueData::arrayKeys, Assert(), BTArrayKeyInfo::attbyval, BTParallelScanDescData::btps_arrElems, BTArrayKeyInfo::cur_elem, DatumGetPointer(), datumRestore(), BTArrayKeyInfo::elem_values, i, BTScanOpaqueData::keyData, BTArrayKeyInfo::num_elems, BTScanOpaqueData::numArrayKeys, pfree(), BTArrayKeyInfo::scan_key, ScanKeyData::sk_argument, SK_BT_MAXVAL, SK_BT_MINVAL, SK_BT_SKIP, ScanKeyData::sk_flags, SK_ISNULL, and SK_SEARCHNULL.

Referenced by _bt_parallel_seize().

◆ _bt_parallel_seize()

bool _bt_parallel_seize ( IndexScanDesc  scan,
BlockNumber *  next_scan_page,
BlockNumber *  last_curr_page,
bool  first 
)

Definition at line 876 of file nbtree.c.

{
    Relation    rel = scan->indexRelation;
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    bool        exit_loop = false,
                status = true,
                endscan = false;
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
    BTParallelScanDesc btscan;

    *next_scan_page = InvalidBlockNumber;
    *last_curr_page = InvalidBlockNumber;

    /*
     * Reset so->currPos, and initialize moreLeft/moreRight such that the next
     * call to _bt_readnextpage treats this backend similarly to a serial
     * backend that steps from *last_curr_page to *next_scan_page (unless this
     * backend's so->currPos is initialized by _bt_readfirstpage before then).
     */
    BTScanPosInvalidate(so->currPos);
    so->currPos.moreLeft = so->currPos.moreRight = true;

    if (first)
    {
        /*
         * Initialize array related state when called from _bt_first, assuming
         * that this will be the first primitive index scan for the scan
         */
        so->needPrimScan = false;
        so->scanBehind = false;
        so->oppositeDirCheck = false;
    }
    else
    {
        /*
         * Don't attempt to seize the scan when it requires another primitive
         * index scan, since caller's backend cannot start it right now
         */
        if (so->needPrimScan)
            return false;
    }

    btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
                                                  parallel_scan->ps_offset_am);

    while (1)
    {
        LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);

        if (btscan->btps_pageStatus == BTPARALLEL_DONE)
        {
            /* We're done with this parallel index scan */
            status = false;
        }
        else if (btscan->btps_pageStatus == BTPARALLEL_IDLE &&
                 btscan->btps_nextScanPage == P_NONE)
        {
            /* End this parallel index scan */
            status = false;
            endscan = true;
        }
        else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
        {
            Assert(so->numArrayKeys);

            if (first)
            {
                /* Can start scheduled primitive scan right away, so do so */
                btscan->btps_pageStatus = BTPARALLEL_ADVANCING;

                /* Restore scan's array keys from serialized values */
                _bt_parallel_restore_arrays(rel, btscan, so);
                exit_loop = true;
            }
            else
            {
                /*
                 * Don't attempt to seize the scan when it requires another
                 * primitive index scan, since caller's backend cannot start
                 * it right now
                 */
                status = false;
            }

            /*
             * Either way, update backend local state to indicate that a
             * pending primitive scan is required
             */
            so->needPrimScan = true;
            so->scanBehind = false;
            so->oppositeDirCheck = false;
        }
        else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
        {
            /*
             * We have successfully seized control of the scan for the purpose
             * of advancing it to a new page!
             */
            btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
            Assert(btscan->btps_nextScanPage != P_NONE);
            *next_scan_page = btscan->btps_nextScanPage;
            *last_curr_page = btscan->btps_lastCurrPage;
            exit_loop = true;
        }
        LWLockRelease(&btscan->btps_lock);
        if (exit_loop || !status)
            break;
        ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
    }
    ConditionVariableCancelSleep();

    /* When the scan has reached the rightmost (or leftmost) page, end it */
    if (endscan)
        _bt_parallel_done(scan);

    return status;
}

References _bt_parallel_done(), _bt_parallel_restore_arrays(), Assert(), BTPARALLEL_ADVANCING, BTPARALLEL_DONE, BTPARALLEL_IDLE, BTPARALLEL_NEED_PRIMSCAN, BTScanPosInvalidate, ConditionVariableCancelSleep(), ConditionVariableSleep(), BTScanOpaqueData::currPos, IndexScanDescData::indexRelation, InvalidBlockNumber, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, OffsetToPointer, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, P_NONE, IndexScanDescData::parallel_scan, and BTScanOpaqueData::scanBehind.

Referenced by _bt_first(), and _bt_readnextpage().

◆ _bt_parallel_serialize_arrays()

static void _bt_parallel_serialize_arrays ( Relation  rel,
BTParallelScanDesc  btscan,
BTScanOpaque  so 
)
static

Definition at line 723 of file nbtree.c.

{
    char       *datumshared;

    /* Space for serialized datums begins immediately after btps_arrElems[] */
    datumshared = ((char *) &btscan->btps_arrElems[so->numArrayKeys]);
    for (int i = 0; i < so->numArrayKeys; i++)
    {
        BTArrayKeyInfo *array = &so->arrayKeys[i];
        ScanKey     skey = &so->keyData[array->scan_key];

        if (array->num_elems != -1)
        {
            /* Save SAOP array's cur_elem (no need to copy key/datum) */
            Assert(!(skey->sk_flags & SK_BT_SKIP));
            btscan->btps_arrElems[i] = array->cur_elem;
            continue;
        }

        /* Save all mutable state associated with skip array's key */
        Assert(skey->sk_flags & SK_BT_SKIP);
        memcpy(datumshared, &skey->sk_flags, sizeof(int));
        datumshared += sizeof(int);

        if (skey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))
        {
            /* No sk_argument datum to serialize */
            Assert(skey->sk_argument == 0);
            continue;
        }

        datumSerialize(skey->sk_argument, (skey->sk_flags & SK_ISNULL) != 0,
                       array->attbyval, array->attlen, &datumshared);
    }
}

References BTScanOpaqueData::arrayKeys, Assert(), BTArrayKeyInfo::attbyval, BTArrayKeyInfo::attlen, BTParallelScanDescData::btps_arrElems, BTArrayKeyInfo::cur_elem, datumSerialize(), i, BTScanOpaqueData::keyData, BTArrayKeyInfo::num_elems, BTScanOpaqueData::numArrayKeys, BTArrayKeyInfo::scan_key, ScanKeyData::sk_argument, SK_BT_MAXVAL, SK_BT_MINVAL, SK_BT_SKIP, ScanKeyData::sk_flags, and SK_ISNULL.

Referenced by _bt_parallel_primscan_schedule().
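
Both routines agree on a single shared-memory layout: one saved int per array key in btps_arrElems[], followed immediately by a variable-length stream holding each skip array's sk_flags word and, when one exists, its serialized sk_argument datum. An illustrative sketch of that layout (a reading of the two functions above, not a diagram from the source):

/*
 * btps_arrElems[0 .. numArrayKeys-1]   saved cur_elem (SAOP arrays only)
 * ...followed, once per skip array and in scan-key order, by:
 *     int sk_flags                     copied with memcpy()
 *     serialized datum                 written by datumSerialize(); omitted
 *                                      entirely for SK_BT_MINVAL/SK_BT_MAXVAL
 */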

◆ _bt_start_prim_scan()

static bool _bt_start_prim_scan ( IndexScanDesc  scan)
static

Definition at line 659 of file nbtree.c.

{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    Assert(so->numArrayKeys);

    so->scanBehind = so->oppositeDirCheck = false;  /* reset */

    /*
     * Array keys are advanced within _bt_checkkeys when the scan reaches the
     * leaf level (more precisely, they're advanced when the scan reaches the
     * end of each distinct set of array elements). This process avoids
     * repeat access to leaf pages (across multiple primitive index scans) by
     * advancing the scan's array keys when it allows the primitive index scan
     * to find nearby matching tuples (or when it eliminates ranges of array
     * key space that can't possibly be satisfied by any index tuple).
     *
     * _bt_checkkeys sets a simple flag variable to schedule another primitive
     * index scan. The flag tells us what to do.
     *
     * We cannot rely on _bt_first always reaching _bt_checkkeys. There are
     * various cases where that won't happen. For example, if the index is
     * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys.
     * We also don't expect a call to _bt_checkkeys during searches for a
     * non-existent value that happens to be lower/higher than any existing
     * value in the index.
     *
     * We don't require special handling for these cases -- we don't need to
     * be explicitly instructed to _not_ perform another primitive index scan.
     * It's up to code under the control of _bt_first to always set the flag
     * when another primitive index scan will be required.
     *
     * This works correctly, even with the tricky cases listed above, which
     * all involve access to leaf pages "near the boundaries of the key space"
     * (whether it's from a leftmost/rightmost page, or an imaginary empty
     * leaf root page). If _bt_checkkeys cannot be reached by a primitive
     * index scan for one set of array keys, then it also won't be reached for
     * any later set ("later" in terms of the direction that we scan the index
     * and advance the arrays). The array keys won't have advanced in these
     * cases, but that's the correct behavior (even _bt_advance_array_keys
     * won't always advance the arrays at the point they become "exhausted").
     */
    if (so->needPrimScan)
    {
        /*
         * Flag was set -- must call _bt_first again, which will reset the
         * scan's needPrimScan flag
         */
        return true;
    }

    /* The top-level index scan ran out of tuples in this scan direction */
    if (scan->parallel_scan != NULL)
        _bt_parallel_done(scan);

    return false;
}

References _bt_parallel_done(), Assert(), BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, IndexScanDescData::parallel_scan, and BTScanOpaqueData::scanBehind.

Referenced by btgetbitmap(), and btgettuple().

◆ btbeginscan()

IndexScanDesc btbeginscan ( Relation  rel,
int  nkeys,
int  norderbys 
)

Definition at line 337 of file nbtree.c.

{
    IndexScanDesc scan;
    BTScanOpaque so;

    /* no order by operators allowed */
    Assert(norderbys == 0);

    /* get the scan */
    scan = RelationGetIndexScan(rel, nkeys, norderbys);

    /* allocate private workspace */
    so = palloc_object(BTScanOpaqueData);
    BTScanPosInvalidate(so->currPos);
    BTScanPosInvalidate(so->markPos);
    if (scan->numberOfKeys > 0)
        so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
    else
        so->keyData = NULL;

    so->skipScan = false;
    so->needPrimScan = false;
    so->scanBehind = false;
    so->oppositeDirCheck = false;
    so->arrayKeys = NULL;
    so->orderProcs = NULL;
    so->arrayContext = NULL;

    so->killedItems = NULL;     /* until needed */
    so->numKilled = 0;

    /*
     * We don't know yet whether the scan will be index-only, so we do not
     * allocate the tuple workspace arrays until btrescan. However, we set up
     * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
     */
    so->currTuples = so->markTuples = NULL;

    scan->xs_itupdesc = RelationGetDescr(rel);

    scan->opaque = so;

    return scan;
}

References BTScanOpaqueData::arrayContext, BTScanOpaqueData::arrayKeys, Assert(), BTScanPosInvalidate, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanOpaqueData::keyData, BTScanOpaqueData::killedItems, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanOpaqueData::needPrimScan, IndexScanDescData::numberOfKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, BTScanOpaqueData::orderProcs, palloc(), palloc_object, RelationGetDescr, RelationGetIndexScan(), BTScanOpaqueData::scanBehind, BTScanOpaqueData::skipScan, and IndexScanDescData::xs_itupdesc.

Referenced by bthandler().
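
Callers never invoke these entry points directly; indexam.c reaches them through the IndexAmRoutine and always pairs ambeginscan with an amrescan call before the first fetch. A rough sketch of that calling sequence using the functions on this page (indexRel and process_tid() are hypothetical, and real callers go through index_beginscan()/index_rescan()/index_getnext_tid() instead):

/* sketch only: how a caller sequences the scan callbacks */
ScanKeyData skey;

ScanKeyInit(&skey, 1, BTEqualStrategyNumber, F_INT4EQ, Int32GetDatum(42));

IndexScanDesc scan = btbeginscan(indexRel, 1, 0);   /* one key, no ORDER BY */

btrescan(scan, &skey, 1, NULL, 0);                  /* install the scan key */
while (btgettuple(scan, ForwardScanDirection))
    process_tid(&scan->xs_heaptid);                 /* hypothetical consumer */
btendscan(scan);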

◆ btbuildempty()

void btbuildempty ( Relation  index)

Definition at line 181 of file nbtree.c.

{
    bool        allequalimage = _bt_allequalimage(index, false);
    BulkWriteState *bulkstate;
    BulkWriteBuffer metabuf;

    bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);

    /* Construct metapage. */
    metabuf = smgr_bulk_get_buf(bulkstate);
    _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
    smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);

    smgr_bulk_finish(bulkstate);
}

References _bt_allequalimage(), _bt_initmetapage(), BTREE_METAPAGE, INIT_FORKNUM, P_NONE, smgr_bulk_finish(), smgr_bulk_get_buf(), smgr_bulk_start_rel(), and smgr_bulk_write().

Referenced by bthandler().

◆ btbulkdelete()

IndexBulkDeleteResult * btbulkdelete ( IndexVacuumInfo *  info,
IndexBulkDeleteResult *  stats,
IndexBulkDeleteCallback  callback,
void *  callback_state 
)

Definition at line 1125 of file nbtree.c.

{
    Relation    rel = info->index;
    BTCycleId   cycleid;

    /* allocate stats if first time through, else re-use existing struct */
    if (stats == NULL)
        stats = palloc0_object(IndexBulkDeleteResult);

    /* Establish the vacuum cycle ID to use for this scan */
    /* The ENSURE stuff ensures we clean up shared memory on failure */
    PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    {
        cycleid = _bt_start_vacuum(rel);

        btvacuumscan(info, stats, callback, callback_state, cycleid);
    }
    PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    _bt_end_vacuum(rel);

    return stats;
}

References _bt_end_vacuum(), _bt_end_vacuum_callback(), _bt_start_vacuum(), btvacuumscan(), callback(), IndexVacuumInfo::index, palloc0_object, PG_END_ENSURE_ERROR_CLEANUP, PG_ENSURE_ERROR_CLEANUP, and PointerGetDatum().

Referenced by bthandler().

◆ btcanreturn()

bool btcanreturn ( Relation  index,
int  attno 
)

Definition at line 1805 of file nbtree.c.

{
    return true;
}

Referenced by bthandler().

◆ btendscan()

void btendscan ( IndexScanDesc  scan)

Definition at line 461 of file nbtree.c.

{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* we aren't holding any read locks, but gotta drop the pins */
    if (BTScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _bt_killitems(scan);
        BTScanPosUnpinIfPinned(so->currPos);
    }

    so->markItemIndex = -1;
    BTScanPosUnpinIfPinned(so->markPos);

    /* No need to invalidate positions, the RAM is about to be freed. */

    /* Release storage */
    if (so->keyData != NULL)
        pfree(so->keyData);
    /* so->arrayKeys and so->orderProcs are in arrayContext */
    if (so->arrayContext != NULL)
        MemoryContextDelete(so->arrayContext);
    if (so->killedItems != NULL)
        pfree(so->killedItems);
    if (so->currTuples != NULL)
        pfree(so->currTuples);
    /* so->markTuples should not be pfree'd, see btrescan */
    pfree(so);
}

References _bt_killitems(), BTScanOpaqueData::arrayContext, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, if(), BTScanOpaqueData::keyData, BTScanOpaqueData::killedItems, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, MemoryContextDelete(), BTScanOpaqueData::numKilled, IndexScanDescData::opaque, and pfree().

Referenced by bthandler().

◆ btestimateparallelscan()

Size btestimateparallelscan ( Relation  rel,
int  nkeys,
int  norderbys 
)

Definition at line 581 of file nbtree.c.

{
    int16       nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
    Size        estnbtreeshared,
                genericattrspace;

    /*
     * Pessimistically assume that every input scan key will be output with
     * its own SAOP array
     */
    estnbtreeshared = offsetof(BTParallelScanDescData, btps_arrElems) +
        sizeof(int) * nkeys;

    /* Single column indexes cannot possibly use a skip array */
    if (nkeyatts == 1)
        return estnbtreeshared;

    /*
     * Pessimistically assume that all attributes prior to the least
     * significant attribute require a skip array (and an associated key)
     */
    genericattrspace = datumEstimateSpace((Datum) 0, false, true,
                                          sizeof(Datum));
    for (int attnum = 1; attnum < nkeyatts; attnum++)
    {
        CompactAttribute *attr;

        /*
         * We make the conservative assumption that every index column will
         * also require a skip array.
         *
         * Every skip array must have space to store its scan key's sk_flags.
         */
        estnbtreeshared = add_size(estnbtreeshared, sizeof(int));

        /* Consider space required to store a datum of opclass input type */
        attr = TupleDescCompactAttr(rel->rd_att, attnum - 1);
        if (attr->attbyval)
        {
            /* This index attribute stores pass-by-value datums */
            Size        estfixed = datumEstimateSpace((Datum) 0, false,
                                                      true, attr->attlen);

            estnbtreeshared = add_size(estnbtreeshared, estfixed);
            continue;
        }

        /*
         * This index attribute stores pass-by-reference datums.
         *
         * Assume that serializing this array will use just as much space as a
         * pass-by-value datum, in addition to space for the largest possible
         * whole index tuple (this is not just a per-datum portion of the
         * largest possible tuple because that'd be almost as large anyway).
         *
         * This is quite conservative, but it's not clear how we could do much
         * better. The executor requires an up-front storage request size
         * that reliably covers the scan's high watermark memory usage. We
         * can't be sure of the real high watermark until the scan is over.
         */
        estnbtreeshared = add_size(estnbtreeshared, genericattrspace);
        estnbtreeshared = add_size(estnbtreeshared, BTMaxItemSize);
    }

    return estnbtreeshared;
}

References add_size(), CompactAttribute::attbyval, CompactAttribute::attlen, attnum, BTMaxItemSize, BTParallelScanDescData::btps_arrElems, datumEstimateSpace(), IndexRelationGetNumberOfKeyAttributes, RelationData::rd_att, and TupleDescCompactAttr().

Referenced by bthandler().
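
As a worked example of the estimate (illustrative only): for a single-column index scanned with two scan keys, the early return above yields just the fixed-size header plus two btps_arrElems slots, because a one-attribute index can never use a skip array:

/* illustrative arithmetic, mirroring the nkeyatts == 1 early-return path */
Size est = offsetof(BTParallelScanDescData, btps_arrElems)
    + sizeof(int) * 2;      /* one saved cur_elem slot per scan key */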

◆ btgetbitmap()

int64 btgetbitmap ( IndexScanDesc  scan,
TIDBitmap *  tbm 
)

Definition at line 289 of file nbtree.c.

{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    int64       ntids = 0;
    ItemPointer heapTid;

    Assert(scan->heapRelation == NULL);

    /* Each loop iteration performs another primitive index scan */
    do
    {
        /* Fetch the first page & tuple */
        if (_bt_first(scan, ForwardScanDirection))
        {
            /* Save tuple ID, and continue scanning */
            heapTid = &scan->xs_heaptid;
            tbm_add_tuples(tbm, heapTid, 1, false);
            ntids++;

            for (;;)
            {
                /*
                 * Advance to next tuple within page. This is the same as the
                 * easy case in _bt_next().
                 */
                if (++so->currPos.itemIndex > so->currPos.lastItem)
                {
                    /* let _bt_next do the heavy lifting */
                    if (!_bt_next(scan, ForwardScanDirection))
                        break;
                }

                /* Save tuple ID, and continue scanning */
                heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
                tbm_add_tuples(tbm, heapTid, 1, false);
                ntids++;
            }
        }
        /* Now see if we need another primitive index scan */
    } while (so->numArrayKeys && _bt_start_prim_scan(scan));

    return ntids;
}

References _bt_first(), _bt_next(), _bt_start_prim_scan(), Assert(), BTScanOpaqueData::currPos, ForwardScanDirection, IndexScanDescData::heapRelation, BTScanPosItem::heapTid, BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, tbm_add_tuples(), and IndexScanDescData::xs_heaptid.

Referenced by bthandler().

◆ btgettreeheight()

int btgettreeheight ( Relation  rel)

Definition at line 1814 of file nbtree.c.

{
    return _bt_getrootheight(rel);
}

References _bt_getrootheight().

Referenced by bthandler().

◆ btgettuple()

bool btgettuple ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 228 of file nbtree.c.

{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    bool        res;

    Assert(scan->heapRelation != NULL);

    /* btree indexes are never lossy */
    scan->xs_recheck = false;

    /* Each loop iteration performs another primitive index scan */
    do
    {
        /*
         * If we've already initialized this scan, we can just advance it in
         * the appropriate direction. If we haven't done so yet, we call
         * _bt_first() to get the first item in the scan.
         */
        if (!BTScanPosIsValid(so->currPos))
            res = _bt_first(scan, dir);
        else
        {
            /*
             * Check to see if we should kill the previously-fetched tuple.
             */
            if (scan->kill_prior_tuple)
            {
                /*
                 * Yes, remember it for later. (We'll deal with all such
                 * tuples at once right before leaving the index page.) The
                 * test for numKilled overrun is not just paranoia: if the
                 * caller reverses direction in the indexscan then the same
                 * item might get entered multiple times. It's not worth
                 * trying to optimize that, so we don't detect it, but instead
                 * just forget any excess entries.
                 */
                if (so->killedItems == NULL)
                    so->killedItems = palloc_array(int, MaxTIDsPerBTreePage);
                if (so->numKilled < MaxTIDsPerBTreePage)
                    so->killedItems[so->numKilled++] = so->currPos.itemIndex;
            }

            /*
             * Now continue the scan.
             */
            res = _bt_next(scan, dir);
        }

        /* If we have a tuple, return it ... */
        if (res)
            break;
        /* ... otherwise see if we need another primitive index scan */
    } while (so->numArrayKeys && _bt_start_prim_scan(scan));

    return res;
}

References _bt_first(), _bt_next(), _bt_start_prim_scan(), Assert(), BTScanPosIsValid, BTScanOpaqueData::currPos, IndexScanDescData::heapRelation, BTScanPosData::itemIndex, IndexScanDescData::kill_prior_tuple, BTScanOpaqueData::killedItems, MaxTIDsPerBTreePage, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, palloc_array, and IndexScanDescData::xs_recheck.

Referenced by bthandler().

◆ bthandler()

Datum bthandler ( PG_FUNCTION_ARGS  )

Definition at line 116 of file nbtree.c.

{
    static const IndexAmRoutine amroutine = {
        .type = T_IndexAmRoutine,
        .amstrategies = BTMaxStrategyNumber,
        .amsupport = BTNProcs,
        .amoptsprocnum = BTOPTIONS_PROC,
        .amcanorder = true,
        .amcanorderbyop = false,
        .amcanhash = false,
        .amconsistentequality = true,
        .amconsistentordering = true,
        .amcanbackward = true,
        .amcanunique = true,
        .amcanmulticol = true,
        .amoptionalkey = true,
        .amsearcharray = true,
        .amsearchnulls = true,
        .amstorage = false,
        .amclusterable = true,
        .ampredlocks = true,
        .amcanparallel = true,
        .amcanbuildparallel = true,
        .amcaninclude = true,
        .amusemaintenanceworkmem = false,
        .amsummarizing = false,
        .amparallelvacuumoptions =
            VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP,
        .amkeytype = InvalidOid,

        .ambuild = btbuild,
        .ambuildempty = btbuildempty,
        .aminsert = btinsert,
        .aminsertcleanup = NULL,
        .ambulkdelete = btbulkdelete,
        .amvacuumcleanup = btvacuumcleanup,
        .amcanreturn = btcanreturn,
        .amcostestimate = btcostestimate,
        .amgettreeheight = btgettreeheight,
        .amoptions = btoptions,
        .amproperty = btproperty,
        .ambuildphasename = btbuildphasename,
        .amvalidate = btvalidate,
        .amadjustmembers = btadjustmembers,
        .ambeginscan = btbeginscan,
        .amrescan = btrescan,
        .amgettuple = btgettuple,
        .amgetbitmap = btgetbitmap,
        .amendscan = btendscan,
        .ammarkpos = btmarkpos,
        .amrestrpos = btrestrpos,
        .amestimateparallelscan = btestimateparallelscan,
        .aminitparallelscan = btinitparallelscan,
        .amparallelrescan = btparallelrescan,
        .amtranslatestrategy = bttranslatestrategy,
        .amtranslatecmptype = bttranslatecmptype,
    };

    PG_RETURN_POINTER(&amroutine);
}

References btadjustmembers(), btbeginscan(), btbuild(), btbuildempty(), btbuildphasename(), btbulkdelete(), btcanreturn(), btcostestimate(), btendscan(), btestimateparallelscan(), btgetbitmap(), btgettreeheight(), btgettuple(), btinitparallelscan(), btinsert(), btmarkpos(), BTMaxStrategyNumber, BTNProcs, btoptions(), BTOPTIONS_PROC, btparallelrescan(), btproperty(), btrescan(), btrestrpos(), bttranslatecmptype(), bttranslatestrategy(), btvacuumcleanup(), btvalidate(), InvalidOid, PG_RETURN_POINTER, IndexAmRoutine::type, VACUUM_OPTION_PARALLEL_BULKDEL, and VACUUM_OPTION_PARALLEL_COND_CLEANUP.
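
This handler is what plugs nbtree into the generic index AM machinery: pg_am records bthandler as the btree access method's handler, and core code calls it through GetIndexAmRoutine() in amapi.c whenever it needs the callback table. A minimal sketch of that lookup (amhandler stands in for a pg_am.amhandler OID, not a value from this file):

/* sketch only: resolve an AM handler OID to its callback table */
IndexAmRoutine *amroutine = GetIndexAmRoutine(amhandler);

Assert(amroutine->amgettuple == btgettuple);    /* holds for btree */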

◆ btinitparallelscan()

void btinitparallelscan ( void *  target)

Definition at line 817 of file nbtree.c.

{
    BTParallelScanDesc bt_target = (BTParallelScanDesc) target;

    LWLockInitialize(&bt_target->btps_lock,
                     LWTRANCHE_PARALLEL_BTREE_SCAN);
    bt_target->btps_nextScanPage = InvalidBlockNumber;
    bt_target->btps_lastCurrPage = InvalidBlockNumber;
    bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
    ConditionVariableInit(&bt_target->btps_cv);
}

References BTPARALLEL_NOT_INITIALIZED, BTParallelScanDescData::btps_cv, BTParallelScanDescData::btps_lastCurrPage, BTParallelScanDescData::btps_lock, BTParallelScanDescData::btps_nextScanPage, BTParallelScanDescData::btps_pageStatus, ConditionVariableInit(), InvalidBlockNumber, and LWLockInitialize().

Referenced by bthandler().

◆ btinsert()

bool btinsert ( Relation  rel,
Datum *  values,
bool *  isnull,
ItemPointer  ht_ctid,
Relation  heapRel,
IndexUniqueCheck  checkUnique,
bool  indexUnchanged,
IndexInfo indexInfo 
)

Definition at line 204 of file nbtree.c.

{
    bool        result;
    IndexTuple  itup;

    /* generate an index tuple */
    itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
    itup->t_tid = *ht_ctid;

    result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);

    pfree(itup);

    return result;
}

References _bt_doinsert(), index_form_tuple(), pfree(), RelationGetDescr, IndexTupleData::t_tid, and values.

Referenced by bthandler().

◆ btmarkpos()

void btmarkpos ( IndexScanDesc  scan)

Definition at line 497 of file nbtree.c.

{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* There may be an old mark with a pin (but no lock). */
    BTScanPosUnpinIfPinned(so->markPos);

    /*
     * Just record the current itemIndex. If we later step to next page
     * before releasing the marked position, _bt_steppage makes a full copy of
     * the currPos struct in markPos. If (as often happens) the mark is moved
     * before we leave the page, we don't have to do that work.
     */
    if (BTScanPosIsValid(so->currPos))
        so->markItemIndex = so->currPos.itemIndex;
    else
    {
        BTScanPosInvalidate(so->markPos);
        so->markItemIndex = -1;
    }
}

References BTScanPosInvalidate, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanOpaqueData::currPos, BTScanPosData::itemIndex, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, and IndexScanDescData::opaque.

Referenced by bthandler().

◆ btparallelrescan()

void btparallelrescan ( IndexScanDesc  scan)

Definition at line 833 of file nbtree.c.

{
    BTParallelScanDesc btscan;
    ParallelIndexScanDesc parallel_scan = scan->parallel_scan;

    Assert(parallel_scan);

    btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan,
                                                  parallel_scan->ps_offset_am);

    /*
     * In theory, we don't need to acquire the LWLock here, because there
     * shouldn't be any other workers running at this point, but we do so for
     * consistency.
     */
    LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE);
    btscan->btps_nextScanPage = InvalidBlockNumber;
    btscan->btps_lastCurrPage = InvalidBlockNumber;
    btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
    LWLockRelease(&btscan->btps_lock);
}

References Assert(), BTPARALLEL_NOT_INITIALIZED, BTParallelScanDescData::btps_lastCurrPage, BTParallelScanDescData::btps_lock, BTParallelScanDescData::btps_nextScanPage, BTParallelScanDescData::btps_pageStatus, InvalidBlockNumber, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), OffsetToPointer, IndexScanDescData::parallel_scan, and ParallelIndexScanDescData::ps_offset_am.

Referenced by bthandler().

◆ btreevacuumposting()

static BTVacuumPosting btreevacuumposting ( BTVacState vstate,
IndexTuple  posting,
OffsetNumber  updatedoffset,
int *  nremaining 
)
static

Definition at line 1756 of file nbtree.c.

{
    int         live = 0;
    int         nitem = BTreeTupleGetNPosting(posting);
    ItemPointer items = BTreeTupleGetPosting(posting);
    BTVacuumPosting vacposting = NULL;

    for (int i = 0; i < nitem; i++)
    {
        if (!vstate->callback(items + i, vstate->callback_state))
        {
            /* Live table TID */
            live++;
        }
        else if (vacposting == NULL)
        {
            /*
             * First dead table TID encountered.
             *
             * It's now clear that we need to delete one or more dead table
             * TIDs, so start maintaining metadata describing how to update
             * existing posting list tuple.
             */
            vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
                                nitem * sizeof(uint16));

            vacposting->itup = posting;
            vacposting->updatedoffset = updatedoffset;
            vacposting->ndeletedtids = 0;
            vacposting->deletetids[vacposting->ndeletedtids++] = i;
        }
        else
        {
            /* Second or subsequent dead table TID */
            vacposting->deletetids[vacposting->ndeletedtids++] = i;
        }
    }

    *nremaining = live;
    return vacposting;
}

References BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTVacState::callback, BTVacState::callback_state, BTVacuumPostingData::deletetids, i, items, BTVacuumPostingData::itup, BTVacuumPostingData::ndeletedtids, palloc(), and BTVacuumPostingData::updatedoffset.

Referenced by btvacuumpage().

◆ btrescan()

void btrescan ( IndexScanDesc  scan,
ScanKey  scankey,
int  nscankeys,
ScanKey  orderbys,
int  norderbys 
)

Definition at line 386 of file nbtree.c.

{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* we aren't holding any read locks, but gotta drop the pins */
    if (BTScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _bt_killitems(scan);
        BTScanPosUnpinIfPinned(so->currPos);
        BTScanPosInvalidate(so->currPos);
    }

    /*
     * We prefer to eagerly drop leaf page pins before btgettuple returns.
     * This avoids making VACUUM wait to acquire a cleanup lock on the page.
     *
     * We cannot safely drop leaf page pins during index-only scans due to a
     * race condition involving VACUUM setting pages all-visible in the VM.
     * It's also unsafe for plain index scans that use a non-MVCC snapshot.
     *
     * When we drop pins eagerly, the mechanism that marks so->killedItems[]
     * index tuples LP_DEAD has to deal with concurrent TID recycling races.
     * The scheme used to detect unsafe TID recycling won't work when scanning
     * unlogged relations (since it involves saving an affected page's LSN).
     * Opt out of eager pin dropping during unlogged relation scans for now
     * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting).
     *
     * Also opt out of dropping leaf page pins eagerly during bitmap scans.
     * Pins cannot be held for more than an instant during bitmap scans either
     * way, so we might as well avoid wasting cycles on acquiring page LSNs.
     *
     * See nbtree/README section on making concurrent TID recycling safe.
     *
     * Note: so->dropPin should never change across rescans.
     */
    so->dropPin = (!scan->xs_want_itup &&
                   IsMVCCSnapshot(scan->xs_snapshot) &&
                   RelationNeedsWAL(scan->indexRelation) &&
                   scan->heapRelation != NULL);

    so->markItemIndex = -1;
    so->needPrimScan = false;
    so->scanBehind = false;
    so->oppositeDirCheck = false;
    BTScanPosInvalidate(so->currPos);
    BTScanPosInvalidate(so->markPos);

    /*
     * Allocate tuple workspace arrays, if needed for an index-only scan and
     * not already done in a previous rescan call. To save on palloc
     * overhead, both workspaces are allocated as one palloc block; only this
     * function and btendscan know that.
     */
    if (scan->xs_want_itup && so->currTuples == NULL)
    {
        so->currTuples = (char *) palloc(BLCKSZ * 2);
        so->markTuples = so->currTuples + BLCKSZ;
    }

    /*
     * Reset the scan keys
     */
    if (scankey && scan->numberOfKeys > 0)
        memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
    so->numberOfKeys = 0;       /* until _bt_preprocess_keys sets it */
    so->numArrayKeys = 0;       /* ditto */
}

References _bt_killitems(), BTScanPosInvalidate, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanOpaqueData::dropPin, IndexScanDescData::heapRelation, if(), IndexScanDescData::indexRelation, IsMVCCSnapshot, IndexScanDescData::keyData, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanOpaqueData::needPrimScan, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numberOfKeys, IndexScanDescData::numberOfKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, BTScanOpaqueData::oppositeDirCheck, palloc(), RelationNeedsWAL, BTScanOpaqueData::scanBehind, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by bthandler().

◆ btrestrpos()

void btrestrpos ( IndexScanDesc  scan)

Definition at line 523 of file nbtree.c.

{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    if (so->markItemIndex >= 0)
    {
        /*
         * The scan has never moved to a new page since the last mark. Just
         * restore the itemIndex.
         *
         * NB: In this case we can't count on anything in so->markPos to be
         * accurate.
         */
        so->currPos.itemIndex = so->markItemIndex;
    }
    else
    {
        /*
         * The scan moved to a new page after last mark or restore, and we are
         * now restoring to the marked page. We aren't holding any read
         * locks, but if we're still holding the pin for the current position,
         * we must drop it.
         */
        if (BTScanPosIsValid(so->currPos))
        {
            /* Before leaving current page, deal with any killed items */
            if (so->numKilled > 0)
                _bt_killitems(scan);
            BTScanPosUnpinIfPinned(so->currPos);
        }

        if (BTScanPosIsValid(so->markPos))
        {
            /* bump pin on mark buffer for assignment to current buffer */
            if (BTScanPosIsPinned(so->markPos))
                IncrBufferRefCount(so->markPos.buf);
            memcpy(&so->currPos, &so->markPos,
                   offsetof(BTScanPosData, items[1]) +
                   so->markPos.lastItem * sizeof(BTScanPosItem));
            if (so->currTuples)
                memcpy(so->currTuples, so->markTuples,
                       so->markPos.nextTupleOffset);
            /* Reset the scan's array keys (see _bt_steppage for why) */
            if (so->numArrayKeys)
            {
                _bt_start_array_keys(scan, so->currPos.dir);
                so->needPrimScan = false;
            }
        }
        else
            BTScanPosInvalidate(so->currPos);
    }
}

References _bt_killitems(), _bt_start_array_keys(), BTScanPosInvalidate, BTScanPosIsPinned, BTScanPosIsValid, BTScanPosUnpinIfPinned, BTScanPosData::buf, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::dir, if(), IncrBufferRefCount(), BTScanPosData::itemIndex, items, BTScanPosData::lastItem, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanOpaqueData::needPrimScan, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numKilled, and IndexScanDescData::opaque.

Referenced by bthandler().

◆ bttranslatecmptype()

StrategyNumber bttranslatecmptype ( CompareType  cmptype,
Oid  opfamily 
)

Definition at line 1840 of file nbtree.c.

{
    switch (cmptype)
    {
        case COMPARE_LT:
            return BTLessStrategyNumber;
        case COMPARE_LE:
            return BTLessEqualStrategyNumber;
        case COMPARE_EQ:
            return BTEqualStrategyNumber;
        case COMPARE_GE:
            return BTGreaterEqualStrategyNumber;
        case COMPARE_GT:
            return BTGreaterStrategyNumber;
        default:
            return InvalidStrategy;
    }
}

References BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, COMPARE_EQ, COMPARE_GE, COMPARE_GT, COMPARE_LE, COMPARE_LT, and InvalidStrategy.

Referenced by bthandler().

◆ bttranslatestrategy()

CompareType bttranslatestrategy ( StrategyNumber  strategy,
Oid  opfamily 
)

Definition at line 1820 of file nbtree.c.

{
    switch (strategy)
    {
        case BTLessStrategyNumber:
            return COMPARE_LT;
        case BTLessEqualStrategyNumber:
            return COMPARE_LE;
        case BTEqualStrategyNumber:
            return COMPARE_EQ;
        case BTGreaterEqualStrategyNumber:
            return COMPARE_GE;
        case BTGreaterStrategyNumber:
            return COMPARE_GT;
        default:
            return COMPARE_INVALID;
    }
}

References BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, COMPARE_EQ, COMPARE_GE, COMPARE_GT, COMPARE_INVALID, COMPARE_LE, and COMPARE_LT.

Referenced by bthandler().
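
The two translation functions are inverses over the strategies btree implements, so a round trip recovers the original strategy number; a quick illustrative check (opfamily can be any OID here, since neither function inspects it):

/* illustrative round trip between strategy numbers and CompareTypes */
StrategyNumber strat = BTLessStrategyNumber;

Assert(bttranslatecmptype(bttranslatestrategy(strat, opfamily), opfamily) == strat);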

◆ btvacuumcleanup()

IndexBulkDeleteResult * btvacuumcleanup ( IndexVacuumInfo *  info,
IndexBulkDeleteResult *  stats 
)

Definition at line 1155 of file nbtree.c.

1156{
1157 BlockNumber num_delpages;
1158
1159 /* No-op in ANALYZE ONLY mode */
1160 if (info->analyze_only)
1161 return stats;
1162
1163 /*
1164 * If btbulkdelete was called, we need not do anything (we just maintain
1165 * the information used within _bt_vacuum_needs_cleanup() by calling
1166 * _bt_set_cleanup_info() below).
1167 *
1168 * If btbulkdelete was _not_ called, then we have a choice to make: we
1169 * must decide whether or not a btvacuumscan() call is needed now (i.e.
1170 * whether the ongoing VACUUM operation can entirely avoid a physical scan
1171 * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
1172 * now.
1173 */
1174 if (stats == NULL)
1175 {
1176 /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
1177 if (!_bt_vacuum_needs_cleanup(info->index))
1178 return NULL;
1179
1180 /*
1181 * Since we aren't going to actually delete any leaf items, there's no
1182 * need to go through all the vacuum-cycle-ID pushups here.
1183 *
1184 * Posting list tuples are a source of inaccuracy for cleanup-only
1185 * scans. btvacuumscan() will assume that the number of index tuples
1186 * from each page can be used as num_index_tuples, even though
1187 * num_index_tuples is supposed to represent the number of TIDs in the
1188 * index. This naive approach can underestimate the number of tuples
1189 * in the index significantly.
1190 *
1191 * We handle the problem by making num_index_tuples an estimate in
1192 * cleanup-only case.
1193 */
1194 stats = palloc0_object(IndexBulkDeleteResult);
1195 btvacuumscan(info, stats, NULL, NULL, 0);
1196 stats->estimated_count = true;
1197 }
1198
1199 /*
1200 * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
1201 *
1202 * num_delpages is the number of deleted pages now in the index that were
1203 * not safe to place in the FSM to be recycled just yet. num_delpages is
1204 * greater than 0 only when _bt_pagedel() actually deleted pages during
1205 * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
1206 * have failed to place any newly deleted pages in the FSM just moments
1207 * ago. (Actually, there are edge cases where recycling of the current
1208 * VACUUM's newly deleted pages does not even become safe by the time the
1209 * next VACUUM comes around. See nbtree/README.)
1210 */
1211 Assert(stats->pages_deleted >= stats->pages_free);
1212 num_delpages = stats->pages_deleted - stats->pages_free;
1213 _bt_set_cleanup_info(info->index, num_delpages);
1214
1215 /*
1216 * It's quite possible for us to be fooled by concurrent page splits into
1217 * double-counting some index tuples, so disbelieve any total that exceeds
1218 * the underlying heap's count ... if we know that accurately. Otherwise
1219 * this might just make matters worse.
1220 */
1221 if (!info->estimated_count)
1222 {
1223 if (stats->num_index_tuples > info->num_heap_tuples)
1224 stats->num_index_tuples = info->num_heap_tuples;
1225 }
1226
1227 return stats;
1228}
uint32 BlockNumber
Definition: block.h:31
void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
Definition: nbtpage.c:233
bool _bt_vacuum_needs_cleanup(Relation rel)
Definition: nbtpage.c:180
BlockNumber pages_deleted
Definition: genam.h:109
BlockNumber pages_free
Definition: genam.h:110
double num_index_tuples
Definition: genam.h:106
double num_heap_tuples
Definition: genam.h:79
bool analyze_only
Definition: genam.h:75
bool estimated_count
Definition: genam.h:77

References _bt_set_cleanup_info(), _bt_vacuum_needs_cleanup(), IndexVacuumInfo::analyze_only, Assert(), btvacuumscan(), IndexVacuumInfo::estimated_count, IndexBulkDeleteResult::estimated_count, IndexVacuumInfo::index, IndexVacuumInfo::num_heap_tuples, IndexBulkDeleteResult::num_index_tuples, IndexBulkDeleteResult::pages_deleted, IndexBulkDeleteResult::pages_free, and palloc0_object.

Referenced by bthandler().
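
The stats == NULL branch above is what lets a VACUUM that removed no dead tuples skip the physical index scan entirely. A condensed sketch of the calling pattern (not the actual VACUUM driver; vacuum_one_index() is our name for it):

#include "postgres.h"
#include "access/genam.h"

static void
vacuum_one_index(IndexVacuumInfo *ivinfo)
{
    IndexBulkDeleteResult *istat = NULL;

    /* index_bulk_delete() (-> btbulkdelete) runs only when there are
     * dead TIDs to remove; otherwise istat stays NULL ... */

    /* ... while index_vacuum_cleanup() (-> btvacuumcleanup) always
     * runs; given istat == NULL it asks _bt_vacuum_needs_cleanup()
     * whether a btvacuumscan() pass is required at all. */
    istat = index_vacuum_cleanup(ivinfo, istat);

    if (istat)
        pfree(istat);
}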

◆ btvacuumpage()

static BlockNumber btvacuumpage ( BTVacState vstate,
Buffer  buf 
)
static

Definition at line 1418 of file nbtree.c.

1419{
1420 IndexVacuumInfo *info = vstate->info;
1421 IndexBulkDeleteResult *stats = vstate->stats;
1422 IndexBulkDeleteCallback callback = vstate->callback;
1423 void *callback_state = vstate->callback_state;
1424 Relation rel = info->index;
1425 Relation heaprel = info->heaprel;
1426 bool attempt_pagedel;
1427 BlockNumber blkno,
1428 backtrack_to;
1429 BlockNumber scanblkno = BufferGetBlockNumber(buf);
1430 Page page;
1431 BTPageOpaque opaque;
1432
1433 blkno = scanblkno;
1434
1435backtrack:
1436
1437 attempt_pagedel = false;
1438 backtrack_to = P_NONE;
1439
1440 _bt_lockbuf(rel, buf, BT_READ);
1441 page = BufferGetPage(buf);
1442 opaque = NULL;
1443 if (!PageIsNew(page))
1444 {
1445 _bt_checkpage(rel, buf);
1446 opaque = BTPageGetOpaque(page);
1447 }
1448
1449 Assert(blkno <= scanblkno);
1450 if (blkno != scanblkno)
1451 {
1452 /*
1453 * We're backtracking.
1454 *
1455 * We followed a right link to a sibling leaf page (a page that
1456 * happens to be from a block located before scanblkno). The only
1457 * case we want to do anything with is a live leaf page having the
1458 * current vacuum cycle ID.
1459 *
1460 * The page had better be in a state that's consistent with what we
1461 * expect. Check for conditions that imply corruption in passing. It
1462 * can't be half-dead because only an interrupted VACUUM process can
1463 * leave pages in that state, so we'd definitely have dealt with it
1464 * back when the page was the scanblkno page (half-dead pages are
1465 * always marked fully deleted by _bt_pagedel(), barring corruption).
1466 */
1467 if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
1468 {
1469 Assert(false);
1470 ereport(LOG,
1471 (errcode(ERRCODE_INDEX_CORRUPTED),
1472 errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1473 blkno, scanblkno, RelationGetRelationName(rel))));
1474 _bt_relbuf(rel, buf);
1475 return scanblkno;
1476 }
1477
1478 /*
1479 * We may have already processed the page in an earlier call, when the
1480 * page was scanblkno. This happens when the leaf page split occurred
1481 * after the scan began, but before the right sibling page became the
1482 * scanblkno.
1483 *
1484 * Page may also have been deleted by current btvacuumpage() call,
1485 * since _bt_pagedel() sometimes deletes the right sibling page of
1486 * scanblkno in passing (it does so after we decided where to
1487 * backtrack to). We don't need to process this page as a deleted
1488 * page a second time now (in fact, it would be wrong to count it as a
1489 * deleted page in the bulk delete statistics a second time).
1490 */
1491 if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1492 {
1493 /* Done with current scanblkno (and all lower split pages) */
1494 _bt_relbuf(rel, buf);
1495 return scanblkno;
1496 }
1497 }
1498
1499 if (!opaque || BTPageIsRecyclable(page, heaprel))
1500 {
1501 /* Okay to recycle this page (which could be leaf or internal) */
1502 RecordFreeIndexPage(rel, blkno);
1503 stats->pages_deleted++;
1504 stats->pages_free++;
1505 }
1506 else if (P_ISDELETED(opaque))
1507 {
1508 /*
1509 * Already deleted page (which could be leaf or internal). Can't
1510 * recycle yet.
1511 */
1512 stats->pages_deleted++;
1513 }
1514 else if (P_ISHALFDEAD(opaque))
1515 {
1516 /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1517 attempt_pagedel = true;
1518
1519 /*
1520 * _bt_pagedel() will increment both pages_newly_deleted and
1521 * pages_deleted stats in all cases (barring corruption)
1522 */
1523 }
1524 else if (P_ISLEAF(opaque))
1525 {
1526 OffsetNumber deletable[MaxIndexTuplesPerPage];
1527 int ndeletable;
1528 BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1529 int nupdatable;
1530 OffsetNumber offnum,
1531 minoff,
1532 maxoff;
1533 int nhtidsdead,
1534 nhtidslive;
1535
1536 /*
1537 * Trade in the initial read lock for a full cleanup lock on this
1538 * page. We must get such a lock on every leaf page over the course
1539 * of the vacuum scan, whether or not it actually contains any
1540 * deletable tuples --- see nbtree/README.
1541 */
1542 _bt_upgradelockbufcleanup(rel, buf);
1543
1544 /*
1545 * Check whether we need to backtrack to earlier pages. What we are
1546 * concerned about is a page split that happened since we started the
1547 * vacuum scan. If the split moved tuples on the right half of the
1548 * split (i.e. the tuples that sort high) to a block that we already
1549 * passed over, then we might have missed the tuples. We need to
1550 * backtrack now. (Must do this before possibly clearing btpo_cycleid
1551 * or deleting scanblkno page below!)
1552 */
1553 if (vstate->cycleid != 0 &&
1554 opaque->btpo_cycleid == vstate->cycleid &&
1555 !(opaque->btpo_flags & BTP_SPLIT_END) &&
1556 !P_RIGHTMOST(opaque) &&
1557 opaque->btpo_next < scanblkno)
1558 backtrack_to = opaque->btpo_next;
1559
1560 ndeletable = 0;
1561 nupdatable = 0;
1562 minoff = P_FIRSTDATAKEY(opaque);
1563 maxoff = PageGetMaxOffsetNumber(page);
1564 nhtidsdead = 0;
1565 nhtidslive = 0;
1566 if (callback)
1567 {
1568 /* btbulkdelete callback tells us what to delete (or update) */
1569 for (offnum = minoff;
1570 offnum <= maxoff;
1571 offnum = OffsetNumberNext(offnum))
1572 {
1573 IndexTuple itup;
1574
1575 itup = (IndexTuple) PageGetItem(page,
1576 PageGetItemId(page, offnum));
1577
1578 Assert(!BTreeTupleIsPivot(itup));
1579 if (!BTreeTupleIsPosting(itup))
1580 {
1581 /* Regular tuple, standard table TID representation */
1582 if (callback(&itup->t_tid, callback_state))
1583 {
1584 deletable[ndeletable++] = offnum;
1585 nhtidsdead++;
1586 }
1587 else
1588 nhtidslive++;
1589 }
1590 else
1591 {
1592 BTVacuumPosting vacposting;
1593 int nremaining;
1594
1595 /* Posting list tuple */
1596 vacposting = btreevacuumposting(vstate, itup, offnum,
1597 &nremaining);
1598 if (vacposting == NULL)
1599 {
1600 /*
1601 * All table TIDs from the posting tuple remain, so no
1602 * delete or update required
1603 */
1604 Assert(nremaining == BTreeTupleGetNPosting(itup));
1605 }
1606 else if (nremaining > 0)
1607 {
1608
1609 /*
1610 * Store metadata about posting list tuple in
1611 * updatable array for entire page. Existing tuple
1612 * will be updated during the later call to
1613 * _bt_delitems_vacuum().
1614 */
1615 Assert(nremaining < BTreeTupleGetNPosting(itup));
1616 updatable[nupdatable++] = vacposting;
1617 nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1618 }
1619 else
1620 {
1621 /*
1622 * All table TIDs from the posting list must be
1623 * deleted. We'll delete the index tuple completely
1624 * (no update required).
1625 */
1626 Assert(nremaining == 0);
1627 deletable[ndeletable++] = offnum;
1628 nhtidsdead += BTreeTupleGetNPosting(itup);
1629 pfree(vacposting);
1630 }
1631
1632 nhtidslive += nremaining;
1633 }
1634 }
1635 }
1636
1637 /*
1638 * Apply any needed deletes or updates. We issue just one
1639 * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1640 */
1641 if (ndeletable > 0 || nupdatable > 0)
1642 {
1643 Assert(nhtidsdead >= ndeletable + nupdatable);
1644 _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1645 nupdatable);
1646
1647 stats->tuples_removed += nhtidsdead;
1648 /* must recompute maxoff */
1649 maxoff = PageGetMaxOffsetNumber(page);
1650
1651 /* can't leak memory here */
1652 for (int i = 0; i < nupdatable; i++)
1653 pfree(updatable[i]);
1654 }
1655 else
1656 {
1657 /*
1658 * If the leaf page has been split during this vacuum cycle, it
1659 * seems worth expending a write to clear btpo_cycleid even if we
1660 * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1661 * takes care of this.) This ensures we won't process the page
1662 * again.
1663 *
1664 * We treat this like a hint-bit update because there's no need to
1665 * WAL-log it.
1666 */
1667 Assert(nhtidsdead == 0);
1668 if (vstate->cycleid != 0 &&
1669 opaque->btpo_cycleid == vstate->cycleid)
1670 {
1671 opaque->btpo_cycleid = 0;
1672 MarkBufferDirtyHint(buf, true);
1673 }
1674 }
1675
1676 /*
1677 * If the leaf page is now empty, try to delete it; else count the
1678 * live tuples (live table TIDs in posting lists are counted as
1679 * separate live tuples). We don't delete when backtracking, though,
1680 * since that would require teaching _bt_pagedel() about backtracking
1681 * (doesn't seem worth adding more complexity to deal with that).
1682 *
1683 * We don't count the number of live TIDs during cleanup-only calls to
1684 * btvacuumscan (i.e. when callback is not set). We count the number
1685 * of index tuples directly instead. This avoids the expense of
1686 * directly examining all of the tuples on each page. VACUUM will
1687 * treat num_index_tuples as an estimate in cleanup-only case, so it
1688 * doesn't matter that this underestimates num_index_tuples
1689 * significantly in some cases.
1690 */
1691 if (minoff > maxoff)
1692 attempt_pagedel = (blkno == scanblkno);
1693 else if (callback)
1694 stats->num_index_tuples += nhtidslive;
1695 else
1696 stats->num_index_tuples += maxoff - minoff + 1;
1697
1698 Assert(!attempt_pagedel || nhtidslive == 0);
1699 }
1700
1701 if (attempt_pagedel)
1702 {
1703 MemoryContext oldcontext;
1704
1705 /* Run pagedel in a temp context to avoid memory leakage */
1706 MemoryContextReset(vstate->pagedelcontext);
1707 oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1708
1709 /*
1710 * _bt_pagedel maintains the bulk delete stats on our behalf;
1711 * pages_newly_deleted and pages_deleted are likely to be incremented
1712 * during call
1713 */
1714 Assert(blkno == scanblkno);
1715 _bt_pagedel(rel, buf, vstate);
1716
1716
1717 MemoryContextSwitchTo(oldcontext);
1718 /* pagedel released buffer, so we shouldn't */
1719 }
1720 else
1721 _bt_relbuf(rel, buf);
1722
1723 if (backtrack_to != P_NONE)
1724 {
1725 blkno = backtrack_to;
1726
1727 /* check for vacuum delay while not holding any buffer lock */
1728 vacuum_delay_point(false);
1729
1730 /*
1731 * We can't use _bt_getbuf() here because it always applies
1732 * _bt_checkpage(), which will barf on an all-zero page. We want to
1733 * recycle all-zero pages, not fail. Also, we want to use a
1734 * nondefault buffer access strategy.
1735 */
1736 buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1737 info->strategy);
1738 goto backtrack;
1739 }
1740
1741 return scanblkno;
1742}
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:4318
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:5525
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:886
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:436
@ RBM_NORMAL
Definition: bufmgr.h:46
static bool PageIsNew(const PageData *page)
Definition: bufpage.h:233
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:243
static void * PageGetItem(PageData *page, const ItemIdData *itemId)
Definition: bufpage.h:353
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition: bufpage.h:371
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1170
int errcode(int sqlerrcode)
Definition: elog.c:863
#define LOG
Definition: elog.h:31
#define ereport(elevel,...)
Definition: elog.h:150
bool(* IndexBulkDeleteCallback)(ItemPointer itemptr, void *state)
Definition: genam.h:114
void RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
Definition: indexfsm.c:52
IndexTupleData * IndexTuple
Definition: itup.h:53
#define MaxIndexTuplesPerPage
Definition: itup.h:181
void MemoryContextReset(MemoryContext context)
Definition: mcxt.c:403
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1024
void _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
Definition: nbtpage.c:1801
void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)
Definition: nbtpage.c:1155
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:798
void _bt_upgradelockbufcleanup(Relation rel, Buffer buf)
Definition: nbtpage.c:1110
void _bt_lockbuf(Relation rel, Buffer buf, int access)
Definition: nbtpage.c:1040
static BTVacuumPosting btreevacuumposting(BTVacState *vstate, IndexTuple posting, OffsetNumber updatedoffset, int *nremaining)
Definition: nbtree.c:1756
#define P_ISHALFDEAD(opaque)
Definition: nbtree.h:225
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:481
#define P_ISLEAF(opaque)
Definition: nbtree.h:221
#define BTPageGetOpaque(page)
Definition: nbtree.h:74
#define P_ISDELETED(opaque)
Definition: nbtree.h:223
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:370
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:220
#define BT_READ
Definition: nbtree.h:730
static bool BTPageIsRecyclable(Page page, Relation heaprel)
Definition: nbtree.h:292
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:493
#define BTP_SPLIT_END
Definition: nbtree.h:82
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:124
static char buf[DEFAULT_XLOG_SEG_SIZE]
Definition: pg_test_fsync.c:71
#define RelationGetRelationName(relation)
Definition: rel.h:549
@ MAIN_FORKNUM
Definition: relpath.h:58
BlockNumber btpo_next
Definition: nbtree.h:66
uint16 btpo_flags
Definition: nbtree.h:68
BTCycleId btpo_cycleid
Definition: nbtree.h:69
IndexBulkDeleteResult * stats
Definition: nbtree.h:334
BTCycleId cycleid
Definition: nbtree.h:337
MemoryContext pagedelcontext
Definition: nbtree.h:338
IndexVacuumInfo * info
Definition: nbtree.h:333
double tuples_removed
Definition: genam.h:107
BufferAccessStrategy strategy
Definition: genam.h:80
Relation heaprel
Definition: genam.h:74
void vacuum_delay_point(bool is_analyze)
Definition: vacuum.c:2426

References _bt_checkpage(), _bt_delitems_vacuum(), _bt_lockbuf(), _bt_pagedel(), _bt_relbuf(), _bt_upgradelockbufcleanup(), Assert(), BT_READ, BTP_SPLIT_END, BTPageGetOpaque, BTPageIsRecyclable(), BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTreeTupleGetNPosting(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), btreevacuumposting(), buf, BufferGetBlockNumber(), BufferGetPage(), BTVacState::callback, callback(), BTVacState::callback_state, BTVacState::cycleid, ereport, errcode(), errmsg_internal(), IndexVacuumInfo::heaprel, i, IndexVacuumInfo::index, BTVacState::info, LOG, MAIN_FORKNUM, MarkBufferDirtyHint(), MaxIndexTuplesPerPage, MemoryContextReset(), MemoryContextSwitchTo(), IndexBulkDeleteResult::num_index_tuples, OffsetNumberNext, P_FIRSTDATAKEY, P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, P_NONE, P_RIGHTMOST, BTVacState::pagedelcontext, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), PageIsNew(), IndexBulkDeleteResult::pages_deleted, IndexBulkDeleteResult::pages_free, pfree(), RBM_NORMAL, ReadBufferExtended(), RecordFreeIndexPage(), RelationGetRelationName, BTVacState::stats, IndexVacuumInfo::strategy, IndexTupleData::t_tid, IndexBulkDeleteResult::tuples_removed, and vacuum_delay_point().

Referenced by btvacuumscan().
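
The backtracking rule buried in the leaf-page branch above is compact enough to restate on its own. A minimal restatement (needs_backtrack() is our own name; the predicate mirrors the test at source lines 1553-1557):

#include "postgres.h"
#include "access/nbtree.h"

/* Must we revisit a right sibling?  True only during a bulk-delete
 * scan (cycleid != 0), when this leaf was split during our scan and
 * the right half of the split landed at a block we already passed. */
static bool
needs_backtrack(BTPageOpaque opaque, BTCycleId cycleid,
                BlockNumber scanblkno)
{
    return cycleid != 0 &&
           opaque->btpo_cycleid == cycleid &&
           !(opaque->btpo_flags & BTP_SPLIT_END) &&
           !P_RIGHTMOST(opaque) &&
           opaque->btpo_next < scanblkno;
}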

◆ btvacuumscan()

static void btvacuumscan ( IndexVacuumInfo info,
IndexBulkDeleteResult stats,
IndexBulkDeleteCallback  callback,
void *  callback_state,
BTCycleId  cycleid 
)
static

Definition at line 1243 of file nbtree.c.

1246{
1247 Relation rel = info->index;
1248 BTVacState vstate;
1249 BlockNumber num_pages;
1250 bool needLock;
1251 BlockRangeReadStreamPrivate p;
1252 ReadStream *stream = NULL;
1253
1254 /*
1255 * Reset fields that track information about the entire index now. This
1256 * avoids double-counting in the case where a single VACUUM command
1257 * requires multiple scans of the index.
1258 *
1259 * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
1260 * since they track information about the VACUUM command, and so must last
1261 * across each call to btvacuumscan().
1262 *
1263 * (Note that pages_free is treated as state about the whole index, not
1264 * the current VACUUM. This is appropriate because RecordFreeIndexPage()
1265 * calls are idempotent, and get repeated for the same deleted pages in
1266 * some scenarios. The point for us is to track the number of recyclable
1267 * pages in the index at the end of the VACUUM command.)
1268 */
1269 stats->num_pages = 0;
1270 stats->num_index_tuples = 0;
1271 stats->pages_deleted = 0;
1272 stats->pages_free = 0;
1273
1274 /* Set up info to pass down to btvacuumpage */
1275 vstate.info = info;
1276 vstate.stats = stats;
1277 vstate.callback = callback;
1278 vstate.callback_state = callback_state;
1279 vstate.cycleid = cycleid;
1280
1281 /* Create a temporary memory context to run _bt_pagedel in */
1283 "_bt_pagedel",
1285
1286 /* Initialize vstate fields used by _bt_pendingfsm_finalize */
1287 vstate.bufsize = 0;
1288 vstate.maxbufsize = 0;
1289 vstate.pendingpages = NULL;
1290 vstate.npendingpages = 0;
1291 /* Consider applying _bt_pendingfsm_finalize optimization */
1292 _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
1293
1294 /*
1295 * The outer loop iterates over all index pages except the metapage, in
1296 * physical order (we hope the kernel will cooperate in providing
1297 * read-ahead for speed). It is critical that we visit all leaf pages,
1298 * including ones added after we start the scan, else we might fail to
1299 * delete some deletable tuples. Hence, we must repeatedly check the
1300 * relation length. We must acquire the relation-extension lock while
1301 * doing so to avoid a race condition: if someone else is extending the
1302 * relation, there is a window where bufmgr/smgr have created a new
1303 * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
1304 * we manage to scan such a page here, we'll improperly assume it can be
1305 * recycled. Taking the lock synchronizes things enough to prevent a
1306 * problem: either num_pages won't include the new page, or _bt_getbuf
1307 * already has write lock on the buffer and it will be fully initialized
1308 * before we can examine it. Also, we need not worry if a page is added
1309 * immediately after we look; the page splitting code already has
1310 * write-lock on the left page before it adds a right page, so we must
1311 * already have processed any tuples due to be moved into such a page.
1312 *
1313 * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
1314 * think the use of the extension lock is still required.
1315 *
1316 * We can skip locking for new or temp relations, however, since no one
1317 * else could be accessing them.
1318 */
1319 needLock = !RELATION_IS_LOCAL(rel);
1320
1321 p.current_blocknum = BTREE_METAPAGE + 1;
1322
1323 /*
1324 * It is safe to use batchmode as block_range_read_stream_cb takes no
1325 * locks.
1326 */
1327 stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
1328 READ_STREAM_FULL |
1329 READ_STREAM_USE_BATCHING,
1330 info->strategy,
1331 rel,
1332 MAIN_FORKNUM,
1333 block_range_read_stream_cb,
1334 &p,
1335 0);
1336 for (;;)
1337 {
1338 /* Get the current relation length */
1339 if (needLock)
1340 LockRelationForExtension(rel, ExclusiveLock);
1341 num_pages = RelationGetNumberOfBlocks(rel);
1342 if (needLock)
1343 UnlockRelationForExtension(rel, ExclusiveLock);
1344
1345 if (info->report_progress)
1346 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1347 num_pages);
1348
1349 /* Quit if we've scanned the whole relation */
1350 if (p.current_blocknum >= num_pages)
1351 break;
1352
1353 p.last_exclusive = num_pages;
1354
1355 /* Iterate over pages, then loop back to recheck relation length */
1356 while (true)
1357 {
1358 BlockNumber current_block;
1359 Buffer buf;
1360
1361 /* call vacuum_delay_point while not holding any buffer lock */
1362 vacuum_delay_point(false);
1363
1364 buf = read_stream_next_buffer(stream, NULL);
1365
1366 if (!BufferIsValid(buf))
1367 break;
1368
1369 current_block = btvacuumpage(&vstate, buf);
1370
1371 if (info->report_progress)
1372 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1373 current_block);
1374 }
1375
1376 /*
1377 * We have to reset the read stream to use it again. After returning
1378 * InvalidBuffer, the read stream API won't invoke our callback again
1379 * until the stream has been reset.
1380 */
1381 read_stream_reset(stream);
1382 }
1383
1384 read_stream_end(stream);
1385
1386 /* Set statistics num_pages field to final size of index */
1387 stats->num_pages = num_pages;
1388
1389 MemoryContextDelete(vstate.pagedelcontext);
1390
1391 /*
1392 * If there were any calls to _bt_pagedel() during scan of the index then
1393 * see if any of the resulting pages can be placed in the FSM now. When
1394 * it's not safe we'll have to leave it up to a future VACUUM operation.
1395 *
1396 * Finally, if we placed any pages in the FSM (either just now or during
1397 * the scan), forcibly update the upper-level FSM pages to ensure that
1398 * searchers can find them.
1399 */
1400 _bt_pendingfsm_finalize(rel, &vstate);
1401 if (stats->pages_free > 0)
1402 IndexFreeSpaceMapVacuum(rel);
1403}
void pgstat_progress_update_param(int index, int64 val)
int Buffer
Definition: buf.h:23
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:294
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:387
void IndexFreeSpaceMapVacuum(Relation rel)
Definition: indexfsm.c:71
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:474
#define ExclusiveLock
Definition: lockdefs.h:42
MemoryContext CurrentMemoryContext
Definition: mcxt.c:160
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:160
void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
Definition: nbtpage.c:3000
void _bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
Definition: nbtpage.c:2958
static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf)
Definition: nbtree.c:1418
#define PROGRESS_SCAN_BLOCKS_DONE
Definition: progress.h:142
#define PROGRESS_SCAN_BLOCKS_TOTAL
Definition: progress.h:141
void read_stream_reset(ReadStream *stream)
Definition: read_stream.c:1044
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
Definition: read_stream.c:791
ReadStream * read_stream_begin_relation(int flags, BufferAccessStrategy strategy, Relation rel, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
Definition: read_stream.c:737
void read_stream_end(ReadStream *stream)
Definition: read_stream.c:1089
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition: read_stream.c:162
#define READ_STREAM_MAINTENANCE
Definition: read_stream.h:28
#define READ_STREAM_USE_BATCHING
Definition: read_stream.h:64
#define READ_STREAM_FULL
Definition: read_stream.h:43
#define RELATION_IS_LOCAL(relation)
Definition: rel.h:658
BTPendingFSM * pendingpages
Definition: nbtree.h:345
int npendingpages
Definition: nbtree.h:346
int bufsize
Definition: nbtree.h:343
int maxbufsize
Definition: nbtree.h:344
BlockNumber num_pages
Definition: genam.h:104
bool report_progress
Definition: genam.h:76

References _bt_pendingfsm_finalize(), _bt_pendingfsm_init(), ALLOCSET_DEFAULT_SIZES, AllocSetContextCreate, block_range_read_stream_cb(), BTREE_METAPAGE, btvacuumpage(), buf, BufferIsValid(), BTVacState::bufsize, BTVacState::callback, callback(), BTVacState::callback_state, BlockRangeReadStreamPrivate::current_blocknum, CurrentMemoryContext, BTVacState::cycleid, ExclusiveLock, IndexVacuumInfo::index, IndexFreeSpaceMapVacuum(), BTVacState::info, BlockRangeReadStreamPrivate::last_exclusive, LockRelationForExtension(), MAIN_FORKNUM, BTVacState::maxbufsize, MemoryContextDelete(), BTVacState::npendingpages, IndexBulkDeleteResult::num_index_tuples, IndexBulkDeleteResult::num_pages, BTVacState::pagedelcontext, IndexBulkDeleteResult::pages_deleted, IndexBulkDeleteResult::pages_free, BTVacState::pendingpages, pgstat_progress_update_param(), PROGRESS_SCAN_BLOCKS_DONE, PROGRESS_SCAN_BLOCKS_TOTAL, read_stream_begin_relation(), read_stream_end(), READ_STREAM_FULL, READ_STREAM_MAINTENANCE, read_stream_next_buffer(), read_stream_reset(), READ_STREAM_USE_BATCHING, RELATION_IS_LOCAL, RelationGetNumberOfBlocks, IndexVacuumInfo::report_progress, BTVacState::stats, IndexVacuumInfo::strategy, UnlockRelationForExtension(), and vacuum_delay_point().

Referenced by btbulkdelete(), and btvacuumcleanup().
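
The read-stream pattern used above generalizes to any physical-order pass over a relation fork. A stripped-down sketch under our own names (scan_index_blocks() is hypothetical; progress reporting, extension locking, and the length-recheck loop are omitted):

#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/read_stream.h"
#include "utils/rel.h"

static void
scan_index_blocks(Relation rel, BlockNumber start, BlockNumber nblocks)
{
    BlockRangeReadStreamPrivate p;
    ReadStream *stream;
    Buffer buf;

    /* the callback hands out blocks in [start, nblocks) in order */
    p.current_blocknum = start;
    p.last_exclusive = nblocks;

    stream = read_stream_begin_relation(READ_STREAM_FULL, NULL, rel,
                                        MAIN_FORKNUM,
                                        block_range_read_stream_cb,
                                        &p, 0);

    while (BufferIsValid(buf = read_stream_next_buffer(stream, NULL)))
    {
        /* ... examine the buffer here ... */
        ReleaseBuffer(buf);
    }

    read_stream_end(stream);
}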