#include "postgres.h"#include "access/genam.h"#include "access/nbtree.h"#include "access/relscan.h"#include "catalog/index.h"#include "catalog/storage.h"#include "commands/vacuum.h"#include "storage/bufmgr.h"#include "storage/freespace.h"#include "storage/indexfsm.h"#include "storage/ipc.h"#include "storage/lmgr.h"#include "utils/memutils.h"

Go to the source code of this file.
Data Structures | |
| struct | BTBuildState |
| struct | BTVacState |
Functions | |
| static void | btbuildCallback (Relation index, HeapTuple htup, Datum *values, bool *isnull, bool tupleIsAlive, void *state) |
| static void | btvacuumscan (IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid) |
| static void | btvacuumpage (BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno) |
| Datum | btbuild (PG_FUNCTION_ARGS) |
| Datum | btinsert (PG_FUNCTION_ARGS) |
| Datum | btgettuple (PG_FUNCTION_ARGS) |
| Datum | btgetbitmap (PG_FUNCTION_ARGS) |
| Datum | btbeginscan (PG_FUNCTION_ARGS) |
| Datum | btrescan (PG_FUNCTION_ARGS) |
| Datum | btendscan (PG_FUNCTION_ARGS) |
| Datum | btmarkpos (PG_FUNCTION_ARGS) |
| Datum | btrestrpos (PG_FUNCTION_ARGS) |
| Datum | btbulkdelete (PG_FUNCTION_ARGS) |
| Datum | btvacuumcleanup (PG_FUNCTION_ARGS) |
| Datum btbeginscan | ( | PG_FUNCTION_ARGS | ) |
Definition at line 337 of file nbtree.c.
References PG_GETARG_INT32, PG_GETARG_POINTER, PG_RETURN_POINTER, and RelationGetIndexScan().
00338 { 00339 Relation rel = (Relation) PG_GETARG_POINTER(0); 00340 int keysz = PG_GETARG_INT32(1); 00341 ScanKey scankey = (ScanKey) PG_GETARG_POINTER(2); 00342 IndexScanDesc scan; 00343 00344 /* get the scan */ 00345 scan = RelationGetIndexScan(rel, keysz, scankey); 00346 00347 PG_RETURN_POINTER(scan); 00348 }
| Datum btbuild | ( | PG_FUNCTION_ARGS | ) |
Definition at line 84 of file nbtree.c.
References _bt_leafbuild(), _bt_spooldestroy(), _bt_spoolinit(), btbuildCallback(), elog, ERROR, BTBuildState::haveDead, IndexBuildResult::heap_tuples, BTBuildState::heapRel, IndexInfo::ii_Unique, IndexBuildResult::index_tuples, IndexBuildHeapScan(), BTBuildState::indtuples, BTBuildState::isUnique, log_btree_build_stats, NULL, palloc, PG_GETARG_POINTER, PG_RETURN_POINTER, RelationGetNumberOfBlocks(), RelationGetRelationName, ResetUsage(), ShowUsage(), BTBuildState::spool, and BTBuildState::spool2.
00085 { 00086 Relation heap = (Relation) PG_GETARG_POINTER(0); 00087 Relation index = (Relation) PG_GETARG_POINTER(1); 00088 IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); 00089 IndexBuildResult *result; 00090 double reltuples; 00091 BTBuildState buildstate; 00092 00093 buildstate.isUnique = indexInfo->ii_Unique; 00094 buildstate.haveDead = false; 00095 buildstate.heapRel = heap; 00096 buildstate.spool = NULL; 00097 buildstate.spool2 = NULL; 00098 buildstate.indtuples = 0; 00099 00100 #ifdef BTREE_BUILD_STATS 00101 if (log_btree_build_stats) 00102 ResetUsage(); 00103 #endif /* BTREE_BUILD_STATS */ 00104 00105 /* 00106 * We expect to be called exactly once for any index relation. If that's 00107 * not the case, big trouble's what we have. 00108 */ 00109 if (RelationGetNumberOfBlocks(index) != 0) 00110 elog(ERROR, "index \"%s\" already contains data", 00111 RelationGetRelationName(index)); 00112 00113 buildstate.spool = _bt_spoolinit(index, indexInfo->ii_Unique, false); 00114 00115 /* 00116 * If building a unique index, put dead tuples in a second spool to keep 00117 * them out of the uniqueness check. 00118 */ 00119 if (indexInfo->ii_Unique) 00120 buildstate.spool2 = _bt_spoolinit(index, false, true); 00121 00122 /* do the heap scan */ 00123 reltuples = IndexBuildHeapScan(heap, index, indexInfo, true, 00124 btbuildCallback, (void *) &buildstate); 00125 00126 /* okay, all heap tuples are indexed */ 00127 if (buildstate.spool2 && !buildstate.haveDead) 00128 { 00129 /* spool2 turns out to be unnecessary */ 00130 _bt_spooldestroy(buildstate.spool2); 00131 buildstate.spool2 = NULL; 00132 } 00133 00134 /* 00135 * Finish the build by (1) completing the sort of the spool file, (2) 00136 * inserting the sorted tuples into btree pages and (3) building the upper 00137 * levels. 00138 */ 00139 _bt_leafbuild(buildstate.spool, buildstate.spool2); 00140 _bt_spooldestroy(buildstate.spool); 00141 if (buildstate.spool2) 00142 _bt_spooldestroy(buildstate.spool2); 00143 00144 #ifdef BTREE_BUILD_STATS 00145 if (log_btree_build_stats) 00146 { 00147 ShowUsage("BTREE BUILD STATS"); 00148 ResetUsage(); 00149 } 00150 #endif /* BTREE_BUILD_STATS */ 00151 00152 /* 00153 * If we are reindexing a pre-existing index, it is critical to send out a 00154 * relcache invalidation SI message to ensure all backends re-read the 00155 * index metapage. We expect that the caller will ensure that happens 00156 * (typically as a side effect of updating index stats, but it must happen 00157 * even if the stats don't change!) 00158 */ 00159 00160 /* 00161 * Return statistics 00162 */ 00163 result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); 00164 00165 result->heap_tuples = reltuples; 00166 result->index_tuples = buildstate.indtuples; 00167 00168 PG_RETURN_POINTER(result); 00169 }
| static void btbuildCallback | ( | Relation | index, | |
| HeapTuple | htup, | |||
| Datum * | values, | |||
| bool * | isnull, | |||
| bool | tupleIsAlive, | |||
| void * | state | |||
| ) | [static] |
Definition at line 175 of file nbtree.c.
References _bt_spool(), BTBuildState::haveDead, index_form_tuple(), BTBuildState::indtuples, NULL, pfree(), RelationGetDescr, BTBuildState::spool, BTBuildState::spool2, HeapTupleData::t_self, and IndexTupleData::t_tid.
Referenced by btbuild().
00181 { 00182 BTBuildState *buildstate = (BTBuildState *) state; 00183 IndexTuple itup; 00184 00185 /* form an index tuple and point it at the heap tuple */ 00186 itup = index_form_tuple(RelationGetDescr(index), values, isnull); 00187 itup->t_tid = htup->t_self; 00188 00189 /* 00190 * insert the index tuple into the appropriate spool file for subsequent 00191 * processing 00192 */ 00193 if (tupleIsAlive || buildstate->spool2 == NULL) 00194 _bt_spool(itup, buildstate->spool); 00195 else 00196 { 00197 /* dead tuples are put into spool2 */ 00198 buildstate->haveDead = true; 00199 _bt_spool(itup, buildstate->spool2); 00200 } 00201 00202 buildstate->indtuples += 1; 00203 00204 pfree(itup); 00205 }
| Datum btbulkdelete | ( | PG_FUNCTION_ARGS | ) |
Definition at line 521 of file nbtree.c.
References _bt_end_vacuum(), _bt_end_vacuum_callback(), _bt_start_vacuum(), btvacuumscan(), callback(), IndexVacuumInfo::index, NULL, palloc0, PG_END_ENSURE_ERROR_CLEANUP, PG_ENSURE_ERROR_CLEANUP, PG_GETARG_POINTER, PG_RETURN_POINTER, and PointerGetDatum.
00522 { 00523 IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); 00524 IndexBulkDeleteResult *volatile stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); 00525 IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2); 00526 void *callback_state = (void *) PG_GETARG_POINTER(3); 00527 Relation rel = info->index; 00528 BTCycleId cycleid; 00529 00530 /* allocate stats if first time through, else re-use existing struct */ 00531 if (stats == NULL) 00532 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); 00533 00534 /* Establish the vacuum cycle ID to use for this scan */ 00535 /* The ENSURE stuff ensures we clean up shared memory on failure */ 00536 PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)); 00537 { 00538 cycleid = _bt_start_vacuum(rel); 00539 00540 btvacuumscan(info, stats, callback, callback_state, cycleid); 00541 } 00542 PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)); 00543 _bt_end_vacuum(rel); 00544 00545 PG_RETURN_POINTER(stats); 00546 }
| Datum btendscan | ( | PG_FUNCTION_ARGS | ) |
Definition at line 409 of file nbtree.c.
References _bt_killitems(), BTScanPosIsValid, BTScanPosData::buf, BTScanOpaqueData::currPos, InvalidBuffer, BTScanOpaqueData::keyData, BTScanOpaqueData::killedItems, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, NULL, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, pfree(), PG_GETARG_POINTER, PG_RETURN_VOID, and ReleaseBuffer().
00410 { 00411 IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); 00412 BTScanOpaque so = (BTScanOpaque) scan->opaque; 00413 00414 /* we aren't holding any read locks, but gotta drop the pins */ 00415 if (BTScanPosIsValid(so->currPos)) 00416 { 00417 /* Before leaving current page, deal with any killed items */ 00418 if (so->numKilled > 0) 00419 _bt_killitems(scan, false); 00420 ReleaseBuffer(so->currPos.buf); 00421 so->currPos.buf = InvalidBuffer; 00422 } 00423 00424 if (BTScanPosIsValid(so->markPos)) 00425 { 00426 ReleaseBuffer(so->markPos.buf); 00427 so->markPos.buf = InvalidBuffer; 00428 } 00429 so->markItemIndex = -1; 00430 00431 if (so->killedItems != NULL) 00432 pfree(so->killedItems); 00433 if (so->keyData != NULL) 00434 pfree(so->keyData); 00435 pfree(so); 00436 00437 PG_RETURN_VOID(); 00438 }
| Datum btgetbitmap | ( | PG_FUNCTION_ARGS | ) |
Definition at line 292 of file nbtree.c.
References _bt_first(), _bt_next(), BTScanOpaqueData::currPos, ForwardScanDirection, BTScanPosItem::heapTid, BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, IndexScanDescData::opaque, PG_GETARG_POINTER, PG_RETURN_INT64, HeapTupleData::t_self, tbm_add_tuples(), and IndexScanDescData::xs_ctup.
00293 { 00294 IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); 00295 TIDBitmap *tbm = (TIDBitmap *) PG_GETARG_POINTER(1); 00296 BTScanOpaque so = (BTScanOpaque) scan->opaque; 00297 int64 ntids = 0; 00298 ItemPointer heapTid; 00299 00300 /* Fetch the first page & tuple. */ 00301 if (!_bt_first(scan, ForwardScanDirection)) 00302 { 00303 /* empty scan */ 00304 PG_RETURN_INT64(0); 00305 } 00306 /* Save tuple ID, and continue scanning */ 00307 heapTid = &scan->xs_ctup.t_self; 00308 tbm_add_tuples(tbm, heapTid, 1, false); 00309 ntids++; 00310 00311 for (;;) 00312 { 00313 /* 00314 * Advance to next tuple within page. This is the same as the easy 00315 * case in _bt_next(). 00316 */ 00317 if (++so->currPos.itemIndex > so->currPos.lastItem) 00318 { 00319 /* let _bt_next do the heavy lifting */ 00320 if (!_bt_next(scan, ForwardScanDirection)) 00321 break; 00322 } 00323 00324 /* Save tuple ID, and continue scanning */ 00325 heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid; 00326 tbm_add_tuples(tbm, heapTid, 1, false); 00327 ntids++; 00328 } 00329 00330 PG_RETURN_INT64(ntids); 00331 }
| Datum btgettuple | ( | PG_FUNCTION_ARGS | ) |
Definition at line 240 of file nbtree.c.
References _bt_first(), _bt_next(), BTScanPosIsValid, BTScanOpaqueData::currPos, BTScanPosData::itemIndex, IndexScanDescData::kill_prior_tuple, BTScanOpaqueData::killedItems, MaxIndexTuplesPerPage, NULL, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, palloc, PG_GETARG_INT32, PG_GETARG_POINTER, PG_RETURN_BOOL, and IndexScanDescData::xs_recheck.
00241 { 00242 IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); 00243 ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1); 00244 BTScanOpaque so = (BTScanOpaque) scan->opaque; 00245 bool res; 00246 00247 /* btree indexes are never lossy */ 00248 scan->xs_recheck = false; 00249 00250 /* 00251 * If we've already initialized this scan, we can just advance it in the 00252 * appropriate direction. If we haven't done so yet, we call a routine to 00253 * get the first item in the scan. 00254 */ 00255 if (BTScanPosIsValid(so->currPos)) 00256 { 00257 /* 00258 * Check to see if we should kill the previously-fetched tuple. 00259 */ 00260 if (scan->kill_prior_tuple) 00261 { 00262 /* 00263 * Yes, remember it for later. (We'll deal with all such tuples 00264 * at once right before leaving the index page.) The test for 00265 * numKilled overrun is not just paranoia: if the caller reverses 00266 * direction in the indexscan then the same item might get entered 00267 * multiple times. It's not worth trying to optimize that, so we 00268 * don't detect it, but instead just forget any excess entries. 00269 */ 00270 if (so->killedItems == NULL) 00271 so->killedItems = (int *) 00272 palloc(MaxIndexTuplesPerPage * sizeof(int)); 00273 if (so->numKilled < MaxIndexTuplesPerPage) 00274 so->killedItems[so->numKilled++] = so->currPos.itemIndex; 00275 } 00276 00277 /* 00278 * Now continue the scan. 00279 */ 00280 res = _bt_next(scan, dir); 00281 } 00282 else 00283 res = _bt_first(scan, dir); 00284 00285 PG_RETURN_BOOL(res); 00286 }
| Datum btinsert | ( | PG_FUNCTION_ARGS | ) |
Definition at line 214 of file nbtree.c.
References _bt_doinsert(), index_form_tuple(), pfree(), PG_GETARG_INT32, PG_GETARG_POINTER, PG_RETURN_BOOL, RelationGetDescr, IndexTupleData::t_tid, and values.
00215 { 00216 Relation rel = (Relation) PG_GETARG_POINTER(0); 00217 Datum *values = (Datum *) PG_GETARG_POINTER(1); 00218 bool *isnull = (bool *) PG_GETARG_POINTER(2); 00219 ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3); 00220 Relation heapRel = (Relation) PG_GETARG_POINTER(4); 00221 IndexUniqueCheck checkUnique = (IndexUniqueCheck) PG_GETARG_INT32(5); 00222 bool result; 00223 IndexTuple itup; 00224 00225 /* generate an index tuple */ 00226 itup = index_form_tuple(RelationGetDescr(rel), values, isnull); 00227 itup->t_tid = *ht_ctid; 00228 00229 result = _bt_doinsert(rel, itup, checkUnique, heapRel); 00230 00231 pfree(itup); 00232 00233 PG_RETURN_BOOL(result); 00234 }
| Datum btmarkpos | ( | PG_FUNCTION_ARGS | ) |
Definition at line 444 of file nbtree.c.
References BTScanPosIsValid, BTScanPosData::buf, BTScanOpaqueData::currPos, InvalidBuffer, BTScanPosData::itemIndex, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, IndexScanDescData::opaque, PG_GETARG_POINTER, PG_RETURN_VOID, and ReleaseBuffer().
00445 { 00446 IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); 00447 BTScanOpaque so = (BTScanOpaque) scan->opaque; 00448 00449 /* we aren't holding any read locks, but gotta drop the pin */ 00450 if (BTScanPosIsValid(so->markPos)) 00451 { 00452 ReleaseBuffer(so->markPos.buf); 00453 so->markPos.buf = InvalidBuffer; 00454 } 00455 00456 /* 00457 * Just record the current itemIndex. If we later step to next page 00458 * before releasing the marked position, _bt_steppage makes a full copy of 00459 * the currPos struct in markPos. If (as often happens) the mark is moved 00460 * before we leave the page, we don't have to do that work. 00461 */ 00462 if (BTScanPosIsValid(so->currPos)) 00463 so->markItemIndex = so->currPos.itemIndex; 00464 else 00465 so->markItemIndex = -1; 00466 00467 PG_RETURN_VOID(); 00468 }
| Datum btrescan | ( | PG_FUNCTION_ARGS | ) |
Definition at line 354 of file nbtree.c.
References _bt_killitems(), BTScanPosIsValid, BTScanPosData::buf, BTScanOpaqueData::currPos, InvalidBuffer, IndexScanDescData::keyData, BTScanOpaqueData::keyData, BTScanOpaqueData::killedItems, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, memmove, NULL, BTScanOpaqueData::numberOfKeys, IndexScanDescData::numberOfKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, palloc, PG_GETARG_POINTER, PG_RETURN_VOID, and ReleaseBuffer().
00355 { 00356 IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); 00357 ScanKey scankey = (ScanKey) PG_GETARG_POINTER(1); 00358 BTScanOpaque so; 00359 00360 so = (BTScanOpaque) scan->opaque; 00361 00362 if (so == NULL) /* if called from btbeginscan */ 00363 { 00364 so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); 00365 so->currPos.buf = so->markPos.buf = InvalidBuffer; 00366 if (scan->numberOfKeys > 0) 00367 so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData)); 00368 else 00369 so->keyData = NULL; 00370 so->killedItems = NULL; /* until needed */ 00371 so->numKilled = 0; 00372 scan->opaque = so; 00373 } 00374 00375 /* we aren't holding any read locks, but gotta drop the pins */ 00376 if (BTScanPosIsValid(so->currPos)) 00377 { 00378 /* Before leaving current page, deal with any killed items */ 00379 if (so->numKilled > 0) 00380 _bt_killitems(scan, false); 00381 ReleaseBuffer(so->currPos.buf); 00382 so->currPos.buf = InvalidBuffer; 00383 } 00384 00385 if (BTScanPosIsValid(so->markPos)) 00386 { 00387 ReleaseBuffer(so->markPos.buf); 00388 so->markPos.buf = InvalidBuffer; 00389 } 00390 so->markItemIndex = -1; 00391 00392 /* 00393 * Reset the scan keys. Note that keys ordering stuff moved to _bt_first. 00394 * - vadim 05/05/97 00395 */ 00396 if (scankey && scan->numberOfKeys > 0) 00397 memmove(scan->keyData, 00398 scankey, 00399 scan->numberOfKeys * sizeof(ScanKeyData)); 00400 so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */ 00401 00402 PG_RETURN_VOID(); 00403 }
| Datum btrestrpos | ( | PG_FUNCTION_ARGS | ) |
Definition at line 474 of file nbtree.c.
References _bt_killitems(), BTScanPosIsValid, BTScanPosData::buf, BTScanOpaqueData::currPos, IncrBufferRefCount(), InvalidBuffer, BTScanPosData::itemIndex, BTScanPosData::lastItem, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::numKilled, offsetof, IndexScanDescData::opaque, PG_GETARG_POINTER, PG_RETURN_VOID, and ReleaseBuffer().
00475 { 00476 IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); 00477 BTScanOpaque so = (BTScanOpaque) scan->opaque; 00478 00479 if (so->markItemIndex >= 0) 00480 { 00481 /* 00482 * The mark position is on the same page we are currently on. Just 00483 * restore the itemIndex. 00484 */ 00485 so->currPos.itemIndex = so->markItemIndex; 00486 } 00487 else 00488 { 00489 /* we aren't holding any read locks, but gotta drop the pin */ 00490 if (BTScanPosIsValid(so->currPos)) 00491 { 00492 /* Before leaving current page, deal with any killed items */ 00493 if (so->numKilled > 0 && 00494 so->currPos.buf != so->markPos.buf) 00495 _bt_killitems(scan, false); 00496 ReleaseBuffer(so->currPos.buf); 00497 so->currPos.buf = InvalidBuffer; 00498 } 00499 00500 if (BTScanPosIsValid(so->markPos)) 00501 { 00502 /* bump pin on mark buffer for assignment to current buffer */ 00503 IncrBufferRefCount(so->markPos.buf); 00504 memcpy(&so->currPos, &so->markPos, 00505 offsetof(BTScanPosData, items[1]) + 00506 so->markPos.lastItem * sizeof(BTScanPosItem)); 00507 } 00508 } 00509 00510 PG_RETURN_VOID(); 00511 }
| Datum btvacuumcleanup | ( | PG_FUNCTION_ARGS | ) |
Definition at line 554 of file nbtree.c.
References IndexVacuumInfo::analyze_only, btvacuumscan(), IndexVacuumInfo::estimated_count, IndexVacuumInfo::index, IndexFreeSpaceMapVacuum(), NULL, IndexVacuumInfo::num_heap_tuples, IndexBulkDeleteResult::num_index_tuples, palloc0, PG_GETARG_POINTER, and PG_RETURN_POINTER.
00555 { 00556 IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); 00557 IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); 00558 00559 /* No-op in ANALYZE ONLY mode */ 00560 if (info->analyze_only) 00561 PG_RETURN_POINTER(stats); 00562 00563 /* 00564 * If btbulkdelete was called, we need not do anything, just return the 00565 * stats from the latest btbulkdelete call. If it wasn't called, we must 00566 * still do a pass over the index, to recycle any newly-recyclable pages 00567 * and to obtain index statistics. 00568 * 00569 * Since we aren't going to actually delete any leaf items, there's no 00570 * need to go through all the vacuum-cycle-ID pushups. 00571 */ 00572 if (stats == NULL) 00573 { 00574 stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); 00575 btvacuumscan(info, stats, NULL, NULL, 0); 00576 } 00577 00578 /* Finally, vacuum the FSM */ 00579 IndexFreeSpaceMapVacuum(info->index); 00580 00581 /* 00582 * It's quite possible for us to be fooled by concurrent page splits into 00583 * double-counting some index tuples, so disbelieve any total that exceeds 00584 * the underlying heap's count ... if we know that accurately. Otherwise 00585 * this might just make matters worse. 00586 */ 00587 if (!info->estimated_count) 00588 { 00589 if (stats->num_index_tuples > info->num_heap_tuples) 00590 stats->num_index_tuples = info->num_heap_tuples; 00591 } 00592 00593 PG_RETURN_POINTER(stats); 00594 }
| static void btvacuumpage | ( | BTVacState * | vstate, | |
| BlockNumber | blkno, | |||
| BlockNumber | orig_blkno | |||
| ) | [static] |
Definition at line 734 of file nbtree.c.
References _bt_checkpage(), _bt_delitems(), _bt_page_recyclable(), _bt_pagedel(), _bt_relbuf(), BT_READ, BTP_SPLIT_END, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage, BTVacState::callback, callback(), BTVacState::callback_state, BTVacState::cycleid, IndexVacuumInfo::index, BTVacState::info, BTVacState::lastBlockVacuumed, BTVacState::lastUsedPage, LockBuffer(), LockBufferForCleanup(), MAIN_FORKNUM, MaxOffsetNumber, MemoryContextReset(), MemoryContextSwitchTo(), NULL, IndexBulkDeleteResult::num_index_tuples, OffsetNumberNext, P_FIRSTDATAKEY, P_IGNORE, P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, P_NONE, P_RIGHTMOST, BTVacState::pagedelcontext, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, PageIsNew, IndexBulkDeleteResult::pages_deleted, RBM_NORMAL, ReadBufferExtended(), RecordFreeIndexPage(), SetBufferCommitInfoNeedsSave(), BTVacState::stats, IndexVacuumInfo::strategy, IndexTupleData::t_tid, BTVacState::totFreePages, IndexBulkDeleteResult::tuples_removed, and vacuum_delay_point().
Referenced by btvacuumscan().
00735 { 00736 IndexVacuumInfo *info = vstate->info; 00737 IndexBulkDeleteResult *stats = vstate->stats; 00738 IndexBulkDeleteCallback callback = vstate->callback; 00739 void *callback_state = vstate->callback_state; 00740 Relation rel = info->index; 00741 bool delete_now; 00742 BlockNumber recurse_to; 00743 Buffer buf; 00744 Page page; 00745 BTPageOpaque opaque; 00746 00747 restart: 00748 delete_now = false; 00749 recurse_to = P_NONE; 00750 00751 /* call vacuum_delay_point while not holding any buffer lock */ 00752 vacuum_delay_point(); 00753 00754 /* 00755 * We can't use _bt_getbuf() here because it always applies 00756 * _bt_checkpage(), which will barf on an all-zero page. We want to 00757 * recycle all-zero pages, not fail. Also, we want to use a nondefault 00758 * buffer access strategy. 00759 */ 00760 buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, 00761 info->strategy); 00762 LockBuffer(buf, BT_READ); 00763 page = BufferGetPage(buf); 00764 opaque = (BTPageOpaque) PageGetSpecialPointer(page); 00765 if (!PageIsNew(page)) 00766 _bt_checkpage(rel, buf); 00767 00768 /* 00769 * If we are recursing, the only case we want to do anything with is a 00770 * live leaf page having the current vacuum cycle ID. Any other state 00771 * implies we already saw the page (eg, deleted it as being empty). 00772 */ 00773 if (blkno != orig_blkno) 00774 { 00775 if (_bt_page_recyclable(page) || 00776 P_IGNORE(opaque) || 00777 !P_ISLEAF(opaque) || 00778 opaque->btpo_cycleid != vstate->cycleid) 00779 { 00780 _bt_relbuf(rel, buf); 00781 return; 00782 } 00783 } 00784 00785 /* If the page is in use, update lastUsedPage */ 00786 if (!_bt_page_recyclable(page) && vstate->lastUsedPage < blkno) 00787 vstate->lastUsedPage = blkno; 00788 00789 /* Page is valid, see what to do with it */ 00790 if (_bt_page_recyclable(page)) 00791 { 00792 /* Okay to recycle this page */ 00793 RecordFreeIndexPage(rel, blkno); 00794 vstate->totFreePages++; 00795 stats->pages_deleted++; 00796 } 00797 else if (P_ISDELETED(opaque)) 00798 { 00799 /* Already deleted, but can't recycle yet */ 00800 stats->pages_deleted++; 00801 } 00802 else if (P_ISHALFDEAD(opaque)) 00803 { 00804 /* Half-dead, try to delete */ 00805 delete_now = true; 00806 } 00807 else if (P_ISLEAF(opaque)) 00808 { 00809 OffsetNumber deletable[MaxOffsetNumber]; 00810 int ndeletable; 00811 OffsetNumber offnum, 00812 minoff, 00813 maxoff; 00814 00815 /* 00816 * Trade in the initial read lock for a super-exclusive write lock on 00817 * this page. We must get such a lock on every leaf page over the 00818 * course of the vacuum scan, whether or not it actually contains any 00819 * deletable tuples --- see nbtree/README. 00820 */ 00821 LockBuffer(buf, BUFFER_LOCK_UNLOCK); 00822 LockBufferForCleanup(buf); 00823 00824 /* 00825 * Check whether we need to recurse back to earlier pages. What we 00826 * are concerned about is a page split that happened since we started 00827 * the vacuum scan. If the split moved some tuples to a lower page 00828 * then we might have missed 'em. If so, set up for tail recursion. 00829 * (Must do this before possibly clearing btpo_cycleid below!) 00830 */ 00831 if (vstate->cycleid != 0 && 00832 opaque->btpo_cycleid == vstate->cycleid && 00833 !(opaque->btpo_flags & BTP_SPLIT_END) && 00834 !P_RIGHTMOST(opaque) && 00835 opaque->btpo_next < orig_blkno) 00836 recurse_to = opaque->btpo_next; 00837 00838 /* 00839 * Scan over all items to see which ones need deleted according to the 00840 * callback function. 00841 */ 00842 ndeletable = 0; 00843 minoff = P_FIRSTDATAKEY(opaque); 00844 maxoff = PageGetMaxOffsetNumber(page); 00845 if (callback) 00846 { 00847 for (offnum = minoff; 00848 offnum <= maxoff; 00849 offnum = OffsetNumberNext(offnum)) 00850 { 00851 IndexTuple itup; 00852 ItemPointer htup; 00853 00854 itup = (IndexTuple) PageGetItem(page, 00855 PageGetItemId(page, offnum)); 00856 htup = &(itup->t_tid); 00857 00858 /* 00859 * During Hot Standby we currently assume that 00860 * XLOG_BTREE_VACUUM records do not produce conflicts. That is 00861 * only true as long as the callback function depends only 00862 * upon whether the index tuple refers to heap tuples removed 00863 * in the initial heap scan. When vacuum starts it derives a 00864 * value of OldestXmin. Backends taking later snapshots could 00865 * have a RecentGlobalXmin with a later xid than the vacuum's 00866 * OldestXmin, so it is possible that row versions deleted 00867 * after OldestXmin could be marked as killed by other 00868 * backends. The callback function *could* look at the index 00869 * tuple state in isolation and decide to delete the index 00870 * tuple, though currently it does not. If it ever did, we 00871 * would need to reconsider whether XLOG_BTREE_VACUUM records 00872 * should cause conflicts. If they did cause conflicts they 00873 * would be fairly harsh conflicts, since we haven't yet 00874 * worked out a way to pass a useful value for 00875 * latestRemovedXid on the XLOG_BTREE_VACUUM records. This 00876 * applies to *any* type of index that marks index tuples as 00877 * killed. 00878 */ 00879 if (callback(htup, callback_state)) 00880 deletable[ndeletable++] = offnum; 00881 } 00882 } 00883 00884 /* 00885 * Apply any needed deletes. We issue just one _bt_delitems() call 00886 * per page, so as to minimize WAL traffic. 00887 */ 00888 if (ndeletable > 0) 00889 { 00890 BlockNumber lastBlockVacuumed = BufferGetBlockNumber(buf); 00891 00892 _bt_delitems(rel, buf, deletable, ndeletable, true, vstate->lastBlockVacuumed); 00893 00894 /* 00895 * Keep track of the block number of the lastBlockVacuumed, so we 00896 * can scan those blocks as well during WAL replay. This then 00897 * provides concurrency protection and allows btrees to be used 00898 * while in recovery. 00899 */ 00900 if (lastBlockVacuumed > vstate->lastBlockVacuumed) 00901 vstate->lastBlockVacuumed = lastBlockVacuumed; 00902 00903 stats->tuples_removed += ndeletable; 00904 /* must recompute maxoff */ 00905 maxoff = PageGetMaxOffsetNumber(page); 00906 } 00907 else 00908 { 00909 /* 00910 * If the page has been split during this vacuum cycle, it seems 00911 * worth expending a write to clear btpo_cycleid even if we don't 00912 * have any deletions to do. (If we do, _bt_delitems takes care 00913 * of this.) This ensures we won't process the page again. 00914 * 00915 * We treat this like a hint-bit update because there's no need to 00916 * WAL-log it. 00917 */ 00918 if (vstate->cycleid != 0 && 00919 opaque->btpo_cycleid == vstate->cycleid) 00920 { 00921 opaque->btpo_cycleid = 0; 00922 SetBufferCommitInfoNeedsSave(buf); 00923 } 00924 } 00925 00926 /* 00927 * If it's now empty, try to delete; else count the live tuples. We 00928 * don't delete when recursing, though, to avoid putting entries into 00929 * freePages out-of-order (doesn't seem worth any extra code to handle 00930 * the case). 00931 */ 00932 if (minoff > maxoff) 00933 delete_now = (blkno == orig_blkno); 00934 else 00935 stats->num_index_tuples += maxoff - minoff + 1; 00936 } 00937 00938 if (delete_now) 00939 { 00940 MemoryContext oldcontext; 00941 int ndel; 00942 00943 /* Run pagedel in a temp context to avoid memory leakage */ 00944 MemoryContextReset(vstate->pagedelcontext); 00945 oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext); 00946 00947 ndel = _bt_pagedel(rel, buf, NULL); 00948 00949 /* count only this page, else may double-count parent */ 00950 if (ndel) 00951 stats->pages_deleted++; 00952 00953 MemoryContextSwitchTo(oldcontext); 00954 /* pagedel released buffer, so we shouldn't */ 00955 } 00956 else 00957 _bt_relbuf(rel, buf); 00958 00959 /* 00960 * This is really tail recursion, but if the compiler is too stupid to 00961 * optimize it as such, we'd eat an uncomfortably large amount of stack 00962 * space per recursion level (due to the deletable[] array). A failure is 00963 * improbable since the number of levels isn't likely to be large ... but 00964 * just in case, let's hand-optimize into a loop. 00965 */ 00966 if (recurse_to != P_NONE) 00967 { 00968 blkno = recurse_to; 00969 goto restart; 00970 } 00971 }
| static void btvacuumscan | ( | IndexVacuumInfo * | info, | |
| IndexBulkDeleteResult * | stats, | |||
| IndexBulkDeleteCallback | callback, | |||
| void * | callback_state, | |||
| BTCycleId | cycleid | |||
| ) | [static] |
Definition at line 609 of file nbtree.c.
References _bt_delitems(), _bt_relbuf(), ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE, ALLOCSET_DEFAULT_MINSIZE, AllocSetContextCreate(), BTREE_METAPAGE, btvacuumpage(), BTVacState::callback, BTVacState::callback_state, CurrentMemoryContext, BTVacState::cycleid, IndexBulkDeleteResult::estimated_count, ExclusiveLock, IndexVacuumInfo::index, BTVacState::info, BTVacState::lastBlockVacuumed, BTVacState::lastUsedPage, LockBufferForCleanup(), LockRelationForExtension(), MAIN_FORKNUM, MemoryContextDelete(), NULL, IndexBulkDeleteResult::num_index_tuples, IndexBulkDeleteResult::num_pages, BTVacState::pagedelcontext, IndexBulkDeleteResult::pages_deleted, IndexBulkDeleteResult::pages_free, RBM_NORMAL, ReadBufferExtended(), RELATION_IS_LOCAL, RelationGetNumberOfBlocks(), BTVacState::stats, IndexVacuumInfo::strategy, BTVacState::totFreePages, UnlockRelationForExtension(), and XLogStandbyInfoActive.
Referenced by btbulkdelete(), and btvacuumcleanup().
00612 { 00613 Relation rel = info->index; 00614 BTVacState vstate; 00615 BlockNumber num_pages; 00616 BlockNumber blkno; 00617 bool needLock; 00618 00619 /* 00620 * Reset counts that will be incremented during the scan; needed in case 00621 * of multiple scans during a single VACUUM command 00622 */ 00623 stats->estimated_count = false; 00624 stats->num_index_tuples = 0; 00625 stats->pages_deleted = 0; 00626 00627 /* Set up info to pass down to btvacuumpage */ 00628 vstate.info = info; 00629 vstate.stats = stats; 00630 vstate.callback = callback; 00631 vstate.callback_state = callback_state; 00632 vstate.cycleid = cycleid; 00633 vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */ 00634 vstate.lastUsedPage = BTREE_METAPAGE; 00635 vstate.totFreePages = 0; 00636 00637 /* Create a temporary memory context to run _bt_pagedel in */ 00638 vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext, 00639 "_bt_pagedel", 00640 ALLOCSET_DEFAULT_MINSIZE, 00641 ALLOCSET_DEFAULT_INITSIZE, 00642 ALLOCSET_DEFAULT_MAXSIZE); 00643 00644 /* 00645 * The outer loop iterates over all index pages except the metapage, in 00646 * physical order (we hope the kernel will cooperate in providing 00647 * read-ahead for speed). It is critical that we visit all leaf pages, 00648 * including ones added after we start the scan, else we might fail to 00649 * delete some deletable tuples. Hence, we must repeatedly check the 00650 * relation length. We must acquire the relation-extension lock while 00651 * doing so to avoid a race condition: if someone else is extending the 00652 * relation, there is a window where bufmgr/smgr have created a new 00653 * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If 00654 * we manage to scan such a page here, we'll improperly assume it can be 00655 * recycled. Taking the lock synchronizes things enough to prevent a 00656 * problem: either num_pages won't include the new page, or _bt_getbuf 00657 * already has write lock on the buffer and it will be fully initialized 00658 * before we can examine it. (See also vacuumlazy.c, which has the same 00659 * issue.) Also, we need not worry if a page is added immediately after 00660 * we look; the page splitting code already has write-lock on the left 00661 * page before it adds a right page, so we must already have processed any 00662 * tuples due to be moved into such a page. 00663 * 00664 * We can skip locking for new or temp relations, however, since no one 00665 * else could be accessing them. 00666 */ 00667 needLock = !RELATION_IS_LOCAL(rel); 00668 00669 blkno = BTREE_METAPAGE + 1; 00670 for (;;) 00671 { 00672 /* Get the current relation length */ 00673 if (needLock) 00674 LockRelationForExtension(rel, ExclusiveLock); 00675 num_pages = RelationGetNumberOfBlocks(rel); 00676 if (needLock) 00677 UnlockRelationForExtension(rel, ExclusiveLock); 00678 00679 /* Quit if we've scanned the whole relation */ 00680 if (blkno >= num_pages) 00681 break; 00682 /* Iterate over pages, then loop back to recheck length */ 00683 for (; blkno < num_pages; blkno++) 00684 { 00685 btvacuumpage(&vstate, blkno, blkno); 00686 } 00687 } 00688 00689 /* 00690 * InHotStandby we need to scan right up to the end of the index for 00691 * correct locking, so we may need to write a WAL record for the final 00692 * block in the index if it was not vacuumed. It's possible that VACUUMing 00693 * has actually removed zeroed pages at the end of the index so we need to 00694 * take care to issue the record for last actual block and not for the 00695 * last block that was scanned. Ignore empty indexes. 00696 */ 00697 if (XLogStandbyInfoActive() && 00698 num_pages > 1 && vstate.lastBlockVacuumed < (num_pages - 1)) 00699 { 00700 Buffer buf; 00701 00702 /* 00703 * We can't use _bt_getbuf() here because it always applies 00704 * _bt_checkpage(), which will barf on an all-zero page. We want to 00705 * recycle all-zero pages, not fail. Also, we want to use a 00706 * nondefault buffer access strategy. 00707 */ 00708 buf = ReadBufferExtended(rel, MAIN_FORKNUM, num_pages - 1, RBM_NORMAL, 00709 info->strategy); 00710 LockBufferForCleanup(buf); 00711 _bt_delitems(rel, buf, NULL, 0, true, vstate.lastBlockVacuumed); 00712 _bt_relbuf(rel, buf); 00713 } 00714 00715 MemoryContextDelete(vstate.pagedelcontext); 00716 00717 /* update statistics */ 00718 stats->num_pages = num_pages; 00719 stats->pages_free = vstate.totFreePages; 00720 }
1.5.8