Header And Logo

PostgreSQL
| The world's most advanced open source database.

nbtree.c File Reference

#include "postgres.h"
#include "access/genam.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "catalog/index.h"
#include "catalog/storage.h"
#include "commands/vacuum.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "utils/memutils.h"

Include dependency graph for nbtree.c:

Go to the source code of this file.

Data Structures

struct  BTBuildState
struct  BTVacState

Functions

static void btbuildCallback (Relation index, HeapTuple htup, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
static void btvacuumscan (IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid)
static void btvacuumpage (BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
Datum btbuild (PG_FUNCTION_ARGS)
Datum btinsert (PG_FUNCTION_ARGS)
Datum btgettuple (PG_FUNCTION_ARGS)
Datum btgetbitmap (PG_FUNCTION_ARGS)
Datum btbeginscan (PG_FUNCTION_ARGS)
Datum btrescan (PG_FUNCTION_ARGS)
Datum btendscan (PG_FUNCTION_ARGS)
Datum btmarkpos (PG_FUNCTION_ARGS)
Datum btrestrpos (PG_FUNCTION_ARGS)
Datum btbulkdelete (PG_FUNCTION_ARGS)
Datum btvacuumcleanup (PG_FUNCTION_ARGS)


Function Documentation

Datum btbeginscan ( PG_FUNCTION_ARGS   ) 

Definition at line 337 of file nbtree.c.

References PG_GETARG_INT32, PG_GETARG_POINTER, PG_RETURN_POINTER, and RelationGetIndexScan().

00338 {
00339     Relation    rel = (Relation) PG_GETARG_POINTER(0);
00340     int         keysz = PG_GETARG_INT32(1);
00341     ScanKey     scankey = (ScanKey) PG_GETARG_POINTER(2);
00342     IndexScanDesc scan;
00343 
00344     /* get the scan */
00345     scan = RelationGetIndexScan(rel, keysz, scankey);
00346 
00347     PG_RETURN_POINTER(scan);
00348 }

Datum btbuild ( PG_FUNCTION_ARGS   ) 

Definition at line 84 of file nbtree.c.

References _bt_leafbuild(), _bt_spooldestroy(), _bt_spoolinit(), btbuildCallback(), elog, ERROR, BTBuildState::haveDead, IndexBuildResult::heap_tuples, BTBuildState::heapRel, IndexInfo::ii_Unique, IndexBuildResult::index_tuples, IndexBuildHeapScan(), BTBuildState::indtuples, BTBuildState::isUnique, log_btree_build_stats, NULL, palloc, PG_GETARG_POINTER, PG_RETURN_POINTER, RelationGetNumberOfBlocks(), RelationGetRelationName, ResetUsage(), ShowUsage(), BTBuildState::spool, and BTBuildState::spool2.

00085 {
00086     Relation    heap = (Relation) PG_GETARG_POINTER(0);
00087     Relation    index = (Relation) PG_GETARG_POINTER(1);
00088     IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
00089     IndexBuildResult *result;
00090     double      reltuples;
00091     BTBuildState buildstate;
00092 
00093     buildstate.isUnique = indexInfo->ii_Unique;
00094     buildstate.haveDead = false;
00095     buildstate.heapRel = heap;
00096     buildstate.spool = NULL;
00097     buildstate.spool2 = NULL;
00098     buildstate.indtuples = 0;
00099 
00100 #ifdef BTREE_BUILD_STATS
00101     if (log_btree_build_stats)
00102         ResetUsage();
00103 #endif   /* BTREE_BUILD_STATS */
00104 
00105     /*
00106      * We expect to be called exactly once for any index relation. If that's
00107      * not the case, big trouble's what we have.
00108      */
00109     if (RelationGetNumberOfBlocks(index) != 0)
00110         elog(ERROR, "index \"%s\" already contains data",
00111              RelationGetRelationName(index));
00112 
00113     buildstate.spool = _bt_spoolinit(index, indexInfo->ii_Unique, false);
00114 
00115     /*
00116      * If building a unique index, put dead tuples in a second spool to keep
00117      * them out of the uniqueness check.
00118      */
00119     if (indexInfo->ii_Unique)
00120         buildstate.spool2 = _bt_spoolinit(index, false, true);
00121 
00122     /* do the heap scan */
00123     reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
00124                                    btbuildCallback, (void *) &buildstate);
00125 
00126     /* okay, all heap tuples are indexed */
00127     if (buildstate.spool2 && !buildstate.haveDead)
00128     {
00129         /* spool2 turns out to be unnecessary */
00130         _bt_spooldestroy(buildstate.spool2);
00131         buildstate.spool2 = NULL;
00132     }
00133 
00134     /*
00135      * Finish the build by (1) completing the sort of the spool file, (2)
00136      * inserting the sorted tuples into btree pages and (3) building the upper
00137      * levels.
00138      */
00139     _bt_leafbuild(buildstate.spool, buildstate.spool2);
00140     _bt_spooldestroy(buildstate.spool);
00141     if (buildstate.spool2)
00142         _bt_spooldestroy(buildstate.spool2);
00143 
00144 #ifdef BTREE_BUILD_STATS
00145     if (log_btree_build_stats)
00146     {
00147         ShowUsage("BTREE BUILD STATS");
00148         ResetUsage();
00149     }
00150 #endif   /* BTREE_BUILD_STATS */
00151 
00152     /*
00153      * If we are reindexing a pre-existing index, it is critical to send out a
00154      * relcache invalidation SI message to ensure all backends re-read the
00155      * index metapage.  We expect that the caller will ensure that happens
00156      * (typically as a side effect of updating index stats, but it must happen
00157      * even if the stats don't change!)
00158      */
00159 
00160     /*
00161      * Return statistics
00162      */
00163     result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
00164 
00165     result->heap_tuples = reltuples;
00166     result->index_tuples = buildstate.indtuples;
00167 
00168     PG_RETURN_POINTER(result);
00169 }

static void btbuildCallback ( Relation  index,
HeapTuple  htup,
Datum values,
bool isnull,
bool  tupleIsAlive,
void *  state 
) [static]

Definition at line 175 of file nbtree.c.

References _bt_spool(), BTBuildState::haveDead, index_form_tuple(), BTBuildState::indtuples, NULL, pfree(), RelationGetDescr, BTBuildState::spool, BTBuildState::spool2, HeapTupleData::t_self, and IndexTupleData::t_tid.

Referenced by btbuild().

00181 {
00182     BTBuildState *buildstate = (BTBuildState *) state;
00183     IndexTuple  itup;
00184 
00185     /* form an index tuple and point it at the heap tuple */
00186     itup = index_form_tuple(RelationGetDescr(index), values, isnull);
00187     itup->t_tid = htup->t_self;
00188 
00189     /*
00190      * insert the index tuple into the appropriate spool file for subsequent
00191      * processing
00192      */
00193     if (tupleIsAlive || buildstate->spool2 == NULL)
00194         _bt_spool(itup, buildstate->spool);
00195     else
00196     {
00197         /* dead tuples are put into spool2 */
00198         buildstate->haveDead = true;
00199         _bt_spool(itup, buildstate->spool2);
00200     }
00201 
00202     buildstate->indtuples += 1;
00203 
00204     pfree(itup);
00205 }

Datum btbulkdelete ( PG_FUNCTION_ARGS   ) 

Definition at line 521 of file nbtree.c.

References _bt_end_vacuum(), _bt_end_vacuum_callback(), _bt_start_vacuum(), btvacuumscan(), callback(), IndexVacuumInfo::index, NULL, palloc0, PG_END_ENSURE_ERROR_CLEANUP, PG_ENSURE_ERROR_CLEANUP, PG_GETARG_POINTER, PG_RETURN_POINTER, and PointerGetDatum.

00522 {
00523     IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
00524     IndexBulkDeleteResult *volatile stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
00525     IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
00526     void       *callback_state = (void *) PG_GETARG_POINTER(3);
00527     Relation    rel = info->index;
00528     BTCycleId   cycleid;
00529 
00530     /* allocate stats if first time through, else re-use existing struct */
00531     if (stats == NULL)
00532         stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
00533 
00534     /* Establish the vacuum cycle ID to use for this scan */
00535     /* The ENSURE stuff ensures we clean up shared memory on failure */
00536     PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
00537     {
00538         cycleid = _bt_start_vacuum(rel);
00539 
00540         btvacuumscan(info, stats, callback, callback_state, cycleid);
00541     }
00542     PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
00543     _bt_end_vacuum(rel);
00544 
00545     PG_RETURN_POINTER(stats);
00546 }

Datum btendscan ( PG_FUNCTION_ARGS   ) 

Definition at line 409 of file nbtree.c.

References _bt_killitems(), BTScanPosIsValid, BTScanPosData::buf, BTScanOpaqueData::currPos, InvalidBuffer, BTScanOpaqueData::keyData, BTScanOpaqueData::killedItems, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, NULL, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, pfree(), PG_GETARG_POINTER, PG_RETURN_VOID, and ReleaseBuffer().

00410 {
00411     IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
00412     BTScanOpaque so = (BTScanOpaque) scan->opaque;
00413 
00414     /* we aren't holding any read locks, but gotta drop the pins */
00415     if (BTScanPosIsValid(so->currPos))
00416     {
00417         /* Before leaving current page, deal with any killed items */
00418         if (so->numKilled > 0)
00419             _bt_killitems(scan, false);
00420         ReleaseBuffer(so->currPos.buf);
00421         so->currPos.buf = InvalidBuffer;
00422     }
00423 
00424     if (BTScanPosIsValid(so->markPos))
00425     {
00426         ReleaseBuffer(so->markPos.buf);
00427         so->markPos.buf = InvalidBuffer;
00428     }
00429     so->markItemIndex = -1;
00430 
00431     if (so->killedItems != NULL)
00432         pfree(so->killedItems);
00433     if (so->keyData != NULL)
00434         pfree(so->keyData);
00435     pfree(so);
00436 
00437     PG_RETURN_VOID();
00438 }

Datum btgetbitmap ( PG_FUNCTION_ARGS   ) 

Definition at line 292 of file nbtree.c.

References _bt_first(), _bt_next(), BTScanOpaqueData::currPos, ForwardScanDirection, BTScanPosItem::heapTid, BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, IndexScanDescData::opaque, PG_GETARG_POINTER, PG_RETURN_INT64, HeapTupleData::t_self, tbm_add_tuples(), and IndexScanDescData::xs_ctup.

00293 {
00294     IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
00295     TIDBitmap  *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
00296     BTScanOpaque so = (BTScanOpaque) scan->opaque;
00297     int64       ntids = 0;
00298     ItemPointer heapTid;
00299 
00300     /* Fetch the first page & tuple. */
00301     if (!_bt_first(scan, ForwardScanDirection))
00302     {
00303         /* empty scan */
00304         PG_RETURN_INT64(0);
00305     }
00306     /* Save tuple ID, and continue scanning */
00307     heapTid = &scan->xs_ctup.t_self;
00308     tbm_add_tuples(tbm, heapTid, 1, false);
00309     ntids++;
00310 
00311     for (;;)
00312     {
00313         /*
00314          * Advance to next tuple within page.  This is the same as the easy
00315          * case in _bt_next().
00316          */
00317         if (++so->currPos.itemIndex > so->currPos.lastItem)
00318         {
00319             /* let _bt_next do the heavy lifting */
00320             if (!_bt_next(scan, ForwardScanDirection))
00321                 break;
00322         }
00323 
00324         /* Save tuple ID, and continue scanning */
00325         heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
00326         tbm_add_tuples(tbm, heapTid, 1, false);
00327         ntids++;
00328     }
00329 
00330     PG_RETURN_INT64(ntids);
00331 }

Datum btgettuple ( PG_FUNCTION_ARGS   ) 

Definition at line 240 of file nbtree.c.

References _bt_first(), _bt_next(), BTScanPosIsValid, BTScanOpaqueData::currPos, BTScanPosData::itemIndex, IndexScanDescData::kill_prior_tuple, BTScanOpaqueData::killedItems, MaxIndexTuplesPerPage, NULL, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, palloc, PG_GETARG_INT32, PG_GETARG_POINTER, PG_RETURN_BOOL, and IndexScanDescData::xs_recheck.

00241 {
00242     IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
00243     ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
00244     BTScanOpaque so = (BTScanOpaque) scan->opaque;
00245     bool        res;
00246 
00247     /* btree indexes are never lossy */
00248     scan->xs_recheck = false;
00249 
00250     /*
00251      * If we've already initialized this scan, we can just advance it in the
00252      * appropriate direction.  If we haven't done so yet, we call a routine to
00253      * get the first item in the scan.
00254      */
00255     if (BTScanPosIsValid(so->currPos))
00256     {
00257         /*
00258          * Check to see if we should kill the previously-fetched tuple.
00259          */
00260         if (scan->kill_prior_tuple)
00261         {
00262             /*
00263              * Yes, remember it for later.  (We'll deal with all such tuples
00264              * at once right before leaving the index page.)  The test for
00265              * numKilled overrun is not just paranoia: if the caller reverses
00266              * direction in the indexscan then the same item might get entered
00267              * multiple times.  It's not worth trying to optimize that, so we
00268              * don't detect it, but instead just forget any excess entries.
00269              */
00270             if (so->killedItems == NULL)
00271                 so->killedItems = (int *)
00272                     palloc(MaxIndexTuplesPerPage * sizeof(int));
00273             if (so->numKilled < MaxIndexTuplesPerPage)
00274                 so->killedItems[so->numKilled++] = so->currPos.itemIndex;
00275         }
00276 
00277         /*
00278          * Now continue the scan.
00279          */
00280         res = _bt_next(scan, dir);
00281     }
00282     else
00283         res = _bt_first(scan, dir);
00284 
00285     PG_RETURN_BOOL(res);
00286 }

Datum btinsert ( PG_FUNCTION_ARGS   ) 

Definition at line 214 of file nbtree.c.

References _bt_doinsert(), index_form_tuple(), pfree(), PG_GETARG_INT32, PG_GETARG_POINTER, PG_RETURN_BOOL, RelationGetDescr, IndexTupleData::t_tid, and values.

00215 {
00216     Relation    rel = (Relation) PG_GETARG_POINTER(0);
00217     Datum      *values = (Datum *) PG_GETARG_POINTER(1);
00218     bool       *isnull = (bool *) PG_GETARG_POINTER(2);
00219     ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3);
00220     Relation    heapRel = (Relation) PG_GETARG_POINTER(4);
00221     IndexUniqueCheck checkUnique = (IndexUniqueCheck) PG_GETARG_INT32(5);
00222     bool        result;
00223     IndexTuple  itup;
00224 
00225     /* generate an index tuple */
00226     itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
00227     itup->t_tid = *ht_ctid;
00228 
00229     result = _bt_doinsert(rel, itup, checkUnique, heapRel);
00230 
00231     pfree(itup);
00232 
00233     PG_RETURN_BOOL(result);
00234 }

Datum btmarkpos ( PG_FUNCTION_ARGS   ) 

Definition at line 444 of file nbtree.c.

References BTScanPosIsValid, BTScanPosData::buf, BTScanOpaqueData::currPos, InvalidBuffer, BTScanPosData::itemIndex, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, IndexScanDescData::opaque, PG_GETARG_POINTER, PG_RETURN_VOID, and ReleaseBuffer().

00445 {
00446     IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
00447     BTScanOpaque so = (BTScanOpaque) scan->opaque;
00448 
00449     /* we aren't holding any read locks, but gotta drop the pin */
00450     if (BTScanPosIsValid(so->markPos))
00451     {
00452         ReleaseBuffer(so->markPos.buf);
00453         so->markPos.buf = InvalidBuffer;
00454     }
00455 
00456     /*
00457      * Just record the current itemIndex.  If we later step to next page
00458      * before releasing the marked position, _bt_steppage makes a full copy of
00459      * the currPos struct in markPos.  If (as often happens) the mark is moved
00460      * before we leave the page, we don't have to do that work.
00461      */
00462     if (BTScanPosIsValid(so->currPos))
00463         so->markItemIndex = so->currPos.itemIndex;
00464     else
00465         so->markItemIndex = -1;
00466 
00467     PG_RETURN_VOID();
00468 }

Datum btrescan ( PG_FUNCTION_ARGS   ) 

Definition at line 354 of file nbtree.c.

References _bt_killitems(), BTScanPosIsValid, BTScanPosData::buf, BTScanOpaqueData::currPos, InvalidBuffer, IndexScanDescData::keyData, BTScanOpaqueData::keyData, BTScanOpaqueData::killedItems, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, memmove, NULL, BTScanOpaqueData::numberOfKeys, IndexScanDescData::numberOfKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, palloc, PG_GETARG_POINTER, PG_RETURN_VOID, and ReleaseBuffer().

00355 {
00356     IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
00357     ScanKey     scankey = (ScanKey) PG_GETARG_POINTER(1);
00358     BTScanOpaque so;
00359 
00360     so = (BTScanOpaque) scan->opaque;
00361 
00362     if (so == NULL)             /* if called from btbeginscan */
00363     {
00364         so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
00365         so->currPos.buf = so->markPos.buf = InvalidBuffer;
00366         if (scan->numberOfKeys > 0)
00367             so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
00368         else
00369             so->keyData = NULL;
00370         so->killedItems = NULL; /* until needed */
00371         so->numKilled = 0;
00372         scan->opaque = so;
00373     }
00374 
00375     /* we aren't holding any read locks, but gotta drop the pins */
00376     if (BTScanPosIsValid(so->currPos))
00377     {
00378         /* Before leaving current page, deal with any killed items */
00379         if (so->numKilled > 0)
00380             _bt_killitems(scan, false);
00381         ReleaseBuffer(so->currPos.buf);
00382         so->currPos.buf = InvalidBuffer;
00383     }
00384 
00385     if (BTScanPosIsValid(so->markPos))
00386     {
00387         ReleaseBuffer(so->markPos.buf);
00388         so->markPos.buf = InvalidBuffer;
00389     }
00390     so->markItemIndex = -1;
00391 
00392     /*
00393      * Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
00394      * - vadim 05/05/97
00395      */
00396     if (scankey && scan->numberOfKeys > 0)
00397         memmove(scan->keyData,
00398                 scankey,
00399                 scan->numberOfKeys * sizeof(ScanKeyData));
00400     so->numberOfKeys = 0;       /* until _bt_preprocess_keys sets it */
00401 
00402     PG_RETURN_VOID();
00403 }

Datum btrestrpos ( PG_FUNCTION_ARGS   ) 

Definition at line 474 of file nbtree.c.

References _bt_killitems(), BTScanPosIsValid, BTScanPosData::buf, BTScanOpaqueData::currPos, IncrBufferRefCount(), InvalidBuffer, BTScanPosData::itemIndex, BTScanPosData::lastItem, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::numKilled, offsetof, IndexScanDescData::opaque, PG_GETARG_POINTER, PG_RETURN_VOID, and ReleaseBuffer().

00475 {
00476     IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
00477     BTScanOpaque so = (BTScanOpaque) scan->opaque;
00478 
00479     if (so->markItemIndex >= 0)
00480     {
00481         /*
00482          * The mark position is on the same page we are currently on. Just
00483          * restore the itemIndex.
00484          */
00485         so->currPos.itemIndex = so->markItemIndex;
00486     }
00487     else
00488     {
00489         /* we aren't holding any read locks, but gotta drop the pin */
00490         if (BTScanPosIsValid(so->currPos))
00491         {
00492             /* Before leaving current page, deal with any killed items */
00493             if (so->numKilled > 0 &&
00494                 so->currPos.buf != so->markPos.buf)
00495                 _bt_killitems(scan, false);
00496             ReleaseBuffer(so->currPos.buf);
00497             so->currPos.buf = InvalidBuffer;
00498         }
00499 
00500         if (BTScanPosIsValid(so->markPos))
00501         {
00502             /* bump pin on mark buffer for assignment to current buffer */
00503             IncrBufferRefCount(so->markPos.buf);
00504             memcpy(&so->currPos, &so->markPos,
00505                    offsetof(BTScanPosData, items[1]) +
00506                    so->markPos.lastItem * sizeof(BTScanPosItem));
00507         }
00508     }
00509 
00510     PG_RETURN_VOID();
00511 }

Datum btvacuumcleanup ( PG_FUNCTION_ARGS   ) 

Definition at line 554 of file nbtree.c.

References IndexVacuumInfo::analyze_only, btvacuumscan(), IndexVacuumInfo::estimated_count, IndexVacuumInfo::index, IndexFreeSpaceMapVacuum(), NULL, IndexVacuumInfo::num_heap_tuples, IndexBulkDeleteResult::num_index_tuples, palloc0, PG_GETARG_POINTER, and PG_RETURN_POINTER.

00555 {
00556     IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
00557     IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
00558 
00559     /* No-op in ANALYZE ONLY mode */
00560     if (info->analyze_only)
00561         PG_RETURN_POINTER(stats);
00562 
00563     /*
00564      * If btbulkdelete was called, we need not do anything, just return the
00565      * stats from the latest btbulkdelete call.  If it wasn't called, we must
00566      * still do a pass over the index, to recycle any newly-recyclable pages
00567      * and to obtain index statistics.
00568      *
00569      * Since we aren't going to actually delete any leaf items, there's no
00570      * need to go through all the vacuum-cycle-ID pushups.
00571      */
00572     if (stats == NULL)
00573     {
00574         stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
00575         btvacuumscan(info, stats, NULL, NULL, 0);
00576     }
00577 
00578     /* Finally, vacuum the FSM */
00579     IndexFreeSpaceMapVacuum(info->index);
00580 
00581     /*
00582      * It's quite possible for us to be fooled by concurrent page splits into
00583      * double-counting some index tuples, so disbelieve any total that exceeds
00584      * the underlying heap's count ... if we know that accurately.  Otherwise
00585      * this might just make matters worse.
00586      */
00587     if (!info->estimated_count)
00588     {
00589         if (stats->num_index_tuples > info->num_heap_tuples)
00590             stats->num_index_tuples = info->num_heap_tuples;
00591     }
00592 
00593     PG_RETURN_POINTER(stats);
00594 }

static void btvacuumpage ( BTVacState vstate,
BlockNumber  blkno,
BlockNumber  orig_blkno 
) [static]

Definition at line 734 of file nbtree.c.

References _bt_checkpage(), _bt_delitems(), _bt_page_recyclable(), _bt_pagedel(), _bt_relbuf(), BT_READ, BTP_SPLIT_END, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage, BTVacState::callback, callback(), BTVacState::callback_state, BTVacState::cycleid, IndexVacuumInfo::index, BTVacState::info, BTVacState::lastBlockVacuumed, BTVacState::lastUsedPage, LockBuffer(), LockBufferForCleanup(), MAIN_FORKNUM, MaxOffsetNumber, MemoryContextReset(), MemoryContextSwitchTo(), NULL, IndexBulkDeleteResult::num_index_tuples, OffsetNumberNext, P_FIRSTDATAKEY, P_IGNORE, P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, P_NONE, P_RIGHTMOST, BTVacState::pagedelcontext, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, PageIsNew, IndexBulkDeleteResult::pages_deleted, RBM_NORMAL, ReadBufferExtended(), RecordFreeIndexPage(), SetBufferCommitInfoNeedsSave(), BTVacState::stats, IndexVacuumInfo::strategy, IndexTupleData::t_tid, BTVacState::totFreePages, IndexBulkDeleteResult::tuples_removed, and vacuum_delay_point().

Referenced by btvacuumscan().

00735 {
00736     IndexVacuumInfo *info = vstate->info;
00737     IndexBulkDeleteResult *stats = vstate->stats;
00738     IndexBulkDeleteCallback callback = vstate->callback;
00739     void       *callback_state = vstate->callback_state;
00740     Relation    rel = info->index;
00741     bool        delete_now;
00742     BlockNumber recurse_to;
00743     Buffer      buf;
00744     Page        page;
00745     BTPageOpaque opaque;
00746 
00747 restart:
00748     delete_now = false;
00749     recurse_to = P_NONE;
00750 
00751     /* call vacuum_delay_point while not holding any buffer lock */
00752     vacuum_delay_point();
00753 
00754     /*
00755      * We can't use _bt_getbuf() here because it always applies
00756      * _bt_checkpage(), which will barf on an all-zero page. We want to
00757      * recycle all-zero pages, not fail.  Also, we want to use a nondefault
00758      * buffer access strategy.
00759      */
00760     buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
00761                              info->strategy);
00762     LockBuffer(buf, BT_READ);
00763     page = BufferGetPage(buf);
00764     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
00765     if (!PageIsNew(page))
00766         _bt_checkpage(rel, buf);
00767 
00768     /*
00769      * If we are recursing, the only case we want to do anything with is a
00770      * live leaf page having the current vacuum cycle ID.  Any other state
00771      * implies we already saw the page (eg, deleted it as being empty).
00772      */
00773     if (blkno != orig_blkno)
00774     {
00775         if (_bt_page_recyclable(page) ||
00776             P_IGNORE(opaque) ||
00777             !P_ISLEAF(opaque) ||
00778             opaque->btpo_cycleid != vstate->cycleid)
00779         {
00780             _bt_relbuf(rel, buf);
00781             return;
00782         }
00783     }
00784 
00785     /* If the page is in use, update lastUsedPage */
00786     if (!_bt_page_recyclable(page) && vstate->lastUsedPage < blkno)
00787         vstate->lastUsedPage = blkno;
00788 
00789     /* Page is valid, see what to do with it */
00790     if (_bt_page_recyclable(page))
00791     {
00792         /* Okay to recycle this page */
00793         RecordFreeIndexPage(rel, blkno);
00794         vstate->totFreePages++;
00795         stats->pages_deleted++;
00796     }
00797     else if (P_ISDELETED(opaque))
00798     {
00799         /* Already deleted, but can't recycle yet */
00800         stats->pages_deleted++;
00801     }
00802     else if (P_ISHALFDEAD(opaque))
00803     {
00804         /* Half-dead, try to delete */
00805         delete_now = true;
00806     }
00807     else if (P_ISLEAF(opaque))
00808     {
00809         OffsetNumber deletable[MaxOffsetNumber];
00810         int         ndeletable;
00811         OffsetNumber offnum,
00812                     minoff,
00813                     maxoff;
00814 
00815         /*
00816          * Trade in the initial read lock for a super-exclusive write lock on
00817          * this page.  We must get such a lock on every leaf page over the
00818          * course of the vacuum scan, whether or not it actually contains any
00819          * deletable tuples --- see nbtree/README.
00820          */
00821         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
00822         LockBufferForCleanup(buf);
00823 
00824         /*
00825          * Check whether we need to recurse back to earlier pages.  What we
00826          * are concerned about is a page split that happened since we started
00827          * the vacuum scan.  If the split moved some tuples to a lower page
00828          * then we might have missed 'em.  If so, set up for tail recursion.
00829          * (Must do this before possibly clearing btpo_cycleid below!)
00830          */
00831         if (vstate->cycleid != 0 &&
00832             opaque->btpo_cycleid == vstate->cycleid &&
00833             !(opaque->btpo_flags & BTP_SPLIT_END) &&
00834             !P_RIGHTMOST(opaque) &&
00835             opaque->btpo_next < orig_blkno)
00836             recurse_to = opaque->btpo_next;
00837 
00838         /*
00839          * Scan over all items to see which ones need deleted according to the
00840          * callback function.
00841          */
00842         ndeletable = 0;
00843         minoff = P_FIRSTDATAKEY(opaque);
00844         maxoff = PageGetMaxOffsetNumber(page);
00845         if (callback)
00846         {
00847             for (offnum = minoff;
00848                  offnum <= maxoff;
00849                  offnum = OffsetNumberNext(offnum))
00850             {
00851                 IndexTuple  itup;
00852                 ItemPointer htup;
00853 
00854                 itup = (IndexTuple) PageGetItem(page,
00855                                                 PageGetItemId(page, offnum));
00856                 htup = &(itup->t_tid);
00857 
00858                 /*
00859                  * During Hot Standby we currently assume that
00860                  * XLOG_BTREE_VACUUM records do not produce conflicts. That is
00861                  * only true as long as the callback function depends only
00862                  * upon whether the index tuple refers to heap tuples removed
00863                  * in the initial heap scan. When vacuum starts it derives a
00864                  * value of OldestXmin. Backends taking later snapshots could
00865                  * have a RecentGlobalXmin with a later xid than the vacuum's
00866                  * OldestXmin, so it is possible that row versions deleted
00867                  * after OldestXmin could be marked as killed by other
00868                  * backends. The callback function *could* look at the index
00869                  * tuple state in isolation and decide to delete the index
00870                  * tuple, though currently it does not. If it ever did, we
00871                  * would need to reconsider whether XLOG_BTREE_VACUUM records
00872                  * should cause conflicts. If they did cause conflicts they
00873                  * would be fairly harsh conflicts, since we haven't yet
00874                  * worked out a way to pass a useful value for
00875                  * latestRemovedXid on the XLOG_BTREE_VACUUM records. This
00876                  * applies to *any* type of index that marks index tuples as
00877                  * killed.
00878                  */
00879                 if (callback(htup, callback_state))
00880                     deletable[ndeletable++] = offnum;
00881             }
00882         }
00883 
00884         /*
00885          * Apply any needed deletes.  We issue just one _bt_delitems() call
00886          * per page, so as to minimize WAL traffic.
00887          */
00888         if (ndeletable > 0)
00889         {
00890             BlockNumber lastBlockVacuumed = BufferGetBlockNumber(buf);
00891 
00892             _bt_delitems(rel, buf, deletable, ndeletable, true, vstate->lastBlockVacuumed);
00893 
00894             /*
00895              * Keep track of the block number of the lastBlockVacuumed, so we
00896              * can scan those blocks as well during WAL replay. This then
00897              * provides concurrency protection and allows btrees to be used
00898              * while in recovery.
00899              */
00900             if (lastBlockVacuumed > vstate->lastBlockVacuumed)
00901                 vstate->lastBlockVacuumed = lastBlockVacuumed;
00902 
00903             stats->tuples_removed += ndeletable;
00904             /* must recompute maxoff */
00905             maxoff = PageGetMaxOffsetNumber(page);
00906         }
00907         else
00908         {
00909             /*
00910              * If the page has been split during this vacuum cycle, it seems
00911              * worth expending a write to clear btpo_cycleid even if we don't
00912              * have any deletions to do.  (If we do, _bt_delitems takes care
00913              * of this.)  This ensures we won't process the page again.
00914              *
00915              * We treat this like a hint-bit update because there's no need to
00916              * WAL-log it.
00917              */
00918             if (vstate->cycleid != 0 &&
00919                 opaque->btpo_cycleid == vstate->cycleid)
00920             {
00921                 opaque->btpo_cycleid = 0;
00922                 SetBufferCommitInfoNeedsSave(buf);
00923             }
00924         }
00925 
00926         /*
00927          * If it's now empty, try to delete; else count the live tuples. We
00928          * don't delete when recursing, though, to avoid putting entries into
00929          * freePages out-of-order (doesn't seem worth any extra code to handle
00930          * the case).
00931          */
00932         if (minoff > maxoff)
00933             delete_now = (blkno == orig_blkno);
00934         else
00935             stats->num_index_tuples += maxoff - minoff + 1;
00936     }
00937 
00938     if (delete_now)
00939     {
00940         MemoryContext oldcontext;
00941         int         ndel;
00942 
00943         /* Run pagedel in a temp context to avoid memory leakage */
00944         MemoryContextReset(vstate->pagedelcontext);
00945         oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
00946 
00947         ndel = _bt_pagedel(rel, buf, NULL);
00948 
00949         /* count only this page, else may double-count parent */
00950         if (ndel)
00951             stats->pages_deleted++;
00952 
00953         MemoryContextSwitchTo(oldcontext);
00954         /* pagedel released buffer, so we shouldn't */
00955     }
00956     else
00957         _bt_relbuf(rel, buf);
00958 
00959     /*
00960      * This is really tail recursion, but if the compiler is too stupid to
00961      * optimize it as such, we'd eat an uncomfortably large amount of stack
00962      * space per recursion level (due to the deletable[] array). A failure is
00963      * improbable since the number of levels isn't likely to be large ... but
00964      * just in case, let's hand-optimize into a loop.
00965      */
00966     if (recurse_to != P_NONE)
00967     {
00968         blkno = recurse_to;
00969         goto restart;
00970     }
00971 }

static void btvacuumscan ( IndexVacuumInfo info,
IndexBulkDeleteResult stats,
IndexBulkDeleteCallback  callback,
void *  callback_state,
BTCycleId  cycleid 
) [static]

Definition at line 609 of file nbtree.c.

References _bt_delitems(), _bt_relbuf(), ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE, ALLOCSET_DEFAULT_MINSIZE, AllocSetContextCreate(), BTREE_METAPAGE, btvacuumpage(), BTVacState::callback, BTVacState::callback_state, CurrentMemoryContext, BTVacState::cycleid, IndexBulkDeleteResult::estimated_count, ExclusiveLock, IndexVacuumInfo::index, BTVacState::info, BTVacState::lastBlockVacuumed, BTVacState::lastUsedPage, LockBufferForCleanup(), LockRelationForExtension(), MAIN_FORKNUM, MemoryContextDelete(), NULL, IndexBulkDeleteResult::num_index_tuples, IndexBulkDeleteResult::num_pages, BTVacState::pagedelcontext, IndexBulkDeleteResult::pages_deleted, IndexBulkDeleteResult::pages_free, RBM_NORMAL, ReadBufferExtended(), RELATION_IS_LOCAL, RelationGetNumberOfBlocks(), BTVacState::stats, IndexVacuumInfo::strategy, BTVacState::totFreePages, UnlockRelationForExtension(), and XLogStandbyInfoActive.

Referenced by btbulkdelete(), and btvacuumcleanup().

00612 {
00613     Relation    rel = info->index;
00614     BTVacState  vstate;
00615     BlockNumber num_pages;
00616     BlockNumber blkno;
00617     bool        needLock;
00618 
00619     /*
00620      * Reset counts that will be incremented during the scan; needed in case
00621      * of multiple scans during a single VACUUM command
00622      */
00623     stats->estimated_count = false;
00624     stats->num_index_tuples = 0;
00625     stats->pages_deleted = 0;
00626 
00627     /* Set up info to pass down to btvacuumpage */
00628     vstate.info = info;
00629     vstate.stats = stats;
00630     vstate.callback = callback;
00631     vstate.callback_state = callback_state;
00632     vstate.cycleid = cycleid;
00633     vstate.lastBlockVacuumed = BTREE_METAPAGE;  /* Initialise at first block */
00634     vstate.lastUsedPage = BTREE_METAPAGE;
00635     vstate.totFreePages = 0;
00636 
00637     /* Create a temporary memory context to run _bt_pagedel in */
00638     vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
00639                                                   "_bt_pagedel",
00640                                                   ALLOCSET_DEFAULT_MINSIZE,
00641                                                   ALLOCSET_DEFAULT_INITSIZE,
00642                                                   ALLOCSET_DEFAULT_MAXSIZE);
00643 
00644     /*
00645      * The outer loop iterates over all index pages except the metapage, in
00646      * physical order (we hope the kernel will cooperate in providing
00647      * read-ahead for speed).  It is critical that we visit all leaf pages,
00648      * including ones added after we start the scan, else we might fail to
00649      * delete some deletable tuples.  Hence, we must repeatedly check the
00650      * relation length.  We must acquire the relation-extension lock while
00651      * doing so to avoid a race condition: if someone else is extending the
00652      * relation, there is a window where bufmgr/smgr have created a new
00653      * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
00654      * we manage to scan such a page here, we'll improperly assume it can be
00655      * recycled.  Taking the lock synchronizes things enough to prevent a
00656      * problem: either num_pages won't include the new page, or _bt_getbuf
00657      * already has write lock on the buffer and it will be fully initialized
00658      * before we can examine it.  (See also vacuumlazy.c, which has the same
00659      * issue.)  Also, we need not worry if a page is added immediately after
00660      * we look; the page splitting code already has write-lock on the left
00661      * page before it adds a right page, so we must already have processed any
00662      * tuples due to be moved into such a page.
00663      *
00664      * We can skip locking for new or temp relations, however, since no one
00665      * else could be accessing them.
00666      */
00667     needLock = !RELATION_IS_LOCAL(rel);
00668 
00669     blkno = BTREE_METAPAGE + 1;
00670     for (;;)
00671     {
00672         /* Get the current relation length */
00673         if (needLock)
00674             LockRelationForExtension(rel, ExclusiveLock);
00675         num_pages = RelationGetNumberOfBlocks(rel);
00676         if (needLock)
00677             UnlockRelationForExtension(rel, ExclusiveLock);
00678 
00679         /* Quit if we've scanned the whole relation */
00680         if (blkno >= num_pages)
00681             break;
00682         /* Iterate over pages, then loop back to recheck length */
00683         for (; blkno < num_pages; blkno++)
00684         {
00685             btvacuumpage(&vstate, blkno, blkno);
00686         }
00687     }
00688 
00689     /*
00690      * InHotStandby we need to scan right up to the end of the index for
00691      * correct locking, so we may need to write a WAL record for the final
00692      * block in the index if it was not vacuumed. It's possible that VACUUMing
00693      * has actually removed zeroed pages at the end of the index so we need to
00694      * take care to issue the record for last actual block and not for the
00695      * last block that was scanned. Ignore empty indexes.
00696      */
00697     if (XLogStandbyInfoActive() &&
00698         num_pages > 1 && vstate.lastBlockVacuumed < (num_pages - 1))
00699     {
00700         Buffer      buf;
00701 
00702         /*
00703          * We can't use _bt_getbuf() here because it always applies
00704          * _bt_checkpage(), which will barf on an all-zero page. We want to
00705          * recycle all-zero pages, not fail.  Also, we want to use a
00706          * nondefault buffer access strategy.
00707          */
00708         buf = ReadBufferExtended(rel, MAIN_FORKNUM, num_pages - 1, RBM_NORMAL,
00709                                  info->strategy);
00710         LockBufferForCleanup(buf);
00711         _bt_delitems(rel, buf, NULL, 0, true, vstate.lastBlockVacuumed);
00712         _bt_relbuf(rel, buf);
00713     }
00714 
00715     MemoryContextDelete(vstate.pagedelcontext);
00716 
00717     /* update statistics */
00718     stats->num_pages = num_pages;
00719     stats->pages_free = vstate.totFreePages;
00720 }


Generated on Sat Mar 13 03:04:58 2010 for PostgreSQL Source Code by  doxygen 1.5.8