PostgreSQL Source Code git master
Loading...
Searching...
No Matches
nbtdedup.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "utils/rel.h"
Include dependency graph for nbtdedup.c:

Go to the source code of this file.

Functions

static void _bt_bottomupdel_finish_pending (Page page, BTDedupState state, TM_IndexDeleteOp *delstate)
 
static bool _bt_do_singleval (Relation rel, Page page, BTDedupState state, OffsetNumber minoff, IndexTuple newitem)
 
static void _bt_singleval_fillfactor (Page page, BTDedupState state, Size newitemsz)
 
void _bt_dedup_pass (Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, bool bottomupdedup)
 
bool _bt_bottomupdel_pass (Relation rel, Buffer buf, Relation heapRel, Size newitemsz)
 
void _bt_dedup_start_pending (BTDedupState state, IndexTuple base, OffsetNumber baseoff)
 
bool _bt_dedup_save_htid (BTDedupState state, IndexTuple itup)
 
Size _bt_dedup_finish_pending (Page newpage, BTDedupState state)
 
IndexTuple _bt_form_posting (IndexTuple base, const ItemPointerData *htids, int nhtids)
 
void _bt_update_posting (BTVacuumPosting vacposting)
 
IndexTuple _bt_swap_posting (IndexTuple newitem, IndexTuple oposting, int postingoff)
 

Function Documentation

◆ _bt_bottomupdel_finish_pending()

static void _bt_bottomupdel_finish_pending ( Page  page,
BTDedupState  state,
TM_IndexDeleteOp delstate 
)
static

Definition at line 648 of file nbtdedup.c.

650{
651 bool dupinterval = (state->nitems > 1);
652
653 Assert(state->nitems > 0);
654 Assert(state->nitems <= state->nhtids);
655 Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
656
657 for (int i = 0; i < state->nitems; i++)
658 {
659 OffsetNumber offnum = state->baseoff + i;
660 ItemId itemid = PageGetItemId(page, offnum);
661 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
662 TM_IndexDelete *ideltid = &delstate->deltids[delstate->ndeltids];
663 TM_IndexStatus *istatus = &delstate->status[delstate->ndeltids];
664
665 if (!BTreeTupleIsPosting(itup))
666 {
667 /* Simple case: A plain non-pivot tuple */
668 ideltid->tid = itup->t_tid;
669 ideltid->id = delstate->ndeltids;
670 istatus->idxoffnum = offnum;
671 istatus->knowndeletable = false; /* for now */
672 istatus->promising = dupinterval; /* simple rule */
673 istatus->freespace = ItemIdGetLength(itemid) + sizeof(ItemIdData);
674
675 delstate->ndeltids++;
676 }
677 else
678 {
679 /*
680 * Complicated case: A posting list tuple.
681 *
682 * We make the conservative assumption that there can only be at
683 * most one affected logical row per posting list tuple. There
684 * will be at most one promising entry in deltids to represent
685 * this presumed lone logical row. Note that this isn't even
686 * considered unless the posting list tuple is also in an interval
687 * of duplicates -- this complicated rule is just a variant of the
688 * simple rule used to decide if plain index tuples are promising.
689 */
690 int nitem = BTreeTupleGetNPosting(itup);
691 bool firstpromising = false;
692 bool lastpromising = false;
693
695
696 if (dupinterval)
697 {
698 /*
699 * Complicated rule: either the first or last TID in the
700 * posting list gets marked promising (if any at all)
701 */
706 midtid,
707 maxtid;
708
710 midtid = BTreeTupleGetPostingN(itup, nitem / 2);
715
716 /* Only entry with predominant table block can be promising */
720 }
721
722 for (int p = 0; p < nitem; p++)
723 {
725
726 ideltid->tid = *htid;
727 ideltid->id = delstate->ndeltids;
728 istatus->idxoffnum = offnum;
729 istatus->knowndeletable = false; /* for now */
730 istatus->promising = false;
731 if ((firstpromising && p == 0) ||
732 (lastpromising && p == nitem - 1))
733 istatus->promising = true;
734 istatus->freespace = sizeof(ItemPointerData); /* at worst */
735
736 ideltid++;
737 istatus++;
738 delstate->ndeltids++;
739 }
740 }
741 }
742
743 if (dupinterval)
744 {
745 state->intervals[state->nintervals].nitems = state->nitems;
746 state->nintervals++;
747 }
748
749 /* Reset state for next interval */
750 state->nhtids = 0;
751 state->nitems = 0;
752 state->phystupsize = 0;
753}
uint32 BlockNumber
Definition block.h:31
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition bufpage.h:269
static void * PageGetItem(PageData *page, const ItemIdData *itemId)
Definition bufpage.h:379
#define Assert(condition)
Definition c.h:945
int i
Definition isn.c:77
#define ItemIdGetLength(itemId)
Definition itemid.h:59
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition itemptr.h:103
IndexTupleData * IndexTuple
Definition itup.h:53
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition nbtree.h:519
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition nbtree.h:545
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition nbtree.h:665
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition nbtree.h:493
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition nbtree.h:639
uint16 OffsetNumber
Definition off.h:24
static int fb(int x)
ItemPointerData t_tid
Definition itup.h:37

References Assert, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), fb(), i, ItemIdGetLength, ItemPointerGetBlockNumber(), PageGetItem(), PageGetItemId(), and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass().

◆ _bt_bottomupdel_pass()

bool _bt_bottomupdel_pass ( Relation  rel,
Buffer  buf,
Relation  heapRel,
Size  newitemsz 
)

Definition at line 309 of file nbtdedup.c.

311{
312 OffsetNumber offnum,
313 minoff,
314 maxoff;
315 Page page = BufferGetPage(buf);
316 BTPageOpaque opaque = BTPageGetOpaque(page);
319 bool neverdedup;
321
322 /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
323 newitemsz += sizeof(ItemIdData);
324
325 /* Initialize deduplication state */
327 state->deduplicate = true;
328 state->nmaxitems = 0;
329 state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
330 state->base = NULL;
331 state->baseoff = InvalidOffsetNumber;
332 state->basetupsize = 0;
333 state->htids = palloc(state->maxpostingsize);
334 state->nhtids = 0;
335 state->nitems = 0;
336 state->phystupsize = 0;
337 state->nintervals = 0;
338
339 /*
340 * Initialize tableam state that describes bottom-up index deletion
341 * operation.
342 *
343 * We'll go on to ask the tableam to search for TIDs whose index tuples we
344 * can safely delete. The tableam will search until our leaf page space
345 * target is satisfied, or until the cost of continuing with the tableam
346 * operation seems too high. It focuses its efforts on TIDs associated
347 * with duplicate index tuples that we mark "promising".
348 *
349 * This space target is a little arbitrary. The tableam must be able to
350 * keep the costs and benefits in balance. We provide the tableam with
351 * exhaustive information about what might work, without directly
352 * concerning ourselves with avoiding work during the tableam call. Our
353 * role in costing the bottom-up deletion process is strictly advisory.
354 */
355 delstate.irel = rel;
357 delstate.bottomup = true;
358 delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
359 delstate.ndeltids = 0;
362
363 minoff = P_FIRSTDATAKEY(opaque);
364 maxoff = PageGetMaxOffsetNumber(page);
365 for (offnum = minoff;
366 offnum <= maxoff;
367 offnum = OffsetNumberNext(offnum))
368 {
369 ItemId itemid = PageGetItemId(page, offnum);
370 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
371
372 Assert(!ItemIdIsDead(itemid));
373
374 if (offnum == minoff)
375 {
376 /* itup starts first pending interval */
377 _bt_dedup_start_pending(state, itup, offnum);
378 }
379 else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
381 {
382 /* Tuple is equal; just added its TIDs to pending interval */
383 }
384 else
385 {
386 /* Finalize interval -- move its TIDs to delete state */
388
389 /* itup starts new pending interval */
390 _bt_dedup_start_pending(state, itup, offnum);
391 }
392 }
393 /* Finalize final interval -- move its TIDs to delete state */
395
396 /*
397 * We don't give up now in the event of having few (or even zero)
398 * promising tuples for the tableam because it's not up to us as the index
399 * AM to manage costs (note that the tableam might have heuristics of its
400 * own that work out what to do). We should at least avoid having our
401 * caller do a useless deduplication pass after we return in the event of
402 * zero promising tuples, though.
403 */
404 neverdedup = false;
405 if (state->nintervals == 0)
406 neverdedup = true;
407
408 pfree(state->htids);
409 pfree(state);
410
411 /* Ask tableam which TIDs are deletable, then physically delete them */
412 _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
413
414 pfree(delstate.deltids);
415 pfree(delstate.status);
416
417 /* Report "success" to caller unconditionally to avoid deduplication */
418 if (neverdedup)
419 return true;
420
421 /* Don't dedup when we won't end up back here any time soon anyway */
422 return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
423}
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4426
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:472
Size PageGetExactFreeSpace(const PageData *page)
Definition bufpage.c:957
PageData * Page
Definition bufpage.h:81
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition bufpage.h:397
#define Max(x, y)
Definition c.h:1087
#define palloc_object(type)
Definition fe_memutils.h:74
#define palloc_array(type, count)
Definition fe_memutils.h:76
#define ItemIdIsDead(itemId)
Definition itemid.h:113
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
Definition nbtdedup.c:486
void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, OffsetNumber baseoff)
Definition nbtdedup.c:435
static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state, TM_IndexDeleteOp *delstate)
Definition nbtdedup.c:648
void _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
Definition nbtpage.c:1543
#define BTPageGetOpaque(page)
Definition nbtree.h:74
#define MaxTIDsPerBTreePage
Definition nbtree.h:186
#define P_FIRSTDATAKEY(opaque)
Definition nbtree.h:370
int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
Definition nbtutils.c:913
#define InvalidOffsetNumber
Definition off.h:26
#define OffsetNumberNext(offsetNumber)
Definition off.h:52
static char buf[DEFAULT_XLOG_SEG_SIZE]
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition rel.h:533

References _bt_bottomupdel_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_delitems_delete_check(), _bt_keep_natts_fast(), Assert, BTPageGetOpaque, buf, BufferGetBlockNumber(), BufferGetPage(), fb(), IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, ItemIdIsDead, Max, MaxTIDsPerBTreePage, OffsetNumberNext, P_FIRSTDATAKEY, PageGetExactFreeSpace(), PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), palloc(), palloc_array, palloc_object, and pfree().

Referenced by _bt_delete_or_dedup_one_page().

◆ _bt_dedup_finish_pending()

Size _bt_dedup_finish_pending ( Page  newpage,
BTDedupState  state 
)

Definition at line 557 of file nbtdedup.c.

558{
562
563 Assert(state->nitems > 0);
564 Assert(state->nitems <= state->nhtids);
565 Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
566
568 if (state->nitems == 1)
569 {
570 /* Use original, unchanged base tuple */
574 if (PageAddItem(newpage, state->base, tuplesz, tupoff, false, false) == InvalidOffsetNumber)
575 elog(ERROR, "deduplication failed to add tuple to page");
576
577 spacesaving = 0;
578 }
579 else
580 {
581 IndexTuple final;
582
583 /* Form a tuple with a posting list */
584 final = _bt_form_posting(state->base, state->htids, state->nhtids);
585 tuplesz = IndexTupleSize(final);
586 Assert(tuplesz <= state->maxpostingsize);
587
588 /* Save final number of items for posting list */
589 state->intervals[state->nintervals].nitems = state->nitems;
590
593 if (PageAddItem(newpage, final, tuplesz, tupoff, false, false) == InvalidOffsetNumber)
594 elog(ERROR, "deduplication failed to add tuple to page");
595
596 pfree(final);
597 spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
598 /* Increment nintervals, since we wrote a new posting list tuple */
599 state->nintervals++;
601 }
602
603 /* Reset state for next pending posting list */
604 state->nhtids = 0;
605 state->nitems = 0;
606 state->phystupsize = 0;
607
608 return spacesaving;
609}
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition bufpage.h:504
#define MAXALIGN(LEN)
Definition c.h:898
size_t Size
Definition c.h:691
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
static Size IndexTupleSize(const IndexTupleData *itup)
Definition itup.h:71
IndexTuple _bt_form_posting(IndexTuple base, const ItemPointerData *htids, int nhtids)
Definition nbtdedup.c:864
#define BTMaxItemSize
Definition nbtree.h:165

References _bt_form_posting(), Assert, BTMaxItemSize, elog, ERROR, fb(), IndexTupleSize(), InvalidOffsetNumber, MAXALIGN, OffsetNumberNext, PageAddItem, PageGetMaxOffsetNumber(), and pfree().

Referenced by _bt_dedup_pass(), and btree_xlog_dedup().

◆ _bt_dedup_pass()

void _bt_dedup_pass ( Relation  rel,
Buffer  buf,
IndexTuple  newitem,
Size  newitemsz,
bool  bottomupdedup 
)

Definition at line 59 of file nbtdedup.c.

61{
62 OffsetNumber offnum,
63 minoff,
64 maxoff;
65 Page page = BufferGetPage(buf);
66 BTPageOpaque opaque = BTPageGetOpaque(page);
70 bool singlevalstrat = false;
73
74 /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
75 newitemsz += sizeof(ItemIdData);
76
77 /*
78 * Initialize deduplication state.
79 *
80 * It would be possible for maxpostingsize (limit on posting list tuple
81 * size) to be set to one third of the page. However, it seems like a
82 * good idea to limit the size of posting lists to one sixth of a page.
83 * That ought to leave us with a good split point when pages full of
84 * duplicates can be split several times.
85 */
87 state->deduplicate = true;
88 state->nmaxitems = 0;
89 state->maxpostingsize = Min(BTMaxItemSize / 2, INDEX_SIZE_MASK);
90 /* Metadata about base tuple of current pending posting list */
91 state->base = NULL;
92 state->baseoff = InvalidOffsetNumber;
93 state->basetupsize = 0;
94 /* Metadata about current pending posting list TIDs */
95 state->htids = palloc(state->maxpostingsize);
96 state->nhtids = 0;
97 state->nitems = 0;
98 /* Size of all physical tuples to be replaced by pending posting list */
99 state->phystupsize = 0;
100 /* nintervals should be initialized to zero */
101 state->nintervals = 0;
102
103 minoff = P_FIRSTDATAKEY(opaque);
104 maxoff = PageGetMaxOffsetNumber(page);
105
106 /*
107 * Consider applying "single value" strategy, though only if the page
108 * seems likely to be split in the near future
109 */
110 if (!bottomupdedup)
111 singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
112
113 /*
114 * Deduplicate items from page, and write them to newpage.
115 *
116 * Copy the original page's LSN into newpage copy. This will become the
117 * updated version of the page. We need this because XLogInsert will
118 * examine the LSN and possibly dump it in a page image.
119 */
122
123 /* Copy high key, if any */
124 if (!P_RIGHTMOST(opaque))
125 {
129
131 elog(ERROR, "deduplication failed to add highkey");
132 }
133
134 for (offnum = minoff;
135 offnum <= maxoff;
136 offnum = OffsetNumberNext(offnum))
137 {
138 ItemId itemid = PageGetItemId(page, offnum);
139 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
140
141 Assert(!ItemIdIsDead(itemid));
142
143 if (offnum == minoff)
144 {
145 /*
146 * No previous/base tuple for the data item -- use the data item
147 * as base tuple of pending posting list
148 */
149 _bt_dedup_start_pending(state, itup, offnum);
150 }
151 else if (state->deduplicate &&
152 _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
154 {
155 /*
156 * Tuple is equal to base tuple of pending posting list. Heap
157 * TID(s) for itup have been saved in state.
158 */
159 }
160 else
161 {
162 /*
163 * Tuple is not equal to pending posting list tuple, or
164 * _bt_dedup_save_htid() opted to not merge current item into
165 * pending posting list for some other reason (e.g., adding more
166 * TIDs would have caused posting list to exceed current
167 * maxpostingsize).
168 *
169 * If state contains pending posting list with more than one item,
170 * form new posting tuple and add it to our temp page (newpage).
171 * Else add pending interval's base tuple to the temp page as-is.
172 */
174
175 if (singlevalstrat)
176 {
177 /*
178 * Single value strategy's extra steps.
179 *
180 * Lower maxpostingsize for sixth and final large posting list
181 * tuple at the point where 5 maxpostingsize-capped tuples
182 * have either been formed or observed.
183 *
184 * When a sixth maxpostingsize-capped item is formed/observed,
185 * stop merging together tuples altogether. The few tuples
186 * that remain at the end of the page won't be merged together
187 * at all (at least not until after a future page split takes
188 * place, when this page's newly allocated right sibling page
189 * gets its first deduplication pass).
190 */
191 if (state->nmaxitems == 5)
192 _bt_singleval_fillfactor(page, state, newitemsz);
193 else if (state->nmaxitems == 6)
194 {
195 state->deduplicate = false;
196 singlevalstrat = false; /* won't be back here */
197 }
198 }
199
200 /* itup starts new pending posting list */
201 _bt_dedup_start_pending(state, itup, offnum);
202 }
203 }
204
205 /* Handle the last item */
207
208 /*
209 * If no items suitable for deduplication were found, newpage must be
210 * exactly the same as the original page, so just return from function.
211 *
212 * We could determine whether or not to proceed on the basis the space
213 * savings being sufficient to avoid an immediate page split instead. We
214 * don't do that because there is some small value in nbtsplitloc.c always
215 * operating against a page that is fully deduplicated (apart from
216 * newitem). Besides, most of the cost has already been paid.
217 */
218 if (state->nintervals == 0)
219 {
220 /* cannot leak memory here */
221 pfree(newpage);
222 pfree(state->htids);
223 pfree(state);
224 return;
225 }
226
227 /*
228 * By here, it's clear that deduplication will definitely go ahead.
229 *
230 * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace
231 * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
232 * But keep things tidy.
233 */
234 if (P_HAS_GARBAGE(opaque))
235 {
237
238 nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
239 }
240
242
245
246 /* XLOG stuff */
247 if (RelationNeedsWAL(rel))
248 {
250
251 xlrec_dedup.nintervals = state->nintervals;
252
256
257 /*
258 * The intervals array is not in the buffer, but pretend that it is.
259 * When XLogInsert stores the whole buffer, the array need not be
260 * stored too.
261 */
262 XLogRegisterBufData(0, state->intervals,
263 state->nintervals * sizeof(BTDedupInterval));
264
266 }
267 else
268 recptr = XLogGetFakeLSN(rel);
269
270 PageSetLSN(page, recptr);
271
273
274 /* Local space accounting should agree with page accounting */
275 Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
276
277 /* cannot leak memory here */
278 pfree(state->htids);
279 pfree(state);
280}
void MarkBufferDirty(Buffer buffer)
Definition bufmgr.c:3132
void PageRestoreTempPage(Page tempPage, Page oldPage)
Definition bufpage.c:423
Page PageGetTempPageCopySpecial(const PageData *page)
Definition bufpage.c:401
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition bufpage.h:417
static XLogRecPtr PageGetLSN(const PageData *page)
Definition bufpage.h:411
#define Min(x, y)
Definition c.h:1093
#define PG_USED_FOR_ASSERTS_ONLY
Definition c.h:243
#define INDEX_SIZE_MASK
Definition itup.h:65
#define START_CRIT_SECTION()
Definition miscadmin.h:150
#define END_CRIT_SECTION()
Definition miscadmin.h:152
static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state, OffsetNumber minoff, IndexTuple newitem)
Definition nbtdedup.c:782
Size _bt_dedup_finish_pending(Page newpage, BTDedupState state)
Definition nbtdedup.c:557
static void _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
Definition nbtdedup.c:822
#define P_HIKEY
Definition nbtree.h:368
#define P_HAS_GARBAGE(opaque)
Definition nbtree.h:227
#define P_RIGHTMOST(opaque)
Definition nbtree.h:220
#define XLOG_BTREE_DEDUP
Definition nbtxlog.h:33
#define SizeOfBtreeDedup
Definition nbtxlog.h:174
#define RelationNeedsWAL(relation)
Definition rel.h:637
uint16 nintervals
Definition nbtxlog.h:169
uint64 XLogRecPtr
Definition xlogdefs.h:21
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition xloginsert.c:482
void XLogRegisterBufData(uint8 block_id, const void *data, uint32 len)
Definition xloginsert.c:413
void XLogRegisterData(const void *data, uint32 len)
Definition xloginsert.c:372
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition xloginsert.c:246
void XLogBeginInsert(void)
Definition xloginsert.c:153
XLogRecPtr XLogGetFakeLSN(Relation rel)
Definition xloginsert.c:562
#define REGBUF_STANDARD
Definition xloginsert.h:35

References _bt_dedup_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_do_singleval(), _bt_keep_natts_fast(), _bt_singleval_fillfactor(), Assert, BTMaxItemSize, BTPageGetOpaque, buf, BufferGetPage(), elog, END_CRIT_SECTION, ERROR, fb(), INDEX_SIZE_MASK, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, ItemIdGetLength, ItemIdIsDead, MarkBufferDirty(), Min, xl_btree_dedup::nintervals, OffsetNumberNext, P_FIRSTDATAKEY, P_HAS_GARBAGE, P_HIKEY, P_RIGHTMOST, PageAddItem, PageGetExactFreeSpace(), PageGetItem(), PageGetItemId(), PageGetLSN(), PageGetMaxOffsetNumber(), PageGetTempPageCopySpecial(), PageRestoreTempPage(), PageSetLSN(), palloc(), palloc_object, pfree(), PG_USED_FOR_ASSERTS_ONLY, REGBUF_STANDARD, RelationNeedsWAL, SizeOfBtreeDedup, START_CRIT_SECTION, XLOG_BTREE_DEDUP, XLogBeginInsert(), XLogGetFakeLSN(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_delete_or_dedup_one_page().

◆ _bt_dedup_save_htid()

bool _bt_dedup_save_htid ( BTDedupState  state,
IndexTuple  itup 
)

Definition at line 486 of file nbtdedup.c.

487{
488 int nhtids;
489 ItemPointer htids;
491
493
494 if (!BTreeTupleIsPosting(itup))
495 {
496 nhtids = 1;
497 htids = &itup->t_tid;
498 }
499 else
500 {
501 nhtids = BTreeTupleGetNPosting(itup);
502 htids = BTreeTupleGetPosting(itup);
503 }
504
505 /*
506 * Don't append (have caller finish pending posting list as-is) if
507 * appending heap TID(s) from itup would put us over maxpostingsize limit.
508 *
509 * This calculation needs to match the code used within _bt_form_posting()
510 * for new posting list tuples.
511 */
512 mergedtupsz = MAXALIGN(state->basetupsize +
513 (state->nhtids + nhtids) * sizeof(ItemPointerData));
514
515 if (mergedtupsz > state->maxpostingsize)
516 {
517 /*
518 * Count this as an oversized item for single value strategy, though
519 * only when there are 50 TIDs in the final posting list tuple. This
520 * limit (which is fairly arbitrary) avoids confusion about how many
521 * 1/6 of a page tuples have been encountered/created by the current
522 * deduplication pass.
523 *
524 * Note: We deliberately don't consider which deduplication pass
525 * merged together tuples to create this item (could be a previous
526 * deduplication pass, or current pass). See _bt_do_singleval()
527 * comments.
528 */
529 if (state->nhtids > 50)
530 state->nmaxitems++;
531
532 return false;
533 }
534
535 /*
536 * Save heap TIDs to pending posting list tuple -- itup can be merged into
537 * pending posting list
538 */
539 state->nitems++;
540 memcpy(state->htids + state->nhtids, htids,
541 sizeof(ItemPointerData) * nhtids);
542 state->nhtids += nhtids;
543 state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
544
545 return true;
546}
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition nbtree.h:481
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition nbtree.h:538

References Assert, BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), fb(), IndexTupleSize(), MAXALIGN, and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_load(), and btree_xlog_dedup().

◆ _bt_dedup_start_pending()

void _bt_dedup_start_pending ( BTDedupState  state,
IndexTuple  base,
OffsetNumber  baseoff 
)

Definition at line 435 of file nbtdedup.c.

437{
438 Assert(state->nhtids == 0);
439 Assert(state->nitems == 0);
441
442 /*
443 * Copy heap TID(s) from new base tuple for new candidate posting list
444 * into working state's array
445 */
446 if (!BTreeTupleIsPosting(base))
447 {
448 memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
449 state->nhtids = 1;
450 state->basetupsize = IndexTupleSize(base);
451 }
452 else
453 {
454 int nposting;
455
457 memcpy(state->htids, BTreeTupleGetPosting(base),
458 sizeof(ItemPointerData) * nposting);
459 state->nhtids = nposting;
460 /* basetupsize should not include existing posting list */
461 state->basetupsize = BTreeTupleGetPostingOffset(base);
462 }
463
464 /*
465 * Save new base tuple itself -- it'll be needed if we actually create a
466 * new posting list from new pending posting list.
467 *
468 * Must maintain physical size of all existing tuples (including line
469 * pointer overhead) so that we can calculate space savings on page.
470 */
471 state->nitems = 1;
472 state->base = base;
473 state->baseoff = baseoff;
474 state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
475 /* Also save baseoff in pending state for interval */
476 state->intervals[state->nintervals].baseoff = state->baseoff;
477}
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition nbtree.h:530

References Assert, BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), fb(), IndexTupleSize(), MAXALIGN, and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_load(), and btree_xlog_dedup().

◆ _bt_do_singleval()

static bool _bt_do_singleval ( Relation  rel,
Page  page,
BTDedupState  state,
OffsetNumber  minoff,
IndexTuple  newitem 
)
static

Definition at line 782 of file nbtdedup.c.

784{
786 ItemId itemid;
787 IndexTuple itup;
788
789 itemid = PageGetItemId(page, minoff);
790 itup = (IndexTuple) PageGetItem(page, itemid);
791
792 if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
793 {
794 itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
795 itup = (IndexTuple) PageGetItem(page, itemid);
796
797 if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
798 return true;
799 }
800
801 return false;
802}

References _bt_keep_natts_fast(), fb(), IndexRelationGetNumberOfKeyAttributes, PageGetItem(), PageGetItemId(), and PageGetMaxOffsetNumber().

Referenced by _bt_dedup_pass().

◆ _bt_form_posting()

IndexTuple _bt_form_posting ( IndexTuple  base,
const ItemPointerData htids,
int  nhtids 
)

Definition at line 864 of file nbtdedup.c.

865{
866 uint32 keysize,
867 newsize;
868 IndexTuple itup;
869
870 if (BTreeTupleIsPosting(base))
871 keysize = BTreeTupleGetPostingOffset(base);
872 else
873 keysize = IndexTupleSize(base);
874
876 Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
877 Assert(keysize == MAXALIGN(keysize));
878
879 /* Determine final size of new tuple */
880 if (nhtids > 1)
881 newsize = MAXALIGN(keysize +
882 nhtids * sizeof(ItemPointerData));
883 else
884 newsize = keysize;
885
888
889 /* Allocate memory using palloc0() (matches index_form_tuple()) */
890 itup = palloc0(newsize);
891 memcpy(itup, base, keysize);
892 itup->t_info &= ~INDEX_SIZE_MASK;
893 itup->t_info |= newsize;
894 if (nhtids > 1)
895 {
896 /* Form posting list tuple */
897 BTreeTupleSetPosting(itup, nhtids, keysize);
898 memcpy(BTreeTupleGetPosting(itup), htids,
899 sizeof(ItemPointerData) * nhtids);
901 }
902 else
903 {
904 /* Form standard non-pivot tuple */
906 ItemPointerCopy(htids, &itup->t_tid);
908 }
909
910 return itup;
911}
uint32_t uint32
Definition c.h:618
#define PG_UINT16_MAX
Definition c.h:673
static void ItemPointerCopy(const ItemPointerData *fromPointer, ItemPointerData *toPointer)
Definition itemptr.h:172
static bool ItemPointerIsValid(const ItemPointerData *pointer)
Definition itemptr.h:83
void * palloc0(Size size)
Definition mcxt.c:1417
static void BTreeTupleSetPosting(IndexTuple itup, uint16 nhtids, int postingoffset)
Definition nbtree.h:505
unsigned short t_info
Definition itup.h:49

References Assert, BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTreeTupleSetPosting(), fb(), INDEX_SIZE_MASK, IndexTupleSize(), ItemPointerCopy(), ItemPointerIsValid(), MAXALIGN, palloc0(), PG_UINT16_MAX, IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_dedup_finish_pending(), _bt_sort_dedup_finish_pending(), and bt_posting_plain_tuple().

◆ _bt_singleval_fillfactor()

static void _bt_singleval_fillfactor ( Page  page,
BTDedupState  state,
Size  newitemsz 
)
static

Definition at line 822 of file nbtdedup.c.

823{
824 Size leftfree;
825 int reduction;
826
827 /* This calculation needs to match nbtsplitloc.c */
828 leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
829 MAXALIGN(sizeof(BTPageOpaqueData));
830 /* Subtract size of new high key (includes pivot heap TID space) */
831 leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
832
833 /*
834 * Reduce maxpostingsize by an amount equal to target free space on left
835 * half of page
836 */
837 reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
838 if (state->maxpostingsize > reduction)
839 state->maxpostingsize -= reduction;
840 else
841 state->maxpostingsize = 0;
842}
static Size PageGetPageSize(const PageData *page)
Definition bufpage.h:302
#define SizeOfPageHeaderData
Definition bufpage.h:242
#define BTREE_SINGLEVAL_FILLFACTOR
Definition nbtree.h:203

References BTREE_SINGLEVAL_FILLFACTOR, fb(), MAXALIGN, PageGetPageSize(), and SizeOfPageHeaderData.

Referenced by _bt_dedup_pass().

◆ _bt_swap_posting()

IndexTuple _bt_swap_posting ( IndexTuple  newitem,
IndexTuple  oposting,
int  postingoff 
)

Definition at line 1022 of file nbtdedup.c.

1023{
1024 int nhtids;
1025 char *replacepos;
1026 char *replaceposright;
1029
1032
1033 /*
1034 * The postingoff argument originated as a _bt_binsrch_posting() return
1035 * value. It will be 0 in the event of corruption that makes a leaf page
1036 * contain a non-pivot tuple that's somehow identical to newitem (no two
1037 * non-pivot tuples should ever have the same TID). This has been known
1038 * to happen in the field from time to time.
1039 *
1040 * Perform a basic sanity check to catch this case now.
1041 */
1042 if (!(postingoff > 0 && postingoff < nhtids))
1043 elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
1044 nhtids, postingoff);
1045
1046 /*
1047 * Move item pointers in posting list to make a gap for the new item's
1048 * heap TID. We shift TIDs one place to the right, losing original
1049 * rightmost TID. (nmovebytes must not include TIDs to the left of
1050 * postingoff, nor the existing rightmost/max TID that gets overwritten.)
1051 */
1053 replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
1054 replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
1055 nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
1057
1058 /* Fill the gap at postingoff with TID of new item (original new TID) */
1059 Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
1061
1062 /* Now copy oposting's rightmost/max TID into new item (final new TID) */
1064
1066 BTreeTupleGetHeapTID(newitem)) < 0);
1068
1069 return nposting;
1070}
IndexTuple CopyIndexTuple(IndexTuple source)
Definition indextuple.c:479
int32 ItemPointerCompare(const ItemPointerData *arg1, const ItemPointerData *arg2)
Definition itemptr.c:51

References Assert, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), CopyIndexTuple(), elog, ERROR, fb(), ItemPointerCompare(), ItemPointerCopy(), and IndexTupleData::t_tid.

Referenced by _bt_insertonpg(), btree_xlog_insert(), and btree_xlog_split().

◆ _bt_update_posting()

void _bt_update_posting ( BTVacuumPosting  vacposting)

Definition at line 924 of file nbtdedup.c.

925{
927 uint32 keysize,
928 newsize;
929 IndexTuple itup;
930 int nhtids;
931 int ui,
932 d;
933 ItemPointer htids;
934
935 nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
936
938 Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
939
940 /*
941 * Determine final size of new tuple.
942 *
943 * This calculation needs to match the code used within _bt_form_posting()
944 * for new posting list tuples. We avoid calling _bt_form_posting() here
945 * to save ourselves a second memory allocation for a htids workspace.
946 */
948 if (nhtids > 1)
949 newsize = MAXALIGN(keysize +
950 nhtids * sizeof(ItemPointerData));
951 else
952 newsize = keysize;
953
956
957 /* Allocate memory using palloc0() (matches index_form_tuple()) */
958 itup = palloc0(newsize);
959 memcpy(itup, origtuple, keysize);
960 itup->t_info &= ~INDEX_SIZE_MASK;
961 itup->t_info |= newsize;
962
963 if (nhtids > 1)
964 {
965 /* Form posting list tuple */
966 BTreeTupleSetPosting(itup, nhtids, keysize);
967 htids = BTreeTupleGetPosting(itup);
968 }
969 else
970 {
971 /* Form standard non-pivot tuple */
973 htids = &itup->t_tid;
974 }
975
976 ui = 0;
977 d = 0;
978 for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
979 {
980 if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
981 {
982 d++;
983 continue;
984 }
985 htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
986 }
987 Assert(ui == nhtids);
988 Assert(d == vacposting->ndeletedtids);
989 Assert(nhtids == 1 || _bt_posting_valid(itup));
990 Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));
991
992 /* vacposting arg's itup will now point to updated version */
993 vacposting->itup = itup;
994}

References Assert, BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleGetPostingN(), BTreeTupleGetPostingOffset(), BTreeTupleSetPosting(), fb(), i, INDEX_SIZE_MASK, ItemPointerIsValid(), MAXALIGN, palloc0(), IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_delitems_update(), and btree_xlog_updates().