PostgreSQL Source Code  git master
nbtdedup.c File Reference
#include "postgres.h"
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "utils/rel.h"
Include dependency graph for nbtdedup.c:

Go to the source code of this file.

Functions

static void _bt_bottomupdel_finish_pending (Page page, BTDedupState state, TM_IndexDeleteOp *delstate)
 
static bool _bt_do_singleval (Relation rel, Page page, BTDedupState state, OffsetNumber minoff, IndexTuple newitem)
 
static void _bt_singleval_fillfactor (Page page, BTDedupState state, Size newitemsz)
 
void _bt_dedup_pass (Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem, Size newitemsz, bool bottomupdedup)
 
bool _bt_bottomupdel_pass (Relation rel, Buffer buf, Relation heapRel, Size newitemsz)
 
void _bt_dedup_start_pending (BTDedupState state, IndexTuple base, OffsetNumber baseoff)
 
bool _bt_dedup_save_htid (BTDedupState state, IndexTuple itup)
 
Size _bt_dedup_finish_pending (Page newpage, BTDedupState state)
 
IndexTuple _bt_form_posting (IndexTuple base, ItemPointer htids, int nhtids)
 
void _bt_update_posting (BTVacuumPosting vacposting)
 
IndexTuple _bt_swap_posting (IndexTuple newitem, IndexTuple oposting, int postingoff)
 

Function Documentation

◆ _bt_bottomupdel_finish_pending()

static void _bt_bottomupdel_finish_pending ( Page  page,
BTDedupState  state,
TM_IndexDeleteOp *  delstate 
)
static

Definition at line 644 of file nbtdedup.c.

646 {
647  bool dupinterval = (state->nitems > 1);
648 
649  Assert(state->nitems > 0);
650  Assert(state->nitems <= state->nhtids);
651  Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
652 
653  for (int i = 0; i < state->nitems; i++)
654  {
655  OffsetNumber offnum = state->baseoff + i;
656  ItemId itemid = PageGetItemId(page, offnum);
657  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
658  TM_IndexDelete *ideltid = &delstate->deltids[delstate->ndeltids];
659  TM_IndexStatus *istatus = &delstate->status[delstate->ndeltids];
660 
661  if (!BTreeTupleIsPosting(itup))
662  {
663  /* Simple case: A plain non-pivot tuple */
664  ideltid->tid = itup->t_tid;
665  ideltid->id = delstate->ndeltids;
666  istatus->idxoffnum = offnum;
667  istatus->knowndeletable = false; /* for now */
668  istatus->promising = dupinterval; /* simple rule */
669  istatus->freespace = ItemIdGetLength(itemid) + sizeof(ItemIdData);
670 
671  delstate->ndeltids++;
672  }
673  else
674  {
675  /*
676  * Complicated case: A posting list tuple.
677  *
678  * We make the conservative assumption that there can only be at
679  * most one affected logical row per posting list tuple. There
680  * will be at most one promising entry in deltids to represent
681  * this presumed lone logical row. Note that this isn't even
682  * considered unless the posting list tuple is also in an interval
683  * of duplicates -- this complicated rule is just a variant of the
684  * simple rule used to decide if plain index tuples are promising.
685  */
686  int nitem = BTreeTupleGetNPosting(itup);
687  bool firstpromising = false;
688  bool lastpromising = false;
689 
690  Assert(_bt_posting_valid(itup));
691 
692  if (dupinterval)
693  {
694  /*
695  * Complicated rule: either the first or last TID in the
696  * posting list gets marked promising (if any at all)
697  */
698  BlockNumber minblocklist,
699  midblocklist,
700  maxblocklist;
701  ItemPointer mintid,
702  midtid,
703  maxtid;
704 
705  mintid = BTreeTupleGetHeapTID(itup);
706  midtid = BTreeTupleGetPostingN(itup, nitem / 2);
707  maxtid = BTreeTupleGetMaxHeapTID(itup);
708  minblocklist = ItemPointerGetBlockNumber(mintid);
709  midblocklist = ItemPointerGetBlockNumber(midtid);
710  maxblocklist = ItemPointerGetBlockNumber(maxtid);
711 
712  /* Only entry with predominant table block can be promising */
713  firstpromising = (minblocklist == midblocklist);
714  lastpromising = (!firstpromising &&
715  midblocklist == maxblocklist);
716  }
717 
718  for (int p = 0; p < nitem; p++)
719  {
720  ItemPointer htid = BTreeTupleGetPostingN(itup, p);
721 
722  ideltid->tid = *htid;
723  ideltid->id = delstate->ndeltids;
724  istatus->idxoffnum = offnum;
725  istatus->knowndeletable = false; /* for now */
726  istatus->promising = false;
727  if ((firstpromising && p == 0) ||
728  (lastpromising && p == nitem - 1))
729  istatus->promising = true;
730  istatus->freespace = sizeof(ItemPointerData); /* at worst */
731 
732  ideltid++;
733  istatus++;
734  delstate->ndeltids++;
735  }
736  }
737  }
738 
739  if (dupinterval)
740  {
741  state->intervals[state->nintervals].nitems = state->nitems;
742  state->nintervals++;
743  }
744 
745  /* Reset state for next interval */
746  state->nhtids = 0;
747  state->nitems = 0;
748  state->phystupsize = 0;
749 }
uint32 BlockNumber
Definition: block.h:31
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:234
#define PageGetItem(page, itemId)
Definition: bufpage.h:339
int i
Definition: isn.c:73
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
struct ItemIdData ItemIdData
#define ItemPointerGetBlockNumber(pointer)
Definition: itemptr.h:98
struct ItemPointerData ItemPointerData
IndexTupleData * IndexTuple
Definition: itup.h:53
Assert(fmt[strlen(fmt) - 1] !='\n')
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:513
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:539
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:659
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:487
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:633
uint16 OffsetNumber
Definition: off.h:24
ItemPointerData t_tid
Definition: itup.h:37
TM_IndexStatus * status
Definition: tableam.h:231
TM_IndexDelete * deltids
Definition: tableam.h:230
ItemPointerData tid
Definition: tableam.h:189
bool knowndeletable
Definition: tableam.h:196
bool promising
Definition: tableam.h:199
int16 freespace
Definition: tableam.h:200
OffsetNumber idxoffnum
Definition: tableam.h:195

References Assert(), BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), TM_IndexDeleteOp::deltids, TM_IndexStatus::freespace, i, TM_IndexDelete::id, TM_IndexStatus::idxoffnum, ItemIdGetLength, ItemPointerGetBlockNumber, TM_IndexStatus::knowndeletable, TM_IndexDeleteOp::ndeltids, PageGetItem, PageGetItemId, TM_IndexStatus::promising, TM_IndexDeleteOp::status, IndexTupleData::t_tid, and TM_IndexDelete::tid.

Referenced by _bt_bottomupdel_pass().
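
The promising-TID rule for posting list tuples reduces to comparing three table block numbers. A minimal standalone sketch of that rule (the function name and BlockNo type are illustrative, not part of the nbtree API):

    #include <stdbool.h>
    #include <stdint.h>

    typedef uint32_t BlockNo;   /* stands in for BlockNumber */

    /*
     * Mirror of the rule above: only a TID pointing to the predominant
     * table block can be promising, and at most one of the posting
     * list's first/last TIDs is chosen.
     */
    static void
    choose_promising(BlockNo minblk, BlockNo midblk, BlockNo maxblk,
                     bool *firstpromising, bool *lastpromising)
    {
        *firstpromising = (minblk == midblk);
        *lastpromising = (!*firstpromising && midblk == maxblk);
    }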

◆ _bt_bottomupdel_pass()

bool _bt_bottomupdel_pass ( Relation  rel,
Buffer  buf,
Relation  heapRel,
Size  newitemsz 
)

Definition at line 306 of file nbtdedup.c.

308 {
309  OffsetNumber offnum,
310  minoff,
311  maxoff;
312  Page page = BufferGetPage(buf);
313  BTPageOpaque opaque = BTPageGetOpaque(page);
314  BTDedupState state;
315  TM_IndexDeleteOp delstate;
316  bool neverdedup;
317  int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
318 
319  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
320  newitemsz += sizeof(ItemIdData);
321 
322  /* Initialize deduplication state */
323  state = (BTDedupState) palloc(sizeof(BTDedupStateData));
324  state->deduplicate = true;
325  state->nmaxitems = 0;
326  state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
327  state->base = NULL;
328  state->baseoff = InvalidOffsetNumber;
329  state->basetupsize = 0;
330  state->htids = palloc(state->maxpostingsize);
331  state->nhtids = 0;
332  state->nitems = 0;
333  state->phystupsize = 0;
334  state->nintervals = 0;
335 
336  /*
337  * Initialize tableam state that describes bottom-up index deletion
338  * operation.
339  *
340  * We'll go on to ask the tableam to search for TIDs whose index tuples we
341  * can safely delete. The tableam will search until our leaf page space
342  * target is satisfied, or until the cost of continuing with the tableam
343  * operation seems too high. It focuses its efforts on TIDs associated
344  * with duplicate index tuples that we mark "promising".
345  *
346  * This space target is a little arbitrary. The tableam must be able to
347  * keep the costs and benefits in balance. We provide the tableam with
348  * exhaustive information about what might work, without directly
349  * concerning ourselves with avoiding work during the tableam call. Our
350  * role in costing the bottom-up deletion process is strictly advisory.
351  */
352  delstate.irel = rel;
353  delstate.iblknum = BufferGetBlockNumber(buf);
354  delstate.bottomup = true;
355  delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
356  delstate.ndeltids = 0;
357  delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
358  delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
359 
360  minoff = P_FIRSTDATAKEY(opaque);
361  maxoff = PageGetMaxOffsetNumber(page);
362  for (offnum = minoff;
363  offnum <= maxoff;
364  offnum = OffsetNumberNext(offnum))
365  {
366  ItemId itemid = PageGetItemId(page, offnum);
367  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
368 
369  Assert(!ItemIdIsDead(itemid));
370 
371  if (offnum == minoff)
372  {
373  /* itup starts first pending interval */
374  _bt_dedup_start_pending(state, itup, offnum);
375  }
376  else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
377  _bt_dedup_save_htid(state, itup))
378  {
379  /* Tuple is equal; just added its TIDs to pending interval */
380  }
381  else
382  {
383  /* Finalize interval -- move its TIDs to delete state */
384  _bt_bottomupdel_finish_pending(page, state, &delstate);
385 
386  /* itup starts new pending interval */
387  _bt_dedup_start_pending(state, itup, offnum);
388  }
389  }
390  /* Finalize final interval -- move its TIDs to delete state */
391  _bt_bottomupdel_finish_pending(page, state, &delstate);
392 
393  /*
394  * We don't give up now in the event of having few (or even zero)
395  * promising tuples for the tableam because it's not up to us as the index
396  * AM to manage costs (note that the tableam might have heuristics of its
397  * own that work out what to do). We should at least avoid having our
398  * caller do a useless deduplication pass after we return in the event of
399  * zero promising tuples, though.
400  */
401  neverdedup = false;
402  if (state->nintervals == 0)
403  neverdedup = true;
404 
405  pfree(state->htids);
406  pfree(state);
407 
408  /* Ask tableam which TIDs are deletable, then physically delete them */
409  _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
410 
411  pfree(delstate.deltids);
412  pfree(delstate.status);
413 
414  /* Report "success" to caller unconditionally to avoid deduplication */
415  if (neverdedup)
416  return true;
417 
418  /* Don't dedup when we won't end up back here any time soon anyway */
419  return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
420 }
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2755
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
Size PageGetExactFreeSpace(Page page)
Definition: bufpage.c:958
Pointer Page
Definition: bufpage.h:78
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:356
#define Max(x, y)
Definition: c.h:980
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
void pfree(void *pointer)
Definition: mcxt.c:1175
void * palloc(Size size)
Definition: mcxt.c:1068
bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
Definition: nbtdedup.c:483
void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, OffsetNumber baseoff)
Definition: nbtdedup.c:432
static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state, TM_IndexDeleteOp *delstate)
Definition: nbtdedup.c:644
void _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)
Definition: nbtpage.c:1528
#define BTPageGetOpaque(page)
Definition: nbtree.h:73
#define MaxTIDsPerBTreePage
Definition: nbtree.h:186
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:371
BTDedupStateData * BTDedupState
Definition: nbtree.h:893
int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
Definition: nbtutils.c:2424
#define InvalidOffsetNumber
Definition: off.h:26
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
static char * buf
Definition: pg_test_fsync.c:67
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:508
int bottomupfreespace
Definition: tableam.h:226
Relation irel
Definition: tableam.h:223
BlockNumber iblknum
Definition: tableam.h:224

References _bt_bottomupdel_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_delitems_delete_check(), _bt_keep_natts_fast(), Assert(), TM_IndexDeleteOp::bottomup, TM_IndexDeleteOp::bottomupfreespace, BTPageGetOpaque, buf, BufferGetBlockNumber(), BufferGetPage, TM_IndexDeleteOp::deltids, TM_IndexDeleteOp::iblknum, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, TM_IndexDeleteOp::irel, ItemIdIsDead, Max, MaxTIDsPerBTreePage, TM_IndexDeleteOp::ndeltids, OffsetNumberNext, P_FIRSTDATAKEY, PageGetExactFreeSpace(), PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, palloc(), pfree(), and TM_IndexDeleteOp::status.

Referenced by _bt_delete_or_dedup_one_page().
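
As a hedged sketch of the caller's side (the real logic lives in _bt_delete_or_dedup_one_page() in nbtinsert.c; variable names here are illustrative): a true return value means the caller can skip deduplication entirely, either because enough space was freed or because a dedup pass would be useless.

    /* Illustrative caller logic only -- not verbatim nbtinsert.c code */
    if (_bt_bottomupdel_pass(rel, buffer, heapRel, itemsz))
        return;                 /* freed enough space, or dedup is futile */

    /* Otherwise fall back on an ordinary deduplication pass */
    _bt_dedup_pass(rel, buffer, heapRel, insertstate->itup, itemsz, true);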

◆ _bt_dedup_finish_pending()

Size _bt_dedup_finish_pending ( Page  newpage,
BTDedupState  state 
)

Definition at line 554 of file nbtdedup.c.

555 {
556  OffsetNumber tupoff;
557  Size tuplesz;
558  Size spacesaving;
559 
560  Assert(state->nitems > 0);
561  Assert(state->nitems <= state->nhtids);
562  Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
563 
564  tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
565  if (state->nitems == 1)
566  {
567  /* Use original, unchanged base tuple */
568  tuplesz = IndexTupleSize(state->base);
569  if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
570  false, false) == InvalidOffsetNumber)
571  elog(ERROR, "deduplication failed to add tuple to page");
572 
573  spacesaving = 0;
574  }
575  else
576  {
577  IndexTuple final;
578 
579  /* Form a tuple with a posting list */
580  final = _bt_form_posting(state->base, state->htids, state->nhtids);
581  tuplesz = IndexTupleSize(final);
582  Assert(tuplesz <= state->maxpostingsize);
583 
584  /* Save final number of items for posting list */
585  state->intervals[state->nintervals].nitems = state->nitems;
586 
587  Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
588  if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
589  false) == InvalidOffsetNumber)
590  elog(ERROR, "deduplication failed to add tuple to page");
591 
592  pfree(final);
593  spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
594  /* Increment nintervals, since we wrote a new posting list tuple */
595  state->nintervals++;
596  Assert(spacesaving > 0 && spacesaving < BLCKSZ);
597  }
598 
599  /* Reset state for next pending posting list */
600  state->nhtids = 0;
601  state->nitems = 0;
602  state->phystupsize = 0;
603 
604  return spacesaving;
605 }
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:415
#define MAXALIGN(LEN)
Definition: c.h:757
size_t Size
Definition: c.h:540
#define ERROR
Definition: elog.h:33
#define elog(elevel,...)
Definition: elog.h:218
Pointer Item
Definition: item.h:17
#define IndexTupleSize(itup)
Definition: itup.h:70
IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
Definition: nbtdedup.c:860

References _bt_form_posting(), Assert(), elog, ERROR, IndexTupleSize, InvalidOffsetNumber, MAXALIGN, OffsetNumberNext, PageAddItem, PageGetMaxOffsetNumber, and pfree().

Referenced by _bt_dedup_pass(), and btree_xlog_dedup().
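
_bt_dedup_start_pending(), _bt_dedup_save_htid(), and _bt_dedup_finish_pending() together form a small state machine over one pending posting list at a time. A condensed sketch of the driving loop, modeled on _bt_dedup_pass() below (declarations and the single-value strategy omitted):

    for (offnum = minoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
    {
        IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

        if (offnum == minoff)
            _bt_dedup_start_pending(state, itup, offnum);   /* first interval */
        else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
                 _bt_dedup_save_htid(state, itup))
            ;                   /* duplicate; TIDs appended to pending list */
        else
        {
            /* flush pending interval to newpage, then start a new one */
            spacesaving += _bt_dedup_finish_pending(newpage, state);
            _bt_dedup_start_pending(state, itup, offnum);
        }
    }
    spacesaving += _bt_dedup_finish_pending(newpage, state);   /* final interval */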

◆ _bt_dedup_pass()

void _bt_dedup_pass ( Relation  rel,
Buffer  buf,
Relation  heapRel,
IndexTuple  newitem,
Size  newitemsz,
bool  bottomupdedup 
)

Definition at line 58 of file nbtdedup.c.

60 {
61  OffsetNumber offnum,
62  minoff,
63  maxoff;
64  Page page = BufferGetPage(buf);
65  BTPageOpaque opaque = BTPageGetOpaque(page);
66  Page newpage;
67  BTDedupState state;
68  Size pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
69  bool singlevalstrat = false;
70  int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
71 
72  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
73  newitemsz += sizeof(ItemIdData);
74 
75  /*
76  * Initialize deduplication state.
77  *
78  * It would be possible for maxpostingsize (limit on posting list tuple
79  * size) to be set to one third of the page. However, it seems like a
80  * good idea to limit the size of posting lists to one sixth of a page.
81  * That ought to leave us with a good split point when pages full of
82  * duplicates can be split several times.
83  */
84  state = (BTDedupState) palloc(sizeof(BTDedupStateData));
85  state->deduplicate = true;
86  state->nmaxitems = 0;
87  state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
88  /* Metadata about base tuple of current pending posting list */
89  state->base = NULL;
90  state->baseoff = InvalidOffsetNumber;
91  state->basetupsize = 0;
92  /* Metadata about current pending posting list TIDs */
93  state->htids = palloc(state->maxpostingsize);
94  state->nhtids = 0;
95  state->nitems = 0;
96  /* Size of all physical tuples to be replaced by pending posting list */
97  state->phystupsize = 0;
98  /* nintervals should be initialized to zero */
99  state->nintervals = 0;
100 
101  minoff = P_FIRSTDATAKEY(opaque);
102  maxoff = PageGetMaxOffsetNumber(page);
103 
104  /*
105  * Consider applying "single value" strategy, though only if the page
106  * seems likely to be split in the near future
107  */
108  if (!bottomupdedup)
109  singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
110 
111  /*
112  * Deduplicate items from page, and write them to newpage.
113  *
114  * Copy the original page's LSN into newpage copy. This will become the
115  * updated version of the page. We need this because XLogInsert will
116  * examine the LSN and possibly dump it in a page image.
117  */
118  newpage = PageGetTempPageCopySpecial(page);
119  PageSetLSN(newpage, PageGetLSN(page));
120 
121  /* Copy high key, if any */
122  if (!P_RIGHTMOST(opaque))
123  {
124  ItemId hitemid = PageGetItemId(page, P_HIKEY);
125  Size hitemsz = ItemIdGetLength(hitemid);
126  IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid);
127 
128  if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
129  false, false) == InvalidOffsetNumber)
130  elog(ERROR, "deduplication failed to add highkey");
131  }
132 
133  for (offnum = minoff;
134  offnum <= maxoff;
135  offnum = OffsetNumberNext(offnum))
136  {
137  ItemId itemid = PageGetItemId(page, offnum);
138  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
139 
140  Assert(!ItemIdIsDead(itemid));
141 
142  if (offnum == minoff)
143  {
144  /*
145  * No previous/base tuple for the data item -- use the data item
146  * as base tuple of pending posting list
147  */
148  _bt_dedup_start_pending(state, itup, offnum);
149  }
150  else if (state->deduplicate &&
151  _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
152  _bt_dedup_save_htid(state, itup))
153  {
154  /*
155  * Tuple is equal to base tuple of pending posting list. Heap
156  * TID(s) for itup have been saved in state.
157  */
158  }
159  else
160  {
161  /*
162  * Tuple is not equal to pending posting list tuple, or
163  * _bt_dedup_save_htid() opted to not merge current item into
164  * pending posting list for some other reason (e.g., adding more
165  * TIDs would have caused posting list to exceed current
166  * maxpostingsize).
167  *
168  * If state contains pending posting list with more than one item,
169  * form new posting tuple, and actually update the page. Else
170  * reset the state and move on without modifying the page.
171  */
172  pagesaving += _bt_dedup_finish_pending(newpage, state);
173 
174  if (singlevalstrat)
175  {
176  /*
177  * Single value strategy's extra steps.
178  *
179  * Lower maxpostingsize for sixth and final large posting list
180  * tuple at the point where 5 maxpostingsize-capped tuples
181  * have either been formed or observed.
182  *
183  * When a sixth maxpostingsize-capped item is formed/observed,
184  * stop merging together tuples altogether. The few tuples
185  * that remain at the end of the page won't be merged together
186  * at all (at least not until after a future page split takes
187  * place).
188  */
189  if (state->nmaxitems == 5)
190  _bt_singleval_fillfactor(page, state, newitemsz);
191  else if (state->nmaxitems == 6)
192  {
193  state->deduplicate = false;
194  singlevalstrat = false; /* won't be back here */
195  }
196  }
197 
198  /* itup starts new pending posting list */
199  _bt_dedup_start_pending(state, itup, offnum);
200  }
201  }
202 
203  /* Handle the last item */
204  pagesaving += _bt_dedup_finish_pending(newpage, state);
205 
206  /*
207  * If no items suitable for deduplication were found, newpage must be
208  * exactly the same as the original page, so just return from function.
209  *
210  * We could determine whether or not to proceed on the basis the space
211  * savings being sufficient to avoid an immediate page split instead. We
212  * don't do that because there is some small value in nbtsplitloc.c always
213  * operating against a page that is fully deduplicated (apart from
214  * newitem). Besides, most of the cost has already been paid.
215  */
216  if (state->nintervals == 0)
217  {
218  /* cannot leak memory here */
219  pfree(newpage);
220  pfree(state->htids);
221  pfree(state);
222  return;
223  }
224 
225  /*
226  * By here, it's clear that deduplication will definitely go ahead.
227  *
228  * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace
229  * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
230  * But keep things tidy.
231  */
232  if (P_HAS_GARBAGE(opaque))
233  {
234  BTPageOpaque nopaque = BTPageGetOpaque(newpage);
235 
236  nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
237  }
238 
239  START_CRIT_SECTION();
240 
241  PageRestoreTempPage(newpage, page);
242  MarkBufferDirty(buf);
243 
244  /* XLOG stuff */
245  if (RelationNeedsWAL(rel))
246  {
247  XLogRecPtr recptr;
248  xl_btree_dedup xlrec_dedup;
249 
250  xlrec_dedup.nintervals = state->nintervals;
251 
252  XLogBeginInsert();
253  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
254  XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);
255 
256  /*
257  * The intervals array is not in the buffer, but pretend that it is.
258  * When XLogInsert stores the whole buffer, the array need not be
259  * stored too.
260  */
261  XLogRegisterBufData(0, (char *) state->intervals,
262  state->nintervals * sizeof(BTDedupInterval));
263 
264  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
265 
266  PageSetLSN(page, recptr);
267  }
268 
269  END_CRIT_SECTION();
270 
271  /* Local space accounting should agree with page accounting */
272  Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
273 
274  /* cannot leak memory here */
275  pfree(state->htids);
276  pfree(state);
277 }
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1573
void PageRestoreTempPage(Page tempPage, Page oldPage)
Definition: bufpage.c:424
Page PageGetTempPageCopySpecial(Page page)
Definition: bufpage.c:402
#define PageSetLSN(page, lsn)
Definition: bufpage.h:367
#define PageGetLSN(page)
Definition: bufpage.h:365
#define Min(x, y)
Definition: c.h:986
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:155
#define INDEX_SIZE_MASK
Definition: itup.h:65
#define START_CRIT_SECTION()
Definition: miscadmin.h:148
#define END_CRIT_SECTION()
Definition: miscadmin.h:150
static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state, OffsetNumber minoff, IndexTuple newitem)
Definition: nbtdedup.c:778
Size _bt_dedup_finish_pending(Page newpage, BTDedupState state)
Definition: nbtdedup.c:554
static void _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
Definition: nbtdedup.c:818
#define P_HIKEY
Definition: nbtree.h:369
#define P_HAS_GARBAGE(opaque)
Definition: nbtree.h:227
#define BTP_HAS_GARBAGE
Definition: nbtree.h:82
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:220
#define BTMaxItemSize(page)
Definition: nbtree.h:164
#define XLOG_BTREE_DEDUP
Definition: nbtxlog.h:33
#define SizeOfBtreeDedup
Definition: nbtxlog.h:174
#define RelationNeedsWAL(relation)
Definition: rel.h:613
uint16 btpo_flags
Definition: nbtree.h:67
uint16 nintervals
Definition: nbtxlog.h:169
uint64 XLogRecPtr
Definition: xlogdefs.h:21
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:443
void XLogRegisterBufData(uint8 block_id, char *data, int len)
Definition: xloginsert.c:389
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition: xloginsert.c:243
void XLogBeginInsert(void)
Definition: xloginsert.c:150
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:351
#define REGBUF_STANDARD
Definition: xloginsert.h:34

References _bt_dedup_finish_pending(), _bt_dedup_save_htid(), _bt_dedup_start_pending(), _bt_do_singleval(), _bt_keep_natts_fast(), _bt_singleval_fillfactor(), Assert(), BTMaxItemSize, BTP_HAS_GARBAGE, BTPageGetOpaque, BTPageOpaqueData::btpo_flags, buf, BufferGetPage, elog, END_CRIT_SECTION, ERROR, INDEX_SIZE_MASK, IndexRelationGetNumberOfKeyAttributes, InvalidOffsetNumber, ItemIdGetLength, ItemIdIsDead, MarkBufferDirty(), Min, xl_btree_dedup::nintervals, OffsetNumberNext, P_FIRSTDATAKEY, P_HAS_GARBAGE, P_HIKEY, P_RIGHTMOST, PageAddItem, PageGetExactFreeSpace(), PageGetItem, PageGetItemId, PageGetLSN, PageGetMaxOffsetNumber, PageGetTempPageCopySpecial(), PageRestoreTempPage(), PageSetLSN, palloc(), pfree(), PG_USED_FOR_ASSERTS_ONLY, REGBUF_STANDARD, RelationNeedsWAL, SizeOfBtreeDedup, START_CRIT_SECTION, XLOG_BTREE_DEDUP, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by _bt_delete_or_dedup_one_page().
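
A conceptual view of the XLOG_BTREE_DEDUP record emitted above (field layout per nbtxlog.h and nbtree.h; btree_xlog_dedup() is the replay routine):

    /*
     * Conceptual layout of an XLOG_BTREE_DEDUP record:
     *
     *   main data:      xl_btree_dedup { uint16 nintervals; }
     *   buffer 0 data:  nintervals x BTDedupInterval
     *                   { OffsetNumber baseoff; uint16 nitems; }
     *
     * Because the buffer is registered with REGBUF_STANDARD, a
     * full-page image may replace the buffer data entirely; otherwise
     * btree_xlog_dedup() re-runs the start/save/finish loop over each
     * interval to rebuild the deduplicated page.
     */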

◆ _bt_dedup_save_htid()

bool _bt_dedup_save_htid ( BTDedupState  state,
IndexTuple  itup 
)

Definition at line 483 of file nbtdedup.c.

484 {
485  int nhtids;
486  ItemPointer htids;
487  Size mergedtupsz;
488 
489  Assert(!BTreeTupleIsPivot(itup));
490 
491  if (!BTreeTupleIsPosting(itup))
492  {
493  nhtids = 1;
494  htids = &itup->t_tid;
495  }
496  else
497  {
498  nhtids = BTreeTupleGetNPosting(itup);
499  htids = BTreeTupleGetPosting(itup);
500  }
501 
502  /*
503  * Don't append (have caller finish pending posting list as-is) if
504  * appending heap TID(s) from itup would put us over maxpostingsize limit.
505  *
506  * This calculation needs to match the code used within _bt_form_posting()
507  * for new posting list tuples.
508  */
509  mergedtupsz = MAXALIGN(state->basetupsize +
510  (state->nhtids + nhtids) * sizeof(ItemPointerData));
511 
512  if (mergedtupsz > state->maxpostingsize)
513  {
514  /*
515  * Count this as an oversized item for single value strategy, though
516  * only when there are 50 TIDs in the final posting list tuple. This
517  * limit (which is fairly arbitrary) avoids confusion about how many
518  * 1/6 of a page tuples have been encountered/created by the current
519  * deduplication pass.
520  *
521  * Note: We deliberately don't consider which deduplication pass
522  * merged together tuples to create this item (could be a previous
523  * deduplication pass, or current pass). See _bt_do_singleval()
524  * comments.
525  */
526  if (state->nhtids > 50)
527  state->nmaxitems++;
528 
529  return false;
530  }
531 
532  /*
533  * Save heap TIDs to pending posting list tuple -- itup can be merged into
534  * pending posting list
535  */
536  state->nitems++;
537  memcpy(state->htids + state->nhtids, htids,
538  sizeof(ItemPointerData) * nhtids);
539  state->nhtids += nhtids;
540  state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
541 
542  return true;
543 }
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:475
static ItemPointer BTreeTupleGetPosting(IndexTuple posting)
Definition: nbtree.h:532

References Assert(), BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), IndexTupleSize, MAXALIGN, and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_load(), and btree_xlog_dedup().
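
A worked example of the mergedtupsz calculation, assuming MAXALIGN rounds up to a multiple of 8 and sizeof(ItemPointerData) == 6 (the base tuple values are hypothetical):

    Size basetupsize = 16;  /* key portion of base tuple (hypothetical) */
    int  pending = 3;       /* state->nhtids: TIDs already saved */
    int  incoming = 2;      /* nhtids taken from itup */

    /* MAXALIGN(16 + (3 + 2) * 6) = MAXALIGN(46) = 48 */
    Size mergedtupsz = MAXALIGN(basetupsize +
                                (pending + incoming) * sizeof(ItemPointerData));
    /* itup is merged only when 48 <= state->maxpostingsize */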

◆ _bt_dedup_start_pending()

void _bt_dedup_start_pending ( BTDedupState  state,
IndexTuple  base,
OffsetNumber  baseoff 
)

Definition at line 432 of file nbtdedup.c.

434 {
435  Assert(state->nhtids == 0);
436  Assert(state->nitems == 0);
437  Assert(!BTreeTupleIsPivot(base));
438 
439  /*
440  * Copy heap TID(s) from new base tuple for new candidate posting list
441  * into working state's array
442  */
443  if (!BTreeTupleIsPosting(base))
444  {
445  memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
446  state->nhtids = 1;
447  state->basetupsize = IndexTupleSize(base);
448  }
449  else
450  {
451  int nposting;
452 
453  nposting = BTreeTupleGetNPosting(base);
454  memcpy(state->htids, BTreeTupleGetPosting(base),
455  sizeof(ItemPointerData) * nposting);
456  state->nhtids = nposting;
457  /* basetupsize should not include existing posting list */
458  state->basetupsize = BTreeTupleGetPostingOffset(base);
459  }
460 
461  /*
462  * Save new base tuple itself -- it'll be needed if we actually create a
463  * new posting list from new pending posting list.
464  *
465  * Must maintain physical size of all existing tuples (including line
466  * pointer overhead) so that we can calculate space savings on page.
467  */
468  state->nitems = 1;
469  state->base = base;
470  state->baseoff = baseoff;
471  state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
472  /* Also save baseoff in pending state for interval */
473  state->intervals[state->nintervals].baseoff = state->baseoff;
474 }
static uint32 BTreeTupleGetPostingOffset(IndexTuple posting)
Definition: nbtree.h:524

References Assert(), BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), IndexTupleSize, MAXALIGN, and IndexTupleData::t_tid.

Referenced by _bt_bottomupdel_pass(), _bt_dedup_pass(), _bt_load(), and btree_xlog_dedup().

◆ _bt_do_singleval()

static bool _bt_do_singleval ( Relation  rel,
Page  page,
BTDedupState  state,
OffsetNumber  minoff,
IndexTuple  newitem 
)
static

Definition at line 778 of file nbtdedup.c.

780 {
781  int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
782  ItemId itemid;
783  IndexTuple itup;
784 
785  itemid = PageGetItemId(page, minoff);
786  itup = (IndexTuple) PageGetItem(page, itemid);
787 
788  if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
789  {
790  itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
791  itup = (IndexTuple) PageGetItem(page, itemid);
792 
793  if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
794  return true;
795  }
796 
797  return false;
798 }

References _bt_keep_natts_fast(), IndexRelationGetNumberOfKeyAttributes, PageGetItem, PageGetItemId, and PageGetMaxOffsetNumber.

Referenced by _bt_dedup_pass().

◆ _bt_form_posting()

IndexTuple _bt_form_posting ( IndexTuple  base,
ItemPointer  htids,
int  nhtids 
)

Definition at line 860 of file nbtdedup.c.

861 {
862  uint32 keysize,
863  newsize;
864  IndexTuple itup;
865 
866  if (BTreeTupleIsPosting(base))
867  keysize = BTreeTupleGetPostingOffset(base);
868  else
869  keysize = IndexTupleSize(base);
870 
871  Assert(!BTreeTupleIsPivot(base));
872  Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
873  Assert(keysize == MAXALIGN(keysize));
874 
875  /* Determine final size of new tuple */
876  if (nhtids > 1)
877  newsize = MAXALIGN(keysize +
878  nhtids * sizeof(ItemPointerData));
879  else
880  newsize = keysize;
881 
882  Assert(newsize <= INDEX_SIZE_MASK);
883  Assert(newsize == MAXALIGN(newsize));
884 
885  /* Allocate memory using palloc0() (matches index_form_tuple()) */
886  itup = palloc0(newsize);
887  memcpy(itup, base, keysize);
888  itup->t_info &= ~INDEX_SIZE_MASK;
889  itup->t_info |= newsize;
890  if (nhtids > 1)
891  {
892  /* Form posting list tuple */
893  BTreeTupleSetPosting(itup, nhtids, keysize);
894  memcpy(BTreeTupleGetPosting(itup), htids,
895  sizeof(ItemPointerData) * nhtids);
896  Assert(_bt_posting_valid(itup));
897  }
898  else
899  {
900  /* Form standard non-pivot tuple */
901  itup->t_info &= ~INDEX_ALT_TID_MASK;
902  ItemPointerCopy(htids, &itup->t_tid);
903  Assert(ItemPointerIsValid(&itup->t_tid));
904  }
905 
906  return itup;
907 }
unsigned int uint32
Definition: c.h:441
#define PG_UINT16_MAX
Definition: c.h:522
#define ItemPointerCopy(fromPointer, toPointer)
Definition: itemptr.h:161
#define ItemPointerIsValid(pointer)
Definition: itemptr.h:82
void * palloc0(Size size)
Definition: mcxt.c:1099
static void BTreeTupleSetPosting(IndexTuple itup, uint16 nhtids, int postingoffset)
Definition: nbtree.h:499
#define INDEX_ALT_TID_MASK
Definition: nbtree.h:461
unsigned short t_info
Definition: itup.h:49

References Assert(), BTreeTupleGetPosting(), BTreeTupleGetPostingOffset(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), BTreeTupleSetPosting(), INDEX_ALT_TID_MASK, INDEX_SIZE_MASK, IndexTupleSize, ItemPointerCopy, ItemPointerIsValid, MAXALIGN, palloc0(), PG_UINT16_MAX, IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_dedup_finish_pending(), _bt_sort_dedup_finish_pending(), and bt_posting_plain_tuple().
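
A hedged usage sketch: callers pass heap TIDs that are already in ascending TID order (the TIDs below are made up):

    ItemPointerData htids[3];
    IndexTuple posting;

    ItemPointerSet(&htids[0], 10, 1);   /* (block 10, offset 1) */
    ItemPointerSet(&htids[1], 10, 7);
    ItemPointerSet(&htids[2], 11, 2);

    /* New palloc'd tuple: base's key values plus a 3-TID posting list */
    posting = _bt_form_posting(base, htids, 3);
    /* ... add to a page, WAL-log, etc. ... */
    pfree(posting);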

◆ _bt_singleval_fillfactor()

static void _bt_singleval_fillfactor ( Page  page,
BTDedupState  state,
Size  newitemsz 
)
static

Definition at line 818 of file nbtdedup.c.

819 {
820  Size leftfree;
821  int reduction;
822 
823  /* This calculation needs to match nbtsplitloc.c */
824  leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
825  MAXALIGN(sizeof(BTPageOpaqueData));
826  /* Subtract size of new high key (includes pivot heap TID space) */
827  leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
828 
829  /*
830  * Reduce maxpostingsize by an amount equal to target free space on left
831  * half of page
832  */
833  reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
834  if (state->maxpostingsize > reduction)
835  state->maxpostingsize -= reduction;
836  else
837  state->maxpostingsize = 0;
838 }
#define SizeOfPageHeaderData
Definition: bufpage.h:215
#define PageGetPageSize(page)
Definition: bufpage.h:267
#define BTREE_SINGLEVAL_FILLFACTOR
Definition: nbtree.h:203

References BTREE_SINGLEVAL_FILLFACTOR, MAXALIGN, PageGetPageSize, and SizeOfPageHeaderData.

Referenced by _bt_dedup_pass().
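
A worked example, assuming BLCKSZ = 8192, BTREE_SINGLEVAL_FILLFACTOR = 96, SizeOfPageHeaderData = 24, MAXALIGN(sizeof(BTPageOpaqueData)) = 16, and a hypothetical newitemsz of 24:

    /*
     *   leftfree  = 8192 - 24 - 16 - (24 + 8) = 8120
     *   reduction = (int) (8120 * ((100 - 96) / 100.0)) = 324
     *
     * maxpostingsize drops by 324 bytes, so the final large posting
     * list tuple is capped in a way that leaves about 4% of the left
     * half of the page free after the anticipated split.
     */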

◆ _bt_swap_posting()

IndexTuple _bt_swap_posting ( IndexTuple  newitem,
IndexTuple  oposting,
int  postingoff 
)

Definition at line 1018 of file nbtdedup.c.

1019 {
1020  int nhtids;
1021  char *replacepos;
1022  char *replaceposright;
1023  Size nmovebytes;
1024  IndexTuple nposting;
1025 
1026  nhtids = BTreeTupleGetNPosting(oposting);
1027  Assert(_bt_posting_valid(oposting));
1028 
1029  /*
1030  * The postingoff argument originated as a _bt_binsrch_posting() return
1031  * value. It will be 0 in the event of corruption that makes a leaf page
1032  * contain a non-pivot tuple that's somehow identical to newitem (no two
1033  * non-pivot tuples should ever have the same TID). This has been known
1034  * to happen in the field from time to time.
1035  *
1036  * Perform a basic sanity check to catch this case now.
1037  */
1038  if (!(postingoff > 0 && postingoff < nhtids))
1039  elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
1040  nhtids, postingoff);
1041 
1042  /*
1043  * Move item pointers in posting list to make a gap for the new item's
1044  * heap TID. We shift TIDs one place to the right, losing original
1045  * rightmost TID. (nmovebytes must not include TIDs to the left of
1046  * postingoff, nor the existing rightmost/max TID that gets overwritten.)
1047  */
1048  nposting = CopyIndexTuple(oposting);
1049  replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
1050  replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
1051  nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
1052  memmove(replaceposright, replacepos, nmovebytes);
1053 
1054  /* Fill the gap at postingoff with TID of new item (original new TID) */
1055  Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
1056  ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
1057 
1058  /* Now copy oposting's rightmost/max TID into new item (final new TID) */
1059  ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);
1060 
1061  Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
1062  BTreeTupleGetHeapTID(newitem)) < 0);
1063  Assert(_bt_posting_valid(nposting));
1064 
1065  return nposting;
1066 }
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:528
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:52

References Assert(), BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), CopyIndexTuple(), elog, ERROR, ItemPointerCompare(), ItemPointerCopy, and IndexTupleData::t_tid.

Referenced by _bt_insertonpg(), btree_xlog_insert(), and btree_xlog_split().
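
A small worked example of the TID shuffle (all TIDs hypothetical). Suppose oposting holds heap TIDs (5,1) (5,4) (5,9) and newitem's TID is (5,6), so _bt_binsrch_posting() reported postingoff = 2:

    /*
     * After _bt_swap_posting(newitem, oposting, 2):
     *
     *   nposting: (5,1) (5,4) (5,6)   <- newitem's original TID fills the gap
     *   newitem:  (5,9)               <- receives oposting's old rightmost TID
     *
     * The caller replaces oposting with nposting in place, then inserts
     * newitem immediately to its right, keeping heap TIDs in ascending
     * order.
     */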

◆ _bt_update_posting()

void _bt_update_posting ( BTVacuumPosting  vacposting)

Definition at line 920 of file nbtdedup.c.

921 {
922  IndexTuple origtuple = vacposting->itup;
923  uint32 keysize,
924  newsize;
925  IndexTuple itup;
926  int nhtids;
927  int ui,
928  d;
929  ItemPointer htids;
930 
931  nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
932 
933  Assert(_bt_posting_valid(origtuple));
934  Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
935 
936  /*
937  * Determine final size of new tuple.
938  *
939  * This calculation needs to match the code used within _bt_form_posting()
940  * for new posting list tuples. We avoid calling _bt_form_posting() here
941  * to save ourselves a second memory allocation for a htids workspace.
942  */
943  keysize = BTreeTupleGetPostingOffset(origtuple);
944  if (nhtids > 1)
945  newsize = MAXALIGN(keysize +
946  nhtids * sizeof(ItemPointerData));
947  else
948  newsize = keysize;
949 
950  Assert(newsize <= INDEX_SIZE_MASK);
951  Assert(newsize == MAXALIGN(newsize));
952 
953  /* Allocate memory using palloc0() (matches index_form_tuple()) */
954  itup = palloc0(newsize);
955  memcpy(itup, origtuple, keysize);
956  itup->t_info &= ~INDEX_SIZE_MASK;
957  itup->t_info |= newsize;
958 
959  if (nhtids > 1)
960  {
961  /* Form posting list tuple */
962  BTreeTupleSetPosting(itup, nhtids, keysize);
963  htids = BTreeTupleGetPosting(itup);
964  }
965  else
966  {
967  /* Form standard non-pivot tuple */
968  itup->t_info &= ~INDEX_ALT_TID_MASK;
969  htids = &itup->t_tid;
970  }
971 
972  ui = 0;
973  d = 0;
974  for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
975  {
976  if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
977  {
978  d++;
979  continue;
980  }
981  htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
982  }
983  Assert(ui == nhtids);
984  Assert(d == vacposting->ndeletedtids);
985  Assert(nhtids == 1 || _bt_posting_valid(itup));
986  Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));
987 
988  /* vacposting arg's itup will now point to updated version */
989  vacposting->itup = itup;
990 }
uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]
Definition: nbtree.h:911
uint16 ndeletedtids
Definition: nbtree.h:910
IndexTuple itup
Definition: nbtree.h:906

References Assert(), BTreeTupleGetNPosting(), BTreeTupleGetPosting(), BTreeTupleGetPostingN(), BTreeTupleGetPostingOffset(), BTreeTupleSetPosting(), BTVacuumPostingData::deletetids, i, INDEX_ALT_TID_MASK, INDEX_SIZE_MASK, ItemPointerIsValid, BTVacuumPostingData::itup, MAXALIGN, BTVacuumPostingData::ndeletedtids, palloc0(), IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by _bt_delitems_update(), and btree_xlog_updates().
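
A hedged usage sketch (variable names are illustrative; the real VACUUM-side callers also fill in bookkeeping fields such as updatedoffset, omitted here):

    /* Remove the TID at 0-based posting list position 1 (illustrative) */
    BTVacuumPosting vacposting;

    vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
                        sizeof(uint16));
    vacposting->itup = origposting;     /* posting tuple read from the page */
    vacposting->ndeletedtids = 1;
    vacposting->deletetids[0] = 1;

    _bt_update_posting(vacposting);
    /* vacposting->itup now points to a new palloc0'd tuple lacking that TID */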