nbtdedup.c
1 /*-------------------------------------------------------------------------
2  *
3  * nbtdedup.c
4  * Deduplicate items in Postgres btrees.
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/nbtree/nbtdedup.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include "access/nbtree.h"
18 #include "access/nbtxlog.h"
19 #include "miscadmin.h"
20 #include "utils/rel.h"
21 
22 static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state,
23  OffsetNumber minoff, IndexTuple newitem);
24 static void _bt_singleval_fillfactor(Page page, BTDedupState state,
25  Size newitemsz);
26 #ifdef USE_ASSERT_CHECKING
27 static bool _bt_posting_valid(IndexTuple posting);
28 #endif
29 
30 /*
31  * Deduplicate items on a leaf page. The page will have to be split by caller
32  * if we cannot successfully free at least newitemsz (we also need space for
33  * newitem's line pointer, which isn't included in caller's newitemsz).
34  *
35  * The general approach taken here is to perform as much deduplication as
36  * possible to free as much space as possible. Note, however, that "single
37  * value" strategy is sometimes used for !checkingunique callers, in which
38  * case deduplication will leave a few tuples untouched at the end of the
39  * page. The general idea is to prepare the page for an anticipated page
40  * split that uses nbtsplitloc.c's "single value" strategy to determine a
41  * split point. (There is no reason to deduplicate items that will end up on
42  * the right half of the page after the anticipated page split; better to
43  * handle those if and when the anticipated right half page gets its own
44  * deduplication pass, following further inserts of duplicates.)
45  *
46  * This function should be called during insertion, when the page doesn't have
47  * enough space to fit an incoming newitem. If the BTP_HAS_GARBAGE page flag
48  * was set, caller should have removed any LP_DEAD items by calling
49  * _bt_vacuum_one_page() before calling here. We may still have to kill
50  * LP_DEAD items here when the page's BTP_HAS_GARBAGE hint is falsely unset,
51  * but that should be rare. Also, _bt_vacuum_one_page() won't unset the
52  * BTP_HAS_GARBAGE flag when it finds no LP_DEAD items, so a successful
53  * deduplication pass will always clear it, just to keep things tidy.
54  */
55 void
56 _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel,
57  IndexTuple newitem, Size newitemsz, bool checkingunique)
58 {
59  OffsetNumber offnum,
60  minoff,
61  maxoff;
62  Page page = BufferGetPage(buf);
63  BTPageOpaque opaque;
64  Page newpage;
65  int newpagendataitems = 0;
66  OffsetNumber deletable[MaxIndexTuplesPerPage];
67  BTDedupState state;
68  int ndeletable = 0;
69  Size pagesaving = 0;
70  bool singlevalstrat = false;
71  int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
72 
73  /*
74  * We can't assume that there are no LP_DEAD items. For one thing, VACUUM
75  * will clear the BTP_HAS_GARBAGE hint without reliably removing items
76  * that are marked LP_DEAD. We don't want to unnecessarily unset LP_DEAD
77  * bits when deduplicating items. Allowing it would be correct, though
78  * wasteful.
79  */
80  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
81  minoff = P_FIRSTDATAKEY(opaque);
82  maxoff = PageGetMaxOffsetNumber(page);
83  for (offnum = minoff;
84  offnum <= maxoff;
85  offnum = OffsetNumberNext(offnum))
86  {
87  ItemId itemid = PageGetItemId(page, offnum);
88 
89  if (ItemIdIsDead(itemid))
90  deletable[ndeletable++] = offnum;
91  }
92 
93  if (ndeletable > 0)
94  {
95  _bt_delitems_delete(rel, buf, deletable, ndeletable, heapRel);
96 
97  /*
98  * Return when a split will be avoided. This is equivalent to
99  * avoiding a split using the usual _bt_vacuum_one_page() path.
100  */
101  if (PageGetFreeSpace(page) >= newitemsz)
102  return;
103 
104  /*
105  * Reconsider number of items on page, in case _bt_delitems_delete()
106  * managed to delete an item or two
107  */
108  minoff = P_FIRSTDATAKEY(opaque);
109  maxoff = PageGetMaxOffsetNumber(page);
110  }
111 
112  /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
113  newitemsz += sizeof(ItemIdData);
114 
115  /*
116  * By here, it's clear that deduplication will definitely be attempted.
117  * Initialize deduplication state.
118  *
119  * It would be possible for maxpostingsize (limit on posting list tuple
120  * size) to be set to one third of the page. However, it seems like a
121  * good idea to limit the size of posting lists to one sixth of a page.
122  * That ought to leave us with a good split point when pages full of
123  * duplicates can be split several times.
124  */
125  state = (BTDedupState) palloc(sizeof(BTDedupStateData));
126  state->deduplicate = true;
127  state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
128  /* Metadata about base tuple of current pending posting list */
129  state->base = NULL;
130  state->baseoff = InvalidOffsetNumber;
131  state->basetupsize = 0;
132  /* Metadata about current pending posting list TIDs */
133  state->htids = palloc(state->maxpostingsize);
134  state->nhtids = 0;
135  state->nitems = 0;
136  /* Size of all physical tuples to be replaced by pending posting list */
137  state->phystupsize = 0;
138  /* nintervals should be initialized to zero */
139  state->nintervals = 0;
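/*
 * Worked example (a sketch, not part of the original file; assumes the
 * default 8192-byte BLCKSZ): BTMaxItemSize(page) then comes out to roughly
 * 2704 bytes, so the Min() above caps maxpostingsize at about 1352 bytes --
 * close to one sixth of a page, matching the comment above.
 */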
140 
141  /* Determine if "single value" strategy should be used */
142  if (!checkingunique)
143  singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
144 
145  /*
146  * Deduplicate items from page, and write them to newpage.
147  *
148  * Copy the original page's LSN into newpage copy. This will become the
149  * updated version of the page. We need this because XLogInsert will
150  * examine the LSN and possibly dump it in a page image.
151  */
152  newpage = PageGetTempPageCopySpecial(page);
153  PageSetLSN(newpage, PageGetLSN(page));
154 
155  /* Copy high key, if any */
156  if (!P_RIGHTMOST(opaque))
157  {
158  ItemId hitemid = PageGetItemId(page, P_HIKEY);
159  Size hitemsz = ItemIdGetLength(hitemid);
160  IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid);
161 
162  if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
163  false, false) == InvalidOffsetNumber)
164  elog(ERROR, "deduplication failed to add highkey");
165  }
166 
167  for (offnum = minoff;
168  offnum <= maxoff;
169  offnum = OffsetNumberNext(offnum))
170  {
171  ItemId itemid = PageGetItemId(page, offnum);
172  IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
173 
174  Assert(!ItemIdIsDead(itemid));
175 
176  if (offnum == minoff)
177  {
178  /*
179  * No previous/base tuple for the data item -- use the data item
180  * as base tuple of pending posting list
181  */
182  _bt_dedup_start_pending(state, itup, offnum);
183  }
184  else if (state->deduplicate &&
185  _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
186  _bt_dedup_save_htid(state, itup))
187  {
188  /*
189  * Tuple is equal to base tuple of pending posting list. Heap
190  * TID(s) for itup have been saved in state.
191  */
192  }
193  else
194  {
195  /*
196  * Tuple is not equal to pending posting list tuple, or
197  * _bt_dedup_save_htid() opted to not merge current item into
198  * pending posting list for some other reason (e.g., adding more
199  * TIDs would have caused posting list to exceed current
200  * maxpostingsize).
201  *
202  * If state contains pending posting list with more than one item,
203  * form new posting tuple, and actually update the page. Else
204  * reset the state and move on without modifying the page.
205  */
206  pagesaving += _bt_dedup_finish_pending(newpage, state);
207  newpagendataitems++;
208 
209  if (singlevalstrat)
210  {
211  /*
212  * Single value strategy's extra steps.
213  *
214  * Lower maxpostingsize for sixth and final item that might be
215  * deduplicated by current deduplication pass. When sixth
216  * item formed/observed, stop deduplicating items.
217  *
218  * Note: It's possible that this will be reached even when
219  * current deduplication pass has yet to merge together some
220  * existing items. It doesn't matter whether or not the
221  * current call generated the maxpostingsize-capped duplicate
222  * tuples at the start of the page.
223  */
224  if (newpagendataitems == 5)
225  _bt_singleval_fillfactor(page, state, newitemsz);
226  else if (newpagendataitems == 6)
227  {
228  state->deduplicate = false;
229  singlevalstrat = false; /* won't be back here */
230  }
231  }
232 
233  /* itup starts new pending posting list */
234  _bt_dedup_start_pending(state, itup, offnum);
235  }
236  }
237 
238  /* Handle the last item */
239  pagesaving += _bt_dedup_finish_pending(newpage, state);
240  newpagendataitems++;
241 
242  /*
243  * If no items suitable for deduplication were found, newpage must be
244  * exactly the same as the original page, so just return from function.
245  *
246  * We could determine whether or not to proceed on the basis of the space
247  * savings being sufficient to avoid an immediate page split instead. We
248  * don't do that because there is some small value in nbtsplitloc.c always
249  * operating against a page that is fully deduplicated (apart from
250  * newitem). Besides, most of the cost has already been paid.
251  */
252  if (state->nintervals == 0)
253  {
254  /* cannot leak memory here */
255  pfree(newpage);
256  pfree(state->htids);
257  pfree(state);
258  return;
259  }
260 
261  /*
262  * By here, it's clear that deduplication will definitely go ahead.
263  *
264  * Clear the BTP_HAS_GARBAGE page flag in the unlikely event that it is
265  * still falsely set, just to keep things tidy. (We can't rely on
266  * _bt_vacuum_one_page() having done this already, and we can't rely on a
267  * page split or VACUUM getting to it in the near future.)
268  */
269  if (P_HAS_GARBAGE(opaque))
270  {
271  BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(newpage);
272 
273  nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
274  }
275 
276  START_CRIT_SECTION();
277 
278  PageRestoreTempPage(newpage, page);
279  MarkBufferDirty(buf);
280 
281  /* XLOG stuff */
282  if (RelationNeedsWAL(rel))
283  {
284  XLogRecPtr recptr;
285  xl_btree_dedup xlrec_dedup;
286 
287  xlrec_dedup.nintervals = state->nintervals;
288 
289  XLogBeginInsert();
290  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
291  XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);
292 
293  /*
294  * The intervals array is not in the buffer, but pretend that it is.
295  * When XLogInsert stores the whole buffer, the array need not be
296  * stored too.
297  */
298  XLogRegisterBufData(0, (char *) state->intervals,
299  state->nintervals * sizeof(BTDedupInterval));
300 
301  recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
302 
303  PageSetLSN(page, recptr);
304  }
305 
306  END_CRIT_SECTION();
307 
308  /* Local space accounting should agree with page accounting */
309  Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
310 
311  /* cannot leak memory here */
312  pfree(state->htids);
313  pfree(state);
314 }
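/*
 * Caller-side sketch (simplified and hypothetical, not verbatim from
 * nbtinsert.c): when a leaf page cannot fit the incoming tuple and the
 * index is deduplication-safe, the insert path tries a deduplication pass
 * before resorting to a page split, roughly:
 *
 *	if (PageGetFreeSpace(page) < insertstate->itemsz)
 *		_bt_dedup_one_page(rel, insertstate->buf, heapRel,
 *						   insertstate->itup, insertstate->itemsz,
 *						   checkingunique);
 */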
315 
316 /*
317  * Create a new pending posting list tuple based on caller's base tuple.
318  *
319  * Every tuple processed by deduplication either becomes the base tuple for a
320  * posting list, or gets its heap TID(s) accepted into a pending posting list.
321  * A tuple that starts out as the base tuple for a posting list will only
322  * actually be rewritten within _bt_dedup_finish_pending() when it turns out
323  * that there are duplicates that can be merged into the base tuple.
324  */
325 void
326 _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
327  OffsetNumber baseoff)
328 {
329  Assert(state->nhtids == 0);
330  Assert(state->nitems == 0);
331  Assert(!BTreeTupleIsPivot(base));
332 
333  /*
334  * Copy heap TID(s) from new base tuple for new candidate posting list
335  * into working state's array
336  */
337  if (!BTreeTupleIsPosting(base))
338  {
339  memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
340  state->nhtids = 1;
341  state->basetupsize = IndexTupleSize(base);
342  }
343  else
344  {
345  int nposting;
346 
347  nposting = BTreeTupleGetNPosting(base);
348  memcpy(state->htids, BTreeTupleGetPosting(base),
349  sizeof(ItemPointerData) * nposting);
350  state->nhtids = nposting;
351  /* basetupsize should not include existing posting list */
352  state->basetupsize = BTreeTupleGetPostingOffset(base);
353  }
354 
355  /*
356  * Save new base tuple itself -- it'll be needed if we actually create a
357  * new posting list from new pending posting list.
358  *
359  * Must maintain physical size of all existing tuples (including line
360  * pointer overhead) so that we can calculate space savings on page.
361  */
362  state->nitems = 1;
363  state->base = base;
364  state->baseoff = baseoff;
365  state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
366  /* Also save baseoff in pending state for interval */
367  state->intervals[state->nintervals].baseoff = state->baseoff;
368 }
369 
370 /*
371  * Save itup heap TID(s) into pending posting list where possible.
372  *
373  * Returns bool indicating if the pending posting list managed by state now
374  * includes itup's heap TID(s).
375  */
376 bool
377 _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
378 {
379  int nhtids;
380  ItemPointer htids;
381  Size mergedtupsz;
382 
383  Assert(!BTreeTupleIsPivot(itup));
384 
385  if (!BTreeTupleIsPosting(itup))
386  {
387  nhtids = 1;
388  htids = &itup->t_tid;
389  }
390  else
391  {
392  nhtids = BTreeTupleGetNPosting(itup);
393  htids = BTreeTupleGetPosting(itup);
394  }
395 
396  /*
397  * Don't append (have caller finish pending posting list as-is) if
398  * appending heap TID(s) from itup would put us over maxpostingsize limit.
399  *
400  * This calculation needs to match the code used within _bt_form_posting()
401  * for new posting list tuples.
402  */
403  mergedtupsz = MAXALIGN(state->basetupsize +
404  (state->nhtids + nhtids) * sizeof(ItemPointerData));
405 
406  if (mergedtupsz > state->maxpostingsize)
407  return false;
408 
409  /*
410  * Save heap TIDs to pending posting list tuple -- itup can be merged into
411  * pending posting list
412  */
413  state->nitems++;
414  memcpy(state->htids + state->nhtids, htids,
415  sizeof(ItemPointerData) * nhtids);
416  state->nhtids += nhtids;
417  state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
418 
419  return true;
420 }
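/*
 * Worked example of the mergedtupsz calculation above (a sketch; assumes
 * the common case of 6-byte ItemPointerData and 8-byte MAXALIGN): with a
 * 16-byte base tuple, 3 heap TIDs already saved, and an incoming itup
 * carrying 2 more TIDs:
 *
 *	mergedtupsz = MAXALIGN(16 + (3 + 2) * 6) = MAXALIGN(46) = 48
 *
 * which is accepted as long as 48 <= state->maxpostingsize.
 */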
421 
422 /*
423  * Finalize pending posting list tuple, and add it to the page. Final tuple
424  * is based on saved base tuple, and saved list of heap TIDs.
425  *
426  * Returns space saving from deduplicating to make a new posting list tuple.
427  * Note that this includes line pointer overhead. This is zero in the case
428  * where no deduplication was possible.
429  */
430 Size
431 _bt_dedup_finish_pending(Page newpage, BTDedupState state)
432 {
433  OffsetNumber tupoff;
434  Size tuplesz;
435  Size spacesaving;
436 
437  Assert(state->nitems > 0);
438  Assert(state->nitems <= state->nhtids);
439  Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
440 
441  tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
442  if (state->nitems == 1)
443  {
444  /* Use original, unchanged base tuple */
445  tuplesz = IndexTupleSize(state->base);
446  if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
447  false, false) == InvalidOffsetNumber)
448  elog(ERROR, "deduplication failed to add tuple to page");
449 
450  spacesaving = 0;
451  }
452  else
453  {
454  IndexTuple final;
455 
456  /* Form a tuple with a posting list */
457  final = _bt_form_posting(state->base, state->htids, state->nhtids);
458  tuplesz = IndexTupleSize(final);
459  Assert(tuplesz <= state->maxpostingsize);
460 
461  /* Save final number of items for posting list */
462  state->intervals[state->nintervals].nitems = state->nitems;
463 
464  Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
465  if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
466  false) == InvalidOffsetNumber)
467  elog(ERROR, "deduplication failed to add tuple to page");
468 
469  pfree(final);
470  spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
471  /* Increment nintervals, since we wrote a new posting list tuple */
472  state->nintervals++;
473  Assert(spacesaving > 0 && spacesaving < BLCKSZ);
474  }
475 
476  /* Reset state for next pending posting list */
477  state->nhtids = 0;
478  state->nitems = 0;
479  state->phystupsize = 0;
480 
481  return spacesaving;
482 }
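/*
 * Worked example (a sketch; assumes 8-byte MAXALIGN, 4-byte line pointers,
 * and 6-byte heap TIDs): merging three 24-byte duplicates accumulates
 * phystupsize = 3 * (24 + 4) = 84 bytes.  The replacement posting list
 * tuple is MAXALIGN(24 + 3 * 6) = 48 bytes, so
 *
 *	spacesaving = 84 - (48 + 4) = 32 bytes
 */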
483 
484 /*
485  * Determine if page non-pivot tuples (data items) are all duplicates of the
486  * same value -- if they are, deduplication's "single value" strategy should
487  * be applied. The general goal of this strategy is to ensure that
488  * nbtsplitloc.c (which uses its own single value strategy) will find a useful
489  * split point as further duplicates are inserted, and successive rightmost
490  * page splits occur among pages that store the same duplicate value. When
491  * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full,
492  * just like it would if deduplication were disabled.
493  *
494  * We expect that affected workloads will require _several_ single value
495  * strategy deduplication passes (over a page that only stores duplicates)
496  * before the page is finally split. The first deduplication pass should only
497  * find regular non-pivot tuples. Later deduplication passes will find
498  * existing maxpostingsize-capped posting list tuples, which must be skipped
499  * over. The penultimate pass is generally the first pass that actually
500  * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a
501  * few untouched non-pivot tuples. The final deduplication pass won't free
502  * any space -- it will skip over everything without merging anything (it
503  * retraces the steps of the penultimate pass).
504  *
505  * Fortunately, having several passes isn't too expensive. Each pass (after
506  * the first pass) won't spend many cycles on the large posting list tuples
507  * left by previous passes. Each pass will find a large contiguous group of
508  * smaller duplicate tuples to merge together at the end of the page.
509  *
510  * Note: We deliberately don't bother checking if the high key is a distinct
511  * value (prior to the TID tiebreaker column) before proceeding, unlike
512  * nbtsplitloc.c. Its single value strategy only gets applied on the
513  * rightmost page of duplicates of the same value (other leaf pages full of
514  * duplicates will get a simple 50:50 page split instead of splitting towards
515  * the end of the page). There is little point in making the same distinction
516  * here.
517  */
518 static bool
519 _bt_do_singleval(Relation rel, Page page, BTDedupState state,
520  OffsetNumber minoff, IndexTuple newitem)
521 {
522  int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
523  ItemId itemid;
524  IndexTuple itup;
525 
526  itemid = PageGetItemId(page, minoff);
527  itup = (IndexTuple) PageGetItem(page, itemid);
528 
529  if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
530  {
531  itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
532  itup = (IndexTuple) PageGetItem(page, itemid);
533 
534  if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
535  return true;
536  }
537 
538  return false;
539 }
540 
541 /*
542  * Lower maxpostingsize when using "single value" strategy, to avoid a sixth
543  * and final maxpostingsize-capped tuple. The sixth and final posting list
544  * tuple will end up somewhat smaller than the first five. (Note: The first
545  * five tuples could actually just be very large duplicate tuples that
546  * couldn't be merged together at all. Deduplication will simply not modify
547  * the page when that happens.)
548  *
549  * When there are six posting lists on the page (after current deduplication
550  * pass goes on to create/observe a sixth very large tuple), caller should end
551  * its deduplication pass. It isn't useful to try to deduplicate items that
552  * are supposed to end up on the new right sibling page following the
553  * anticipated page split. A future deduplication pass of future right
554  * sibling page might take care of it. (This is why the first single value
555  * strategy deduplication pass for a given leaf page will generally find only
556  * plain non-pivot tuples -- see _bt_do_singleval() comments.)
557  */
558 static void
559 _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
560 {
561  Size leftfree;
562  int reduction;
563 
564  /* This calculation needs to match nbtsplitloc.c */
565  leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
566  MAXALIGN(sizeof(BTPageOpaqueData));
567  /* Subtract size of new high key (includes pivot heap TID space) */
568  leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
569 
570  /*
571  * Reduce maxpostingsize by an amount equal to target free space on left
572  * half of page
573  */
574  reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
575  if (state->maxpostingsize > reduction)
576  state->maxpostingsize -= reduction;
577  else
578  state->maxpostingsize = 0;
579 }
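/*
 * Worked example (a sketch; assumes default 8192-byte pages, 8-byte
 * MAXALIGN, and BTREE_SINGLEVAL_FILLFACTOR of 96): with a 24-byte
 * newitemsz,
 *
 *	leftfree  = 8192 - 24 - 16 - (24 + 8) = 8120
 *	reduction = 8120 * ((100 - 96) / 100.0) = 324 (truncated to int)
 *
 * so the sixth and final posting list tuple is allowed to be about 324
 * bytes smaller than the earlier maxpostingsize-capped tuples.
 */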
580 
581 /*
582  * Build a posting list tuple based on caller's "base" index tuple and list of
583  * heap TIDs. When nhtids == 1, builds a standard non-pivot tuple without a
584  * posting list. (Posting list tuples can never have a single heap TID, partly
585  * because that ensures that deduplication always reduces final MAXALIGN()'d
586  * size of entire tuple.)
587  *
588  * Convention is that posting list starts at a MAXALIGN()'d offset (rather
589  * than a SHORTALIGN()'d offset), in line with the approach taken when
590  * appending a heap TID to new pivot tuple/high key during suffix truncation.
591  * This sometimes wastes a little space that was only needed as alignment
592  * padding in the original tuple. Following this convention simplifies the
593  * space accounting used when deduplicating a page (the same convention
594  * simplifies the accounting for choosing a point to split a page at).
595  *
596  * Note: Caller's "htids" array must be unique and already in ascending TID
597  * order. Any existing heap TIDs from "base" won't automatically appear in
598  * returned posting list tuple (they must be included in htids array.)
599  */
600 IndexTuple
601 _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
602 {
603  uint32 keysize,
604  newsize;
605  IndexTuple itup;
606 
607  if (BTreeTupleIsPosting(base))
608  keysize = BTreeTupleGetPostingOffset(base);
609  else
610  keysize = IndexTupleSize(base);
611 
612  Assert(!BTreeTupleIsPivot(base));
613  Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
614  Assert(keysize == MAXALIGN(keysize));
615 
616  /* Determine final size of new tuple */
617  if (nhtids > 1)
618  newsize = MAXALIGN(keysize +
619  nhtids * sizeof(ItemPointerData));
620  else
621  newsize = keysize;
622 
623  Assert(newsize <= INDEX_SIZE_MASK);
624  Assert(newsize == MAXALIGN(newsize));
625 
626  /* Allocate memory using palloc0() (matches index_form_tuple()) */
627  itup = palloc0(newsize);
628  memcpy(itup, base, keysize);
629  itup->t_info &= ~INDEX_SIZE_MASK;
630  itup->t_info |= newsize;
631  if (nhtids > 1)
632  {
633  /* Form posting list tuple */
634  BTreeTupleSetPosting(itup, nhtids, keysize);
635  memcpy(BTreeTupleGetPosting(itup), htids,
636  sizeof(ItemPointerData) * nhtids);
637  Assert(_bt_posting_valid(itup));
638  }
639  else
640  {
641  /* Form standard non-pivot tuple */
642  itup->t_info &= ~INDEX_ALT_TID_MASK;
643  ItemPointerCopy(htids, &itup->t_tid);
644  Assert(ItemPointerIsValid(&itup->t_tid));
645  }
646 
647  return itup;
648 }
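/*
 * Usage sketch (hypothetical caller, not from this file): given a plain
 * duplicate tuple and an array of heap TIDs already in ascending order, a
 * posting list tuple could be built like this:
 *
 *	ItemPointerData htids[2];
 *	IndexTuple postingtuple;
 *
 *	ItemPointerCopy(&dup1->t_tid, &htids[0]);
 *	ItemPointerCopy(&dup2->t_tid, &htids[1]);
 *	postingtuple = _bt_form_posting(dup1, htids, 2);
 *
 * The deduplication pass above does much the same thing, except that it
 * accumulates TIDs in state->htids before calling _bt_form_posting().
 */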
649 
650 /*
651  * Generate a replacement tuple by "updating" a posting list tuple so that it
652  * no longer has TIDs that need to be deleted.
653  *
654  * Used by VACUUM. Caller's vacposting argument points to the existing
655  * posting list tuple to be updated.
656  *
657  * On return, caller's vacposting argument will point to final "updated"
658  * tuple, which will be palloc()'d in caller's memory context.
659  */
660 void
661 _bt_update_posting(BTVacuumPosting vacposting)
662 {
663  IndexTuple origtuple = vacposting->itup;
664  uint32 keysize,
665  newsize;
666  IndexTuple itup;
667  int nhtids;
668  int ui,
669  d;
670  ItemPointer htids;
671 
672  nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
673 
674  Assert(_bt_posting_valid(origtuple));
675  Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
676 
677  /*
678  * Determine final size of new tuple.
679  *
680  * This calculation needs to match the code used within _bt_form_posting()
681  * for new posting list tuples. We avoid calling _bt_form_posting() here
682  * to save ourselves a second memory allocation for a htids workspace.
683  */
684  keysize = BTreeTupleGetPostingOffset(origtuple);
685  if (nhtids > 1)
686  newsize = MAXALIGN(keysize +
687  nhtids * sizeof(ItemPointerData));
688  else
689  newsize = keysize;
690 
691  Assert(newsize <= INDEX_SIZE_MASK);
692  Assert(newsize == MAXALIGN(newsize));
693 
694  /* Allocate memory using palloc0() (matches index_form_tuple()) */
695  itup = palloc0(newsize);
696  memcpy(itup, origtuple, keysize);
697  itup->t_info &= ~INDEX_SIZE_MASK;
698  itup->t_info |= newsize;
699 
700  if (nhtids > 1)
701  {
702  /* Form posting list tuple */
703  BTreeTupleSetPosting(itup, nhtids, keysize);
704  htids = BTreeTupleGetPosting(itup);
705  }
706  else
707  {
708  /* Form standard non-pivot tuple */
709  itup->t_info &= ~INDEX_ALT_TID_MASK;
710  htids = &itup->t_tid;
711  }
712 
713  ui = 0;
714  d = 0;
715  for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
716  {
717  if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
718  {
719  d++;
720  continue;
721  }
722  htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
723  }
724  Assert(ui == nhtids);
725  Assert(d == vacposting->ndeletedtids);
726  Assert(nhtids == 1 || _bt_posting_valid(itup));
727  Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));
728 
729  /* vacposting arg's itup will now point to updated version */
730  vacposting->itup = itup;
731 }
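/*
 * Worked example (a sketch): if origtuple holds 4 heap TIDs and VACUUM
 * filled vacposting->deletetids with {1, 3} (ndeletedtids = 2), the loop
 * above copies only the TIDs at positions 0 and 2, so the replacement
 * tuple ends up with nhtids = 2 and roughly half the posting list space.
 */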
732 
733 /*
734  * Prepare for a posting list split by swapping heap TID in newitem with heap
735  * TID from original posting list (the 'oposting' heap TID located at offset
736  * 'postingoff'). Modifies newitem, so caller should pass their own private
737  * copy that can safely be modified.
738  *
739  * Returns new posting list tuple, which is palloc()'d in caller's context.
740  * This is guaranteed to be the same size as 'oposting'. Modified newitem is
741  * what caller actually inserts. (This happens inside the same critical
742  * section that performs an in-place update of old posting list using new
743  * posting list returned here.)
744  *
745  * While the keys from newitem and oposting must be opclass equal, and must
746  * generate identical output when run through the underlying type's output
747  * function, it doesn't follow that their representations match exactly.
748  * Caller must avoid assuming that there can't be representational differences
749  * that make datums from oposting bigger or smaller than the corresponding
750  * datums from newitem. For example, differences in TOAST input state might
751  * break a faulty assumption about tuple size (the executor is entitled to
752  * apply TOAST compression based on its own criteria). It also seems possible
753  * that further representational variation will be introduced in the future,
754  * in order to support nbtree features like page-level prefix compression.
755  *
756  * See nbtree/README for details on the design of posting list splits.
757  */
758 IndexTuple
759 _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
760 {
761  int nhtids;
762  char *replacepos;
763  char *replaceposright;
764  Size nmovebytes;
765  IndexTuple nposting;
766 
767  nhtids = BTreeTupleGetNPosting(oposting);
768  Assert(_bt_posting_valid(oposting));
769  Assert(postingoff > 0 && postingoff < nhtids);
770 
771  /*
772  * Move item pointers in posting list to make a gap for the new item's
773  * heap TID. We shift TIDs one place to the right, losing original
774  * rightmost TID. (nmovebytes must not include TIDs to the left of
775  * postingoff, nor the existing rightmost/max TID that gets overwritten.)
776  */
777  nposting = CopyIndexTuple(oposting);
778  replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
779  replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
780  nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
781  memmove(replaceposright, replacepos, nmovebytes);
782 
783  /* Fill the gap at postingoff with TID of new item (original new TID) */
784  Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
785  ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
786 
787  /* Now copy oposting's rightmost/max TID into new item (final new TID) */
788  ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);
789 
790  Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
791  BTreeTupleGetHeapTID(newitem)) < 0);
792  Assert(_bt_posting_valid(nposting));
793 
794  return nposting;
795 }
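/*
 * Worked example (a sketch, not from this file): suppose oposting holds
 * the sorted heap TIDs [A, B, D, E] and newitem's TID is C, so the insert
 * path's binary search determined postingoff = 2.  The code above then:
 *
 *	1. copies oposting into nposting,
 *	2. shifts D one slot to the right (overwriting E, the old rightmost TID),
 *	3. writes C into the gap at postingoff, and
 *	4. stores E in newitem->t_tid.
 *
 * Result: nposting = [A, B, C, D] (same size as oposting), and caller
 * inserts newitem, which now carries E.
 */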
796 
797 /*
798  * Verify posting list invariants for "posting", which must be a posting list
799  * tuple. Used within assertions.
800  */
801 #ifdef USE_ASSERT_CHECKING
802 static bool
803 _bt_posting_valid(IndexTuple posting)
804 {
805  ItemPointerData last;
806  ItemPointer htid;
807 
808  if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2)
809  return false;
810 
811  /* Remember first heap TID for loop */
812  ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last);
813  if (!ItemPointerIsValid(&last))
814  return false;
815 
816  /* Iterate, starting from second TID */
817  for (int i = 1; i < BTreeTupleGetNPosting(posting); i++)
818  {
819  htid = BTreeTupleGetPostingN(posting, i);
820 
821  if (!ItemPointerIsValid(htid))
822  return false;
823  if (ItemPointerCompare(htid, &last) <= 0)
824  return false;
825  ItemPointerCopy(htid, &last);
826  }
827 
828  return true;
829 }
830 #endif