/*-------------------------------------------------------------------------
 *
 * nbtxlog.c
 *    WAL replay logic for btrees.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/access/nbtree/nbtxlog.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/bufmask.h"
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xlogutils.h"
#include "storage/procarray.h"
#include "miscadmin.h"

/*
 * _bt_restore_page -- re-enter all the index tuples on a page
 *
 * The page is freshly init'd, and *from (length len) is a copy of what
 * had been its upper part (pd_upper to pd_special).  We assume that the
 * tuples had been added to the page in item-number order, and therefore
 * the one with highest item number appears first (lowest on the page).
 */
static void
_bt_restore_page(Page page, char *from, int len)
{
    IndexTupleData itupdata;
    Size        itemsz;
    char       *end = from + len;
    Item        items[MaxIndexTuplesPerPage];
    uint16      itemsizes[MaxIndexTuplesPerPage];
    int         i;
    int         nitems;

    /*
     * To get the items back in the original order, we add them to the page
     * in reverse.  To figure out where one tuple ends and another begins, we
     * have to scan them in forward order first.
     */
    i = 0;
    while (from < end)
    {
        /*
         * As we step through the items, 'from' won't always be properly
         * aligned, so we need to use memcpy().  Further, we use Item (which
         * is just a char*) here for our items array for the same reason;
         * wouldn't want the compiler or anyone thinking that an item is
         * aligned when it isn't.
         */
        memcpy(&itupdata, from, sizeof(IndexTupleData));
        itemsz = IndexTupleSize(&itupdata);
        itemsz = MAXALIGN(itemsz);

        items[i] = (Item) from;
        itemsizes[i] = itemsz;
        i++;

        from += itemsz;
    }
    nitems = i;

    for (i = nitems - 1; i >= 0; i--)
    {
        if (PageAddItem(page, items[i], itemsizes[i], nitems - i,
                        false, false) == InvalidOffsetNumber)
            elog(PANIC, "_bt_restore_page: cannot add item to page");
        from += itemsz;         /* harmless leftover: 'from' is not used again */
    }
}

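/*
 * Editor's sketch (worked example, not part of the upstream file): suppose
 * the backup blob holds three MAXALIGN'd tuples.  Tuples are stored downward
 * from pd_special, so the forward scan over the blob meets the tuple with
 * the highest item number first: items[] = {item3, item2, item1}.  The
 * reverse loop then replays PageAddItem(items[2], ..., offset 1),
 * PageAddItem(items[1], ..., offset 2), PageAddItem(items[0], ..., offset 3),
 * reproducing both the original item numbers and the original physical
 * layout on the page.
 */
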
static void
_bt_restore_meta(XLogReaderState *record, uint8 block_id)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    Buffer      metabuf;
    Page        metapg;
    BTMetaPageData *md;
    BTPageOpaque pageop;
    xl_btree_metadata *xlrec;
    char       *ptr;
    Size        len;

    metabuf = XLogInitBufferForRedo(record, block_id);
    ptr = XLogRecGetBlockData(record, block_id, &len);

    Assert(len == sizeof(xl_btree_metadata));
    Assert(BufferGetBlockNumber(metabuf) == BTREE_METAPAGE);
    xlrec = (xl_btree_metadata *) ptr;
    metapg = BufferGetPage(metabuf);

    _bt_pageinit(metapg, BufferGetPageSize(metabuf));

    md = BTPageGetMeta(metapg);
    md->btm_magic = BTREE_MAGIC;
    md->btm_version = xlrec->version;
    md->btm_root = xlrec->root;
    md->btm_level = xlrec->level;
    md->btm_fastroot = xlrec->fastroot;
    md->btm_fastlevel = xlrec->fastlevel;
    md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact;
    md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples;

    pageop = (BTPageOpaque) PageGetSpecialPointer(metapg);
    pageop->btpo_flags = BTP_META;

    /*
     * Set pd_lower just past the end of the metadata.  This is essential,
     * because without doing so, metadata will be lost if xlog.c compresses
     * the page.
     */
    ((PageHeader) metapg)->pd_lower =
        ((char *) md + sizeof(BTMetaPageData)) - (char *) metapg;

    PageSetLSN(metapg, lsn);
    MarkBufferDirty(metabuf);
    UnlockReleaseBuffer(metabuf);
}

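/*
 * Editor's sketch (assumed layout, not part of the upstream file): xlog.c's
 * full-page images may omit the "hole" between pd_lower and pd_upper, which
 * is why _bt_restore_meta() pulls pd_lower past the metadata:
 *
 *   +----------------+----------------+-- hole (may be dropped) --+---------+
 *   | PageHeaderData | BTMetaPageData |                           | special |
 *   +----------------+----------------+---------------------------+---------+
 *                                     ^ pd_lower                  ^ pd_upper
 */
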
/*
 * _bt_clear_incomplete_split -- clear INCOMPLETE_SPLIT flag on a page
 *
 * This is a common subroutine of the redo functions of all the WAL record
 * types that can insert a downlink: insert, split, and newroot.
 */
static void
_bt_clear_incomplete_split(XLogReaderState *record, uint8 block_id)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    Buffer      buf;

    if (XLogReadBufferForRedo(record, block_id, &buf) == BLK_NEEDS_REDO)
    {
        Page        page = (Page) BufferGetPage(buf);
        BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);

        Assert(P_INCOMPLETE_SPLIT(pageop));
        pageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;

        PageSetLSN(page, lsn);
        MarkBufferDirty(buf);
    }
    if (BufferIsValid(buf))
        UnlockReleaseBuffer(buf);
}

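/*
 * Editor's note (summary of the callers in this file, for orientation): the
 * child page carrying the INCOMPLETE_SPLIT flag is registered under a
 * different block_id by each record type -- block 1 in btree_xlog_insert()
 * and btree_xlog_newroot(), block 3 in btree_xlog_split().
 */
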
static void
btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record);
    Buffer      buffer;
    Page        page;

    /*
     * Insertion to an internal page finishes an incomplete split at the
     * child level.  Clear the incomplete-split flag in the child.  Note:
     * during normal operation, the child and parent pages are locked at the
     * same time, so that clearing the flag and inserting the downlink appear
     * atomic to other backends.  We don't bother with that during replay,
     * because readers don't care about the incomplete-split flag and there
     * cannot be updates happening.
     */
    if (!isleaf)
        _bt_clear_incomplete_split(record, 1);
    if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
    {
        Size        datalen;
        char       *datapos = XLogRecGetBlockData(record, 0, &datalen);

        page = BufferGetPage(buffer);

        if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum,
                        false, false) == InvalidOffsetNumber)
            elog(PANIC, "btree_xlog_insert: failed to add item");

        PageSetLSN(page, lsn);
        MarkBufferDirty(buffer);
    }
    if (BufferIsValid(buffer))
        UnlockReleaseBuffer(buffer);

    /*
     * Note: in normal operation, we'd update the metapage while still
     * holding lock on the page we inserted into.  But during replay it's not
     * necessary to hold that lock, since no other index updates can be
     * happening concurrently, and readers will cope fine with following an
     * obsolete link from the metapage.
     */
    if (ismeta)
        _bt_restore_meta(record, 2);
}

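/*
 * Editor's note (summary of the block references used above, for
 * orientation): block 0 is the page receiving the new tuple, block 1 (only
 * present for internal-page inserts) is the child whose incomplete split is
 * being finished, and block 2 (only for XLOG_BTREE_INSERT_META) is the
 * metapage handed to _bt_restore_meta().
 */
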
static void
btree_xlog_split(bool onleft, XLogReaderState *record)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
    bool        isleaf = (xlrec->level == 0);
    Buffer      lbuf;
    Buffer      rbuf;
    Page        rpage;
    BTPageOpaque ropaque;
    char       *datapos;
    Size        datalen;
    BlockNumber leftsib;
    BlockNumber rightsib;
    BlockNumber rnext;

    XLogRecGetBlockTag(record, 0, NULL, NULL, &leftsib);
    XLogRecGetBlockTag(record, 1, NULL, NULL, &rightsib);
    if (!XLogRecGetBlockTag(record, 2, NULL, NULL, &rnext))
        rnext = P_NONE;

    /*
     * Clear the incomplete split flag on the left sibling of the child page
     * this is a downlink for.  (Like in btree_xlog_insert, this can be done
     * before locking the other pages)
     */
    if (!isleaf)
        _bt_clear_incomplete_split(record, 3);

    /* Reconstruct right (new) sibling page from scratch */
    rbuf = XLogInitBufferForRedo(record, 1);
    datapos = XLogRecGetBlockData(record, 1, &datalen);
    rpage = (Page) BufferGetPage(rbuf);

    _bt_pageinit(rpage, BufferGetPageSize(rbuf));
    ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);

    ropaque->btpo_prev = leftsib;
    ropaque->btpo_next = rnext;
    ropaque->btpo.level = xlrec->level;
    ropaque->btpo_flags = isleaf ? BTP_LEAF : 0;
    ropaque->btpo_cycleid = 0;

    _bt_restore_page(rpage, datapos, datalen);

    PageSetLSN(rpage, lsn);
    MarkBufferDirty(rbuf);

    /* Now reconstruct left (original) sibling page */
    if (XLogReadBufferForRedo(record, 0, &lbuf) == BLK_NEEDS_REDO)
    {
        /*
         * To retain the same physical order of the tuples that they had, we
         * initialize a temporary empty page for the left page and add all
         * the items to that in item number order.  This mirrors how
         * _bt_split() works.  Retaining the same physical order makes WAL
         * consistency checking possible.  See also _bt_restore_page(), which
         * does the same for the right page.
         */
        Page        lpage = (Page) BufferGetPage(lbuf);
        BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
        OffsetNumber off;
        IndexTuple  newitem = NULL,
                    left_hikey = NULL;
        Size        newitemsz = 0,
                    left_hikeysz = 0;
        Page        newlpage;
        OffsetNumber leftoff;

        datapos = XLogRecGetBlockData(record, 0, &datalen);

        if (onleft)
        {
            newitem = (IndexTuple) datapos;
            newitemsz = MAXALIGN(IndexTupleSize(newitem));
            datapos += newitemsz;
            datalen -= newitemsz;
        }

        /* Extract left hikey and its size (assuming 16-bit alignment) */
        left_hikey = (IndexTuple) datapos;
        left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey));
        datapos += left_hikeysz;
        datalen -= left_hikeysz;

        Assert(datalen == 0);

        newlpage = PageGetTempPageCopySpecial(lpage);

        /* Set high key */
        leftoff = P_HIKEY;
        if (PageAddItem(newlpage, (Item) left_hikey, left_hikeysz,
                        P_HIKEY, false, false) == InvalidOffsetNumber)
            elog(PANIC, "failed to add high key to left page after split");
        leftoff = OffsetNumberNext(leftoff);

        for (off = P_FIRSTDATAKEY(lopaque); off < xlrec->firstright; off++)
        {
            ItemId      itemid;
            Size        itemsz;
            IndexTuple  item;

            /* add the new item if it was inserted on left page */
            if (onleft && off == xlrec->newitemoff)
            {
                if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff,
                                false, false) == InvalidOffsetNumber)
                    elog(ERROR, "failed to add new item to left page after split");
                leftoff = OffsetNumberNext(leftoff);
            }

            itemid = PageGetItemId(lpage, off);
            itemsz = ItemIdGetLength(itemid);
            item = (IndexTuple) PageGetItem(lpage, itemid);
            if (PageAddItem(newlpage, (Item) item, itemsz, leftoff,
                            false, false) == InvalidOffsetNumber)
                elog(ERROR, "failed to add old item to left page after split");
            leftoff = OffsetNumberNext(leftoff);
        }

        /* cope with possibility that newitem goes at the end */
        if (onleft && off == xlrec->newitemoff)
        {
            if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff,
                            false, false) == InvalidOffsetNumber)
                elog(ERROR, "failed to add new item to left page after split");
            leftoff = OffsetNumberNext(leftoff);
        }

        PageRestoreTempPage(newlpage, lpage);

        /* Fix opaque fields */
        lopaque->btpo_flags = BTP_INCOMPLETE_SPLIT;
        if (isleaf)
            lopaque->btpo_flags |= BTP_LEAF;
        lopaque->btpo_next = rightsib;
        lopaque->btpo_cycleid = 0;

        PageSetLSN(lpage, lsn);
        MarkBufferDirty(lbuf);
    }

    /*
     * We no longer need the buffers.  They must be released together, so
     * that readers cannot observe two inconsistent halves.
     */
    if (BufferIsValid(lbuf))
        UnlockReleaseBuffer(lbuf);
    UnlockReleaseBuffer(rbuf);

    /*
     * Fix left-link of the page to the right of the new right sibling.
     *
     * Note: in normal operation, we do this while still holding lock on the
     * two split pages.  However, that's not necessary for correctness in WAL
     * replay, because no other index update can be in progress, and readers
     * will cope properly when following an obsolete left-link.
     */
    if (rnext != P_NONE)
    {
        Buffer      buffer;

        if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO)
        {
            Page        page = (Page) BufferGetPage(buffer);
            BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);

            pageop->btpo_prev = rightsib;

            PageSetLSN(page, lsn);
            MarkBufferDirty(buffer);
        }
        if (BufferIsValid(buffer))
            UnlockReleaseBuffer(buffer);
    }
}

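/*
 * Editor's note (summary of the block references used above): block 0 is
 * the original (left) page, block 1 is the new right page, block 2
 * (optional) is the old right sibling whose left-link must be updated, and
 * block 3 (non-leaf splits only) is the child whose incomplete-split flag
 * is cleared.  The right page is rebuilt wholesale via _bt_restore_page(),
 * while the left page is rebuilt on a temporary page so its physical tuple
 * order matches what _bt_split() produced on the primary.
 */
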
static void
btree_xlog_vacuum(XLogReaderState *record)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    Buffer      buffer;
    Page        page;
    BTPageOpaque opaque;
#ifdef UNUSED
    xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record);

    /*
     * This section of code is thought to be no longer needed, after analysis
     * of the calling paths.  It is retained to allow the code to be
     * reinstated if a flaw is revealed in that thinking.
     *
     * If we are running non-MVCC scans using this index we need to do some
     * additional work to ensure correctness, which is known as a "pin scan"
     * described in more detail in next paragraphs.  We used to do the extra
     * work in all cases, whereas we now avoid that work in most cases.  If
     * lastBlockVacuumed is set to InvalidBlockNumber then we skip the
     * additional work required for the pin scan.
     *
     * Avoiding this extra work is important since it requires us to touch
     * every page in the index, so is an O(N) operation.  Worse, it is an
     * operation performed in the foreground during redo, so it delays
     * replication directly.
     *
     * If queries might be active then we need to ensure every leaf page is
     * unpinned between the lastBlockVacuumed and the current block, if there
     * are any.  This prevents replay of the VACUUM from reaching the stage
     * of removing heap tuples while there could still be indexscans "in
     * flight" to those particular tuples for those scans which could be
     * confused by finding new tuples at the old TID locations (see
     * nbtree/README).
     *
     * It might be worth checking if there are actually any backends running;
     * if not, we could just skip this.
     *
     * Since VACUUM can visit leaf pages out-of-order, it might issue records
     * with lastBlockVacuumed >= block; that's not an error, it just means
     * nothing to do now.
     *
     * Note: since we touch all pages in the range, we will lock non-leaf
     * pages, and also any empty (all-zero) pages that may be in the index.
     * It doesn't seem worth the complexity to avoid that.  But it's
     * important that HotStandbyActiveInReplay() will not return true if the
     * database isn't yet consistent; so we need not fear reading
     * still-corrupt blocks here during crash recovery.
     */
    if (HotStandbyActiveInReplay() && BlockNumberIsValid(xlrec->lastBlockVacuumed))
    {
        RelFileNode thisrnode;
        BlockNumber thisblkno;
        BlockNumber blkno;

        XLogRecGetBlockTag(record, 0, &thisrnode, NULL, &thisblkno);

        for (blkno = xlrec->lastBlockVacuumed + 1; blkno < thisblkno; blkno++)
        {
            /*
             * We use RBM_NORMAL_NO_LOG mode because it's not an error
             * condition to see all-zero pages.  The original btvacuumpage
             * scan would have skipped over all-zero pages, noting them in
             * FSM but not bothering to initialize them just yet; so we
             * mustn't throw an error here.  (We could skip acquiring the
             * cleanup lock if PageIsNew, but it's probably not worth the
             * cycles to test.)
             *
             * XXX we don't actually need to read the block, we just need to
             * confirm it is unpinned.  If we had a special call into the
             * buffer manager we could optimise this so that if the block is
             * not in shared_buffers we confirm it as unpinned.  Optimizing
             * this is now moot, since in most cases we avoid the scan.
             */
            buffer = XLogReadBufferExtended(thisrnode, MAIN_FORKNUM, blkno,
                                            RBM_NORMAL_NO_LOG);
            if (BufferIsValid(buffer))
            {
                LockBufferForCleanup(buffer);
                UnlockReleaseBuffer(buffer);
            }
        }
    }
#endif

    /*
     * Like in btvacuumpage(), we need to take a cleanup lock on every leaf
     * page.  See nbtree/README for details.
     */
    if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer)
        == BLK_NEEDS_REDO)
    {
        char       *ptr;
        Size        len;

        ptr = XLogRecGetBlockData(record, 0, &len);

        page = (Page) BufferGetPage(buffer);

        if (len > 0)
        {
            OffsetNumber *unused;
            OffsetNumber *unend;

            unused = (OffsetNumber *) ptr;
            unend = (OffsetNumber *) ((char *) ptr + len);

            if ((unend - unused) > 0)
                PageIndexMultiDelete(page, unused, unend - unused);
        }

        /*
         * Mark the page as not containing any LP_DEAD items --- see comments
         * in _bt_delitems_vacuum().
         */
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

        PageSetLSN(page, lsn);
        MarkBufferDirty(buffer);
    }
    if (BufferIsValid(buffer))
        UnlockReleaseBuffer(buffer);
}

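/*
 * Editor's sketch (record layout, inferred from the code above): the block 0
 * data of an XLOG_BTREE_VACUUM record is simply a packed array of
 * OffsetNumbers naming the items to remove, e.g.
 *
 *   OffsetNumber unused[] = {2, 5, 9};    -- len == 3 * sizeof(OffsetNumber)
 *
 * so (unend - unused) is the item count passed to PageIndexMultiDelete().
 */
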
static void
btree_xlog_delete(XLogReaderState *record)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
    Buffer      buffer;
    Page        page;
    BTPageOpaque opaque;

    /*
     * If we have any conflict processing to do, it must happen before we
     * update the page.
     *
     * Btree delete records can conflict with standby queries.  You might
     * think that vacuum records would conflict as well, but we've handled
     * that already.  XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
     * cleaned by the vacuum of the heap and so we can resolve any conflicts
     * just once when that arrives.  After that we know that no conflicts
     * exist from individual btree vacuum records on that index.
     */
    if (InHotStandby)
    {
        RelFileNode rnode;

        XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);

        ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, rnode);
    }

    /*
     * We don't need to take a cleanup lock to apply these changes.  See
     * nbtree/README for details.
     */
    if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
    {
        page = (Page) BufferGetPage(buffer);

        if (XLogRecGetDataLen(record) > SizeOfBtreeDelete)
        {
            OffsetNumber *unused;

            unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);

            PageIndexMultiDelete(page, unused, xlrec->nitems);
        }

        /*
         * Mark the page as not containing any LP_DEAD items --- see comments
         * in _bt_delitems_delete().
         */
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

        PageSetLSN(page, lsn);
        MarkBufferDirty(buffer);
    }
    if (BufferIsValid(buffer))
        UnlockReleaseBuffer(buffer);
}

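/*
 * Editor's note (record layout, inferred from the code above): an
 * XLOG_BTREE_DELETE record carries the xl_btree_delete header followed
 * immediately by xlrec->nitems OffsetNumbers, which is why the offset array
 * is found at (char *) xlrec + SizeOfBtreeDelete.  Unlike vacuum records,
 * delete records may conflict with standby queries, hence the
 * ResolveRecoveryConflictWithSnapshot() call before the page is touched.
 */
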
static void
btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    xl_btree_mark_page_halfdead *xlrec = (xl_btree_mark_page_halfdead *) XLogRecGetData(record);
    Buffer      buffer;
    Page        page;
    BTPageOpaque pageop;
    IndexTupleData trunctuple;

    /*
     * In normal operation, we would lock all the pages this WAL record
     * touches before changing any of them.  In WAL replay, it should be okay
     * to lock just one page at a time, since no concurrent index updates can
     * be happening, and readers should not care whether they arrive at the
     * target page or not (since it's surely empty).
     */

    /* parent page */
    if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
    {
        OffsetNumber poffset;
        ItemId      itemid;
        IndexTuple  itup;
        OffsetNumber nextoffset;
        BlockNumber rightsib;

        page = (Page) BufferGetPage(buffer);
        pageop = (BTPageOpaque) PageGetSpecialPointer(page);

        poffset = xlrec->poffset;

        nextoffset = OffsetNumberNext(poffset);
        itemid = PageGetItemId(page, nextoffset);
        itup = (IndexTuple) PageGetItem(page, itemid);
        rightsib = BTreeInnerTupleGetDownLink(itup);

        itemid = PageGetItemId(page, poffset);
        itup = (IndexTuple) PageGetItem(page, itemid);
        BTreeInnerTupleSetDownLink(itup, rightsib);
        nextoffset = OffsetNumberNext(poffset);
        PageIndexTupleDelete(page, nextoffset);

        PageSetLSN(page, lsn);
        MarkBufferDirty(buffer);
    }
    if (BufferIsValid(buffer))
        UnlockReleaseBuffer(buffer);

    /* Rewrite the leaf page as a halfdead page */
    buffer = XLogInitBufferForRedo(record, 0);
    page = (Page) BufferGetPage(buffer);

    _bt_pageinit(page, BufferGetPageSize(buffer));
    pageop = (BTPageOpaque) PageGetSpecialPointer(page);

    pageop->btpo_prev = xlrec->leftblk;
    pageop->btpo_next = xlrec->rightblk;
    pageop->btpo.level = 0;
    pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
    pageop->btpo_cycleid = 0;

    /*
     * Construct a dummy hikey item that points to the next parent to be
     * deleted (if any).
     */
    MemSet(&trunctuple, 0, sizeof(IndexTupleData));
    trunctuple.t_info = sizeof(IndexTupleData);
    BTreeTupleSetTopParent(&trunctuple, xlrec->topparent);

    if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
                    false, false) == InvalidOffsetNumber)
        elog(ERROR, "could not add dummy high key to half-dead page");

    PageSetLSN(page, lsn);
    MarkBufferDirty(buffer);
    UnlockReleaseBuffer(buffer);
}


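/*
 * Editor's sketch (worked example of the parent-page fixup above, not part
 * of the upstream file): if the parent holds downlink(target) at poffset and
 * downlink(rightsib) at poffset+1, the code copies rightsib's downlink into
 * the tuple at poffset and then deletes the tuple at poffset+1, leaving a
 * single entry that routes directly to rightsib while the target leaf is
 * rewritten as half-dead.
 */
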
static void
btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record);
    BlockNumber leftsib;
    BlockNumber rightsib;
    Buffer      buffer;
    Page        page;
    BTPageOpaque pageop;

    leftsib = xlrec->leftsib;
    rightsib = xlrec->rightsib;

    /*
     * In normal operation, we would lock all the pages this WAL record
     * touches before changing any of them.  In WAL replay, it should be okay
     * to lock just one page at a time, since no concurrent index updates can
     * be happening, and readers should not care whether they arrive at the
     * target page or not (since it's surely empty).
     */

    /* Fix left-link of right sibling */
    if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO)
    {
        page = (Page) BufferGetPage(buffer);
        pageop = (BTPageOpaque) PageGetSpecialPointer(page);
        pageop->btpo_prev = leftsib;

        PageSetLSN(page, lsn);
        MarkBufferDirty(buffer);
    }
    if (BufferIsValid(buffer))
        UnlockReleaseBuffer(buffer);

    /* Fix right-link of left sibling, if any */
    if (leftsib != P_NONE)
    {
        if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
        {
            page = (Page) BufferGetPage(buffer);
            pageop = (BTPageOpaque) PageGetSpecialPointer(page);
            pageop->btpo_next = rightsib;

            PageSetLSN(page, lsn);
            MarkBufferDirty(buffer);
        }
        if (BufferIsValid(buffer))
            UnlockReleaseBuffer(buffer);
    }

    /* Rewrite target page as empty deleted page */
    buffer = XLogInitBufferForRedo(record, 0);
    page = (Page) BufferGetPage(buffer);

    _bt_pageinit(page, BufferGetPageSize(buffer));
    pageop = (BTPageOpaque) PageGetSpecialPointer(page);

    pageop->btpo_prev = leftsib;
    pageop->btpo_next = rightsib;
    pageop->btpo.xact = xlrec->btpo_xact;
    pageop->btpo_flags = BTP_DELETED;
    pageop->btpo_cycleid = 0;

    PageSetLSN(page, lsn);
    MarkBufferDirty(buffer);
    UnlockReleaseBuffer(buffer);

    /*
     * If we deleted a parent of the targeted leaf page, instead of the leaf
     * itself, update the leaf to point to the next remaining child in the
     * branch.
     */
    if (XLogRecHasBlockRef(record, 3))
    {
        /*
         * There is no real data on the page, so we just re-create it from
         * scratch using the information from the WAL record.
         */
        IndexTupleData trunctuple;

        buffer = XLogInitBufferForRedo(record, 3);
        page = (Page) BufferGetPage(buffer);

        _bt_pageinit(page, BufferGetPageSize(buffer));
        pageop = (BTPageOpaque) PageGetSpecialPointer(page);

        pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
        pageop->btpo_prev = xlrec->leafleftsib;
        pageop->btpo_next = xlrec->leafrightsib;
        pageop->btpo.level = 0;
        pageop->btpo_cycleid = 0;

        /* Add a dummy hikey item */
        MemSet(&trunctuple, 0, sizeof(IndexTupleData));
        trunctuple.t_info = sizeof(IndexTupleData);
        BTreeTupleSetTopParent(&trunctuple, xlrec->topparent);

        if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
                        false, false) == InvalidOffsetNumber)
            elog(ERROR, "could not add dummy high key to half-dead page");

        PageSetLSN(page, lsn);
        MarkBufferDirty(buffer);
        UnlockReleaseBuffer(buffer);
    }

    /* Update metapage if needed */
    if (info == XLOG_BTREE_UNLINK_PAGE_META)
        _bt_restore_meta(record, 4);
}

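/*
 * Editor's note (summary of the block references used above): block 0 is
 * the page being unlinked, block 1 is its left sibling (if any), block 2
 * its right sibling, block 3 (optional) the leaf page re-created as
 * half-dead when an internal parent was unlinked instead, and block 4 the
 * metapage for XLOG_BTREE_UNLINK_PAGE_META records.
 */
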
static void
btree_xlog_newroot(XLogReaderState *record)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record);
    Buffer      buffer;
    Page        page;
    BTPageOpaque pageop;
    char       *ptr;
    Size        len;

    buffer = XLogInitBufferForRedo(record, 0);
    page = (Page) BufferGetPage(buffer);

    _bt_pageinit(page, BufferGetPageSize(buffer));
    pageop = (BTPageOpaque) PageGetSpecialPointer(page);

    pageop->btpo_flags = BTP_ROOT;
    pageop->btpo_prev = pageop->btpo_next = P_NONE;
    pageop->btpo.level = xlrec->level;
    if (xlrec->level == 0)
        pageop->btpo_flags |= BTP_LEAF;
    pageop->btpo_cycleid = 0;

    if (xlrec->level > 0)
    {
        ptr = XLogRecGetBlockData(record, 0, &len);
        _bt_restore_page(page, ptr, len);

        /* Clear the incomplete-split flag in left child */
        _bt_clear_incomplete_split(record, 1);
    }

    PageSetLSN(page, lsn);
    MarkBufferDirty(buffer);
    UnlockReleaseBuffer(buffer);

    _bt_restore_meta(record, 2);
}

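/*
 * Editor's note: a level-zero newroot record is written when the very first
 * root is created, so the page stays empty and carries no block data.  For
 * level > 0 (a root split) the record's block 0 data holds the new root's
 * tuples, restored via _bt_restore_page(), and the left child registered as
 * block 1 has its incomplete-split flag cleared.
 */
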
static void
btree_xlog_reuse_page(XLogReaderState *record)
{
    xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record);

    /*
     * Btree reuse_page records exist to provide a conflict point when we
     * reuse pages in the index via the FSM.  That's all they do though.
     *
     * latestRemovedXid was the page's btpo.xact.  The btpo.xact <
     * RecentGlobalXmin test in _bt_page_recyclable() conceptually mirrors
     * the pgxact->xmin > limitXmin test in GetConflictingVirtualXIDs().
     * Consequently, one XID value achieves the same exclusion effect on
     * master and standby.
     */
    if (InHotStandby)
    {
        ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid,
                                            xlrec->node);
    }
}

void
btree_redo(XLogReaderState *record)
{
    uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

    switch (info)
    {
        case XLOG_BTREE_INSERT_LEAF:
            btree_xlog_insert(true, false, record);
            break;
        case XLOG_BTREE_INSERT_UPPER:
            btree_xlog_insert(false, false, record);
            break;
        case XLOG_BTREE_INSERT_META:
            btree_xlog_insert(false, true, record);
            break;
        case XLOG_BTREE_SPLIT_L:
            btree_xlog_split(true, record);
            break;
        case XLOG_BTREE_SPLIT_R:
            btree_xlog_split(false, record);
            break;
        case XLOG_BTREE_VACUUM:
            btree_xlog_vacuum(record);
            break;
        case XLOG_BTREE_DELETE:
            btree_xlog_delete(record);
            break;
        case XLOG_BTREE_MARK_PAGE_HALFDEAD:
            btree_xlog_mark_page_halfdead(info, record);
            break;
        case XLOG_BTREE_UNLINK_PAGE:
        case XLOG_BTREE_UNLINK_PAGE_META:
            btree_xlog_unlink_page(info, record);
            break;
        case XLOG_BTREE_NEWROOT:
            btree_xlog_newroot(record);
            break;
        case XLOG_BTREE_REUSE_PAGE:
            btree_xlog_reuse_page(record);
            break;
        case XLOG_BTREE_META_CLEANUP:
            _bt_restore_meta(record, 0);
            break;
        default:
            elog(PANIC, "btree_redo: unknown op code %u", info);
    }
}

/*
 * Mask a btree page before performing consistency checks on it.
 */
void
btree_mask(char *pagedata, BlockNumber blkno)
{
    Page        page = (Page) pagedata;
    BTPageOpaque maskopaq;

    mask_page_lsn_and_checksum(page);

    mask_page_hint_bits(page);
    mask_unused_space(page);

    maskopaq = (BTPageOpaque) PageGetSpecialPointer(page);

    if (P_ISDELETED(maskopaq))
    {
        /*
         * Mask page content on a DELETED page since it will be
         * re-initialized during replay.  See btree_xlog_unlink_page() for
         * details.
         */
        mask_page_content(page);
    }
    else if (P_ISLEAF(maskopaq))
    {
        /*
         * In btree leaf pages, it is possible to modify the LP_FLAGS without
         * emitting any WAL record.  Hence, mask the line pointer flags.  See
         * _bt_killitems(), _bt_check_unique() for details.
         */
        mask_lp_flags(page);
    }

    /*
     * BTP_HAS_GARBAGE is just an un-logged hint bit.  So, mask it.  See
     * _bt_killitems(), _bt_check_unique() for details.
     */
    maskopaq->btpo_flags &= ~BTP_HAS_GARBAGE;

    /*
     * During replay of a btree page split, we don't set the BTP_SPLIT_END
     * flag of the right sibling and initialize the cycle_id to 0 for the
     * same page.  See btree_xlog_split() for details.
     */
    maskopaq->btpo_flags &= ~BTP_SPLIT_END;
    maskopaq->btpo_cycleid = 0;
}
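
/*
 * Editor's note (usage, hedged): btree_mask() supports the
 * wal_consistency_checking facility.  When that GUC includes "btree", the
 * primary attaches full-page images to btree WAL records; during replay the
 * standby masks both the replayed page and the image with this function
 * before comparing them byte-for-byte, so fields that may legitimately
 * differ (LSN/checksum, hint bits, unused space, leaf LP_DEAD flags,
 * BTP_HAS_GARBAGE, BTP_SPLIT_END, cycle id) are ignored.
 */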