PostgreSQL Source Code  git master
nbtxlog.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * nbtxlog.c
4  * WAL replay logic for btrees.
5  *
6  *
7  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  * src/backend/access/nbtree/nbtxlog.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include "access/bufmask.h"
18 #include "access/nbtree.h"
19 #include "access/nbtxlog.h"
20 #include "access/transam.h"
21 #include "access/xlog.h"
22 #include "access/xlogutils.h"
23 #include "miscadmin.h"
24 #include "storage/procarray.h"
25 
26 /*
27  * _bt_restore_page -- re-enter all the index tuples on a page
28  *
29  * The page is freshly init'd, and *from (length len) is a copy of what
30  * had been its upper part (pd_upper to pd_special). We assume that the
31  * tuples had been added to the page in item-number order, and therefore
32  * the one with highest item number appears first (lowest on the page).
33  */
34 static void
35 _bt_restore_page(Page page, char *from, int len)
36 {
37  IndexTupleData itupdata;
38  Size itemsz;
39  char *end = from + len;
41  uint16 itemsizes[MaxIndexTuplesPerPage];
42  int i;
43  int nitems;
44 
45  /*
46  * To get the items back in the original order, we add them to the page in
47  * reverse. To figure out where one tuple ends and another begins, we
48  * have to scan them in forward order first.
49  */
50  i = 0;
51  while (from < end)
52  {
53  /*
54  * As we step through the items, 'from' won't always be properly
55  * aligned, so we need to use memcpy(). Further, we use Item (which
56  * is just a char*) here for our items array for the same reason;
57  * wouldn't want the compiler or anyone thinking that an item is
58  * aligned when it isn't.
59  */
60  memcpy(&itupdata, from, sizeof(IndexTupleData));
61  itemsz = IndexTupleSize(&itupdata);
62  itemsz = MAXALIGN(itemsz);
63 
64  items[i] = (Item) from;
65  itemsizes[i] = itemsz;
66  i++;
67 
68  from += itemsz;
69  }
70  nitems = i;
71 
72  for (i = nitems - 1; i >= 0; i--)
73  {
74  if (PageAddItem(page, items[i], itemsizes[i], nitems - i,
75  false, false) == InvalidOffsetNumber)
76  elog(PANIC, "_bt_restore_page: cannot add item to page");
77  from += itemsz;
78  }
79 }
80 
81 static void
83 {
84  XLogRecPtr lsn = record->EndRecPtr;
85  Buffer metabuf;
86  Page metapg;
87  BTMetaPageData *md;
88  BTPageOpaque pageop;
89  xl_btree_metadata *xlrec;
90  char *ptr;
91  Size len;
92 
93  metabuf = XLogInitBufferForRedo(record, block_id);
94  ptr = XLogRecGetBlockData(record, block_id, &len);
95 
96  Assert(len == sizeof(xl_btree_metadata));
98  xlrec = (xl_btree_metadata *) ptr;
99  metapg = BufferGetPage(metabuf);
100 
101  _bt_pageinit(metapg, BufferGetPageSize(metabuf));
102 
103  md = BTPageGetMeta(metapg);
104  md->btm_magic = BTREE_MAGIC;
105  md->btm_version = xlrec->version;
106  md->btm_root = xlrec->root;
107  md->btm_level = xlrec->level;
108  md->btm_fastroot = xlrec->fastroot;
109  md->btm_fastlevel = xlrec->fastlevel;
110  /* Cannot log BTREE_MIN_VERSION index metapage without upgrade */
114 
115  pageop = (BTPageOpaque) PageGetSpecialPointer(metapg);
116  pageop->btpo_flags = BTP_META;
117 
118  /*
119  * Set pd_lower just past the end of the metadata. This is essential,
120  * because without doing so, metadata will be lost if xlog.c compresses
121  * the page.
122  */
123  ((PageHeader) metapg)->pd_lower =
124  ((char *) md + sizeof(BTMetaPageData)) - (char *) metapg;
125 
126  PageSetLSN(metapg, lsn);
127  MarkBufferDirty(metabuf);
128  UnlockReleaseBuffer(metabuf);
129 }
130 
131 /*
132  * _bt_clear_incomplete_split -- clear INCOMPLETE_SPLIT flag on a page
133  *
134  * This is a common subroutine of the redo functions of all the WAL record
135  * types that can insert a downlink: insert, split, and newroot.
136  */
137 static void
139 {
140  XLogRecPtr lsn = record->EndRecPtr;
141  Buffer buf;
142 
143  if (XLogReadBufferForRedo(record, block_id, &buf) == BLK_NEEDS_REDO)
144  {
145  Page page = (Page) BufferGetPage(buf);
147 
148  Assert(P_INCOMPLETE_SPLIT(pageop));
149  pageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
150 
151  PageSetLSN(page, lsn);
152  MarkBufferDirty(buf);
153  }
154  if (BufferIsValid(buf))
155  UnlockReleaseBuffer(buf);
156 }
157 
158 static void
159 btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record)
160 {
161  XLogRecPtr lsn = record->EndRecPtr;
162  xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record);
163  Buffer buffer;
164  Page page;
165 
166  /*
167  * Insertion to an internal page finishes an incomplete split at the child
168  * level. Clear the incomplete-split flag in the child. Note: during
169  * normal operation, the child and parent pages are locked at the same
170  * time, so that clearing the flag and inserting the downlink appear
171  * atomic to other backends. We don't bother with that during replay,
172  * because readers don't care about the incomplete-split flag and there
173  * cannot be updates happening.
174  */
175  if (!isleaf)
176  _bt_clear_incomplete_split(record, 1);
177  if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
178  {
179  Size datalen;
180  char *datapos = XLogRecGetBlockData(record, 0, &datalen);
181 
182  page = BufferGetPage(buffer);
183 
184  if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum,
185  false, false) == InvalidOffsetNumber)
186  elog(PANIC, "btree_xlog_insert: failed to add item");
187 
188  PageSetLSN(page, lsn);
189  MarkBufferDirty(buffer);
190  }
191  if (BufferIsValid(buffer))
192  UnlockReleaseBuffer(buffer);
193 
194  /*
195  * Note: in normal operation, we'd update the metapage while still holding
196  * lock on the page we inserted into. But during replay it's not
197  * necessary to hold that lock, since no other index updates can be
198  * happening concurrently, and readers will cope fine with following an
199  * obsolete link from the metapage.
200  */
201  if (ismeta)
202  _bt_restore_meta(record, 2);
203 }
204 
205 static void
206 btree_xlog_split(bool onleft, XLogReaderState *record)
207 {
208  XLogRecPtr lsn = record->EndRecPtr;
209  xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
210  bool isleaf = (xlrec->level == 0);
211  Buffer lbuf;
212  Buffer rbuf;
213  Page rpage;
214  BTPageOpaque ropaque;
215  char *datapos;
216  Size datalen;
217  BlockNumber leftsib;
218  BlockNumber rightsib;
219  BlockNumber rnext;
220 
221  XLogRecGetBlockTag(record, 0, NULL, NULL, &leftsib);
222  XLogRecGetBlockTag(record, 1, NULL, NULL, &rightsib);
223  if (!XLogRecGetBlockTag(record, 2, NULL, NULL, &rnext))
224  rnext = P_NONE;
225 
226  /*
227  * Clear the incomplete split flag on the left sibling of the child page
228  * this is a downlink for. (Like in btree_xlog_insert, this can be done
229  * before locking the other pages)
230  */
231  if (!isleaf)
232  _bt_clear_incomplete_split(record, 3);
233 
234  /* Reconstruct right (new) sibling page from scratch */
235  rbuf = XLogInitBufferForRedo(record, 1);
236  datapos = XLogRecGetBlockData(record, 1, &datalen);
237  rpage = (Page) BufferGetPage(rbuf);
238 
239  _bt_pageinit(rpage, BufferGetPageSize(rbuf));
240  ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);
241 
242  ropaque->btpo_prev = leftsib;
243  ropaque->btpo_next = rnext;
244  ropaque->btpo.level = xlrec->level;
245  ropaque->btpo_flags = isleaf ? BTP_LEAF : 0;
246  ropaque->btpo_cycleid = 0;
247 
248  _bt_restore_page(rpage, datapos, datalen);
249 
250  PageSetLSN(rpage, lsn);
251  MarkBufferDirty(rbuf);
252 
253  /* Now reconstruct left (original) sibling page */
254  if (XLogReadBufferForRedo(record, 0, &lbuf) == BLK_NEEDS_REDO)
255  {
256  /*
257  * To retain the same physical order of the tuples that they had, we
258  * initialize a temporary empty page for the left page and add all the
259  * items to that in item number order. This mirrors how _bt_split()
260  * works. Retaining the same physical order makes WAL consistency
261  * checking possible. See also _bt_restore_page(), which does the
262  * same for the right page.
263  */
264  Page lpage = (Page) BufferGetPage(lbuf);
266  OffsetNumber off;
267  IndexTuple newitem = NULL,
268  left_hikey = NULL;
269  Size newitemsz = 0,
270  left_hikeysz = 0;
271  Page newlpage;
272  OffsetNumber leftoff;
273 
274  datapos = XLogRecGetBlockData(record, 0, &datalen);
275 
276  if (onleft)
277  {
278  newitem = (IndexTuple) datapos;
279  newitemsz = MAXALIGN(IndexTupleSize(newitem));
280  datapos += newitemsz;
281  datalen -= newitemsz;
282  }
283 
284  /*
285  * Extract left hikey and its size. We assume that 16-bit alignment
286  * is enough to apply IndexTupleSize (since it's fetching from a
287  * uint16 field).
288  */
289  left_hikey = (IndexTuple) datapos;
290  left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey));
291  datapos += left_hikeysz;
292  datalen -= left_hikeysz;
293 
294  Assert(datalen == 0);
295 
296  newlpage = PageGetTempPageCopySpecial(lpage);
297 
298  /* Set high key */
299  leftoff = P_HIKEY;
300  if (PageAddItem(newlpage, (Item) left_hikey, left_hikeysz,
301  P_HIKEY, false, false) == InvalidOffsetNumber)
302  elog(PANIC, "failed to add high key to left page after split");
303  leftoff = OffsetNumberNext(leftoff);
304 
305  for (off = P_FIRSTDATAKEY(lopaque); off < xlrec->firstright; off++)
306  {
307  ItemId itemid;
308  Size itemsz;
309  IndexTuple item;
310 
311  /* add the new item if it was inserted on left page */
312  if (onleft && off == xlrec->newitemoff)
313  {
314  if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff,
315  false, false) == InvalidOffsetNumber)
316  elog(ERROR, "failed to add new item to left page after split");
317  leftoff = OffsetNumberNext(leftoff);
318  }
319 
320  itemid = PageGetItemId(lpage, off);
321  itemsz = ItemIdGetLength(itemid);
322  item = (IndexTuple) PageGetItem(lpage, itemid);
323  if (PageAddItem(newlpage, (Item) item, itemsz, leftoff,
324  false, false) == InvalidOffsetNumber)
325  elog(ERROR, "failed to add old item to left page after split");
326  leftoff = OffsetNumberNext(leftoff);
327  }
328 
329  /* cope with possibility that newitem goes at the end */
330  if (onleft && off == xlrec->newitemoff)
331  {
332  if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff,
333  false, false) == InvalidOffsetNumber)
334  elog(ERROR, "failed to add new item to left page after split");
335  leftoff = OffsetNumberNext(leftoff);
336  }
337 
338  PageRestoreTempPage(newlpage, lpage);
339 
340  /* Fix opaque fields */
341  lopaque->btpo_flags = BTP_INCOMPLETE_SPLIT;
342  if (isleaf)
343  lopaque->btpo_flags |= BTP_LEAF;
344  lopaque->btpo_next = rightsib;
345  lopaque->btpo_cycleid = 0;
346 
347  PageSetLSN(lpage, lsn);
348  MarkBufferDirty(lbuf);
349  }
350 
351  /*
352  * We no longer need the buffers. They must be released together, so that
353  * readers cannot observe two inconsistent halves.
354  */
355  if (BufferIsValid(lbuf))
356  UnlockReleaseBuffer(lbuf);
357  UnlockReleaseBuffer(rbuf);
358 
359  /*
360  * Fix left-link of the page to the right of the new right sibling.
361  *
362  * Note: in normal operation, we do this while still holding lock on the
363  * two split pages. However, that's not necessary for correctness in WAL
364  * replay, because no other index update can be in progress, and readers
365  * will cope properly when following an obsolete left-link.
366  */
367  if (rnext != P_NONE)
368  {
369  Buffer buffer;
370 
371  if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO)
372  {
373  Page page = (Page) BufferGetPage(buffer);
375 
376  pageop->btpo_prev = rightsib;
377 
378  PageSetLSN(page, lsn);
379  MarkBufferDirty(buffer);
380  }
381  if (BufferIsValid(buffer))
382  UnlockReleaseBuffer(buffer);
383  }
384 }
385 
386 static void
388 {
389  XLogRecPtr lsn = record->EndRecPtr;
390  xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
391  Buffer buffer;
392  Page page;
393  BTPageOpaque opaque;
394 
395  /*
396  * We need to take a cleanup lock here, just like btvacuumpage(). However,
397  * it isn't necessary to exhaustively get a cleanup lock on every block in
398  * the index during recovery (just getting a cleanup lock on pages with
399  * items to kill suffices). See nbtree/README for details.
400  */
401  if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer)
402  == BLK_NEEDS_REDO)
403  {
404  char *ptr = XLogRecGetBlockData(record, 0, NULL);
405 
406  page = (Page) BufferGetPage(buffer);
407 
408  PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted);
409 
410  /*
411  * Mark the page as not containing any LP_DEAD items --- see comments
412  * in _bt_delitems_vacuum().
413  */
414  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
415  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
416 
417  PageSetLSN(page, lsn);
418  MarkBufferDirty(buffer);
419  }
420  if (BufferIsValid(buffer))
421  UnlockReleaseBuffer(buffer);
422 }
423 
424 static void
426 {
427  XLogRecPtr lsn = record->EndRecPtr;
428  xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
429  Buffer buffer;
430  Page page;
431  BTPageOpaque opaque;
432 
433  /*
434  * If we have any conflict processing to do, it must happen before we
435  * update the page
436  */
437  if (InHotStandby)
438  {
439  RelFileNode rnode;
440 
441  XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
442 
444  }
445 
446  /*
447  * We don't need to take a cleanup lock to apply these changes. See
448  * nbtree/README for details.
449  */
450  if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
451  {
452  char *ptr = XLogRecGetBlockData(record, 0, NULL);
453 
454  page = (Page) BufferGetPage(buffer);
455 
456  PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted);
457 
458  /* Mark the page as not containing any LP_DEAD items */
459  opaque = (BTPageOpaque) PageGetSpecialPointer(page);
460  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
461 
462  PageSetLSN(page, lsn);
463  MarkBufferDirty(buffer);
464  }
465  if (BufferIsValid(buffer))
466  UnlockReleaseBuffer(buffer);
467 }
468 
469 static void
471 {
472  XLogRecPtr lsn = record->EndRecPtr;
474  Buffer buffer;
475  Page page;
476  BTPageOpaque pageop;
477  IndexTupleData trunctuple;
478 
479  /*
480  * In normal operation, we would lock all the pages this WAL record
481  * touches before changing any of them. In WAL replay, it should be okay
482  * to lock just one page at a time, since no concurrent index updates can
483  * be happening, and readers should not care whether they arrive at the
484  * target page or not (since it's surely empty).
485  */
486 
487  /* parent page */
488  if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
489  {
490  OffsetNumber poffset;
491  ItemId itemid;
492  IndexTuple itup;
493  OffsetNumber nextoffset;
494  BlockNumber rightsib;
495 
496  page = (Page) BufferGetPage(buffer);
497  pageop = (BTPageOpaque) PageGetSpecialPointer(page);
498 
499  poffset = xlrec->poffset;
500 
501  nextoffset = OffsetNumberNext(poffset);
502  itemid = PageGetItemId(page, nextoffset);
503  itup = (IndexTuple) PageGetItem(page, itemid);
504  rightsib = BTreeTupleGetDownLink(itup);
505 
506  itemid = PageGetItemId(page, poffset);
507  itup = (IndexTuple) PageGetItem(page, itemid);
508  BTreeTupleSetDownLink(itup, rightsib);
509  nextoffset = OffsetNumberNext(poffset);
510  PageIndexTupleDelete(page, nextoffset);
511 
512  PageSetLSN(page, lsn);
513  MarkBufferDirty(buffer);
514  }
515  if (BufferIsValid(buffer))
516  UnlockReleaseBuffer(buffer);
517 
518  /* Rewrite the leaf page as a halfdead page */
519  buffer = XLogInitBufferForRedo(record, 0);
520  page = (Page) BufferGetPage(buffer);
521 
522  _bt_pageinit(page, BufferGetPageSize(buffer));
523  pageop = (BTPageOpaque) PageGetSpecialPointer(page);
524 
525  pageop->btpo_prev = xlrec->leftblk;
526  pageop->btpo_next = xlrec->rightblk;
527  pageop->btpo.level = 0;
528  pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
529  pageop->btpo_cycleid = 0;
530 
531  /*
532  * Construct a dummy hikey item that points to the next parent to be
533  * deleted (if any).
534  */
535  MemSet(&trunctuple, 0, sizeof(IndexTupleData));
536  trunctuple.t_info = sizeof(IndexTupleData);
537  BTreeTupleSetTopParent(&trunctuple, xlrec->topparent);
538 
539  if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
540  false, false) == InvalidOffsetNumber)
541  elog(ERROR, "could not add dummy high key to half-dead page");
542 
543  PageSetLSN(page, lsn);
544  MarkBufferDirty(buffer);
545  UnlockReleaseBuffer(buffer);
546 }
547 
548 
549 static void
551 {
552  XLogRecPtr lsn = record->EndRecPtr;
554  BlockNumber leftsib;
555  BlockNumber rightsib;
556  Buffer buffer;
557  Page page;
558  BTPageOpaque pageop;
559 
560  leftsib = xlrec->leftsib;
561  rightsib = xlrec->rightsib;
562 
563  /*
564  * In normal operation, we would lock all the pages this WAL record
565  * touches before changing any of them. In WAL replay, it should be okay
566  * to lock just one page at a time, since no concurrent index updates can
567  * be happening, and readers should not care whether they arrive at the
568  * target page or not (since it's surely empty).
569  */
570 
571  /* Fix left-link of right sibling */
572  if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO)
573  {
574  page = (Page) BufferGetPage(buffer);
575  pageop = (BTPageOpaque) PageGetSpecialPointer(page);
576  pageop->btpo_prev = leftsib;
577 
578  PageSetLSN(page, lsn);
579  MarkBufferDirty(buffer);
580  }
581  if (BufferIsValid(buffer))
582  UnlockReleaseBuffer(buffer);
583 
584  /* Fix right-link of left sibling, if any */
585  if (leftsib != P_NONE)
586  {
587  if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
588  {
589  page = (Page) BufferGetPage(buffer);
590  pageop = (BTPageOpaque) PageGetSpecialPointer(page);
591  pageop->btpo_next = rightsib;
592 
593  PageSetLSN(page, lsn);
594  MarkBufferDirty(buffer);
595  }
596  if (BufferIsValid(buffer))
597  UnlockReleaseBuffer(buffer);
598  }
599 
600  /* Rewrite target page as empty deleted page */
601  buffer = XLogInitBufferForRedo(record, 0);
602  page = (Page) BufferGetPage(buffer);
603 
604  _bt_pageinit(page, BufferGetPageSize(buffer));
605  pageop = (BTPageOpaque) PageGetSpecialPointer(page);
606 
607  pageop->btpo_prev = leftsib;
608  pageop->btpo_next = rightsib;
609  pageop->btpo.xact = xlrec->btpo_xact;
610  pageop->btpo_flags = BTP_DELETED;
611  pageop->btpo_cycleid = 0;
612 
613  PageSetLSN(page, lsn);
614  MarkBufferDirty(buffer);
615  UnlockReleaseBuffer(buffer);
616 
617  /*
618  * If we deleted a parent of the targeted leaf page, instead of the leaf
619  * itself, update the leaf to point to the next remaining child in the
620  * branch.
621  */
622  if (XLogRecHasBlockRef(record, 3))
623  {
624  /*
625  * There is no real data on the page, so we just re-create it from
626  * scratch using the information from the WAL record.
627  */
628  IndexTupleData trunctuple;
629 
630  buffer = XLogInitBufferForRedo(record, 3);
631  page = (Page) BufferGetPage(buffer);
632 
633  _bt_pageinit(page, BufferGetPageSize(buffer));
634  pageop = (BTPageOpaque) PageGetSpecialPointer(page);
635 
636  pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
637  pageop->btpo_prev = xlrec->leafleftsib;
638  pageop->btpo_next = xlrec->leafrightsib;
639  pageop->btpo.level = 0;
640  pageop->btpo_cycleid = 0;
641 
642  /* Add a dummy hikey item */
643  MemSet(&trunctuple, 0, sizeof(IndexTupleData));
644  trunctuple.t_info = sizeof(IndexTupleData);
645  BTreeTupleSetTopParent(&trunctuple, xlrec->topparent);
646 
647  if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
648  false, false) == InvalidOffsetNumber)
649  elog(ERROR, "could not add dummy high key to half-dead page");
650 
651  PageSetLSN(page, lsn);
652  MarkBufferDirty(buffer);
653  UnlockReleaseBuffer(buffer);
654  }
655 
656  /* Update metapage if needed */
657  if (info == XLOG_BTREE_UNLINK_PAGE_META)
658  _bt_restore_meta(record, 4);
659 }
660 
661 static void
663 {
664  XLogRecPtr lsn = record->EndRecPtr;
665  xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record);
666  Buffer buffer;
667  Page page;
668  BTPageOpaque pageop;
669  char *ptr;
670  Size len;
671 
672  buffer = XLogInitBufferForRedo(record, 0);
673  page = (Page) BufferGetPage(buffer);
674 
675  _bt_pageinit(page, BufferGetPageSize(buffer));
676  pageop = (BTPageOpaque) PageGetSpecialPointer(page);
677 
678  pageop->btpo_flags = BTP_ROOT;
679  pageop->btpo_prev = pageop->btpo_next = P_NONE;
680  pageop->btpo.level = xlrec->level;
681  if (xlrec->level == 0)
682  pageop->btpo_flags |= BTP_LEAF;
683  pageop->btpo_cycleid = 0;
684 
685  if (xlrec->level > 0)
686  {
687  ptr = XLogRecGetBlockData(record, 0, &len);
688  _bt_restore_page(page, ptr, len);
689 
690  /* Clear the incomplete-split flag in left child */
691  _bt_clear_incomplete_split(record, 1);
692  }
693 
694  PageSetLSN(page, lsn);
695  MarkBufferDirty(buffer);
696  UnlockReleaseBuffer(buffer);
697 
698  _bt_restore_meta(record, 2);
699 }
700 
701 static void
703 {
705 
706  /*
707  * Btree reuse_page records exist to provide a conflict point when we
708  * reuse pages in the index via the FSM. That's all they do though.
709  *
710  * latestRemovedXid was the page's btpo.xact. The btpo.xact <
711  * RecentGlobalXmin test in _bt_page_recyclable() conceptually mirrors the
712  * pgxact->xmin > limitXmin test in GetConflictingVirtualXIDs().
713  * Consequently, one XID value achieves the same exclusion effect on
714  * master and standby.
715  */
716  if (InHotStandby)
717  {
719  xlrec->node);
720  }
721 }
722 
723 void
725 {
726  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
727 
728  switch (info)
729  {
731  btree_xlog_insert(true, false, record);
732  break;
734  btree_xlog_insert(false, false, record);
735  break;
737  btree_xlog_insert(false, true, record);
738  break;
739  case XLOG_BTREE_SPLIT_L:
740  btree_xlog_split(true, record);
741  break;
742  case XLOG_BTREE_SPLIT_R:
743  btree_xlog_split(false, record);
744  break;
745  case XLOG_BTREE_VACUUM:
746  btree_xlog_vacuum(record);
747  break;
748  case XLOG_BTREE_DELETE:
749  btree_xlog_delete(record);
750  break;
752  btree_xlog_mark_page_halfdead(info, record);
753  break;
756  btree_xlog_unlink_page(info, record);
757  break;
758  case XLOG_BTREE_NEWROOT:
759  btree_xlog_newroot(record);
760  break;
762  btree_xlog_reuse_page(record);
763  break;
765  _bt_restore_meta(record, 0);
766  break;
767  default:
768  elog(PANIC, "btree_redo: unknown op code %u", info);
769  }
770 }
771 
772 /*
773  * Mask a btree page before performing consistency checks on it.
774  */
775 void
776 btree_mask(char *pagedata, BlockNumber blkno)
777 {
778  Page page = (Page) pagedata;
779  BTPageOpaque maskopaq;
780 
782 
783  mask_page_hint_bits(page);
784  mask_unused_space(page);
785 
786  maskopaq = (BTPageOpaque) PageGetSpecialPointer(page);
787 
788  if (P_ISDELETED(maskopaq))
789  {
790  /*
791  * Mask page content on a DELETED page since it will be re-initialized
792  * during replay. See btree_xlog_unlink_page() for details.
793  */
794  mask_page_content(page);
795  }
796  else if (P_ISLEAF(maskopaq))
797  {
798  /*
799  * In btree leaf pages, it is possible to modify the LP_FLAGS without
800  * emitting any WAL record. Hence, mask the line pointer flags. See
801  * _bt_killitems(), _bt_check_unique() for details.
802  */
803  mask_lp_flags(page);
804  }
805 
806  /*
807  * BTP_HAS_GARBAGE is just an un-logged hint bit. So, mask it. See
808  * _bt_killitems(), _bt_check_unique() for details.
809  */
810  maskopaq->btpo_flags &= ~BTP_HAS_GARBAGE;
811 
812  /*
813  * During replay of a btree page split, we don't set the BTP_SPLIT_END
814  * flag of the right sibling and initialize the cycle_id to 0 for the same
815  * page. See btree_xlog_split() for details.
816  */
817  maskopaq->btpo_flags &= ~BTP_SPLIT_END;
818  maskopaq->btpo_cycleid = 0;
819 }
TransactionId latestRemovedXid
Definition: nbtxlog.h:128
#define BTP_ROOT
Definition: nbtree.h:73
#define BTP_SPLIT_END
Definition: nbtree.h:77
BlockNumber btpo_next
Definition: nbtree.h:59
void PageRestoreTempPage(Page tempPage, Page oldPage)
Definition: bufpage.c:403
uint32 btm_version
Definition: nbtree.h:101
void PageIndexTupleDelete(Page page, OffsetNumber offnum)
Definition: bufpage.c:719
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1458
static void btree_xlog_vacuum(XLogReaderState *record)
Definition: nbtxlog.c:387
#define BTreeTupleGetDownLink(itup)
Definition: nbtree.h:301
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:220
uint32 btm_magic
Definition: nbtree.h:100
#define BTP_LEAF
Definition: nbtree.h:72
#define BTP_HALF_DEAD
Definition: nbtree.h:76
union BTPageOpaqueData::@46 btpo
BlockNumber root
Definition: nbtxlog.h:50
unsigned char uint8
Definition: c.h:365
Pointer Item
Definition: item.h:17
#define P_NONE
Definition: nbtree.h:182
void mask_page_hint_bits(Page page)
Definition: bufmask.c:46
#define XLOG_BTREE_INSERT_META
Definition: nbtxlog.h:28
RelFileNode node
Definition: nbtxlog.h:145
#define XLogRecHasBlockRef(decoder, block_id)
Definition: xlogreader.h:291
#define InHotStandby
Definition: xlog.h:74
uint32 level
Definition: nbtxlog.h:236
#define BTP_INCOMPLETE_SPLIT
Definition: nbtree.h:79
#define MemSet(start, val, len)
Definition: c.h:971
#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap)
Definition: bufpage.h:416
uint32 BlockNumber
Definition: block.h:31
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:197
#define BTreeTupleSetDownLink(itup, blkno)
Definition: nbtree.h:303
#define BTP_DELETED
Definition: nbtree.h:74
void btree_redo(XLogReaderState *record)
Definition: nbtxlog.c:724
static void btree_xlog_delete(XLogReaderState *record)
Definition: nbtxlog.c:425
#define PANIC
Definition: elog.h:53
TransactionId xact
Definition: nbtree.h:63
#define BTP_META
Definition: nbtree.h:75
BTPageOpaqueData * BTPageOpaque
Definition: nbtree.h:69
uint32 ndeleted
Definition: nbtxlog.h:129
void mask_unused_space(Page page)
Definition: bufmask.c:71
XLogRecPtr EndRecPtr
Definition: xlogreader.h:135
uint16 OffsetNumber
Definition: off.h:24
Page PageGetTempPageCopySpecial(Page page)
Definition: bufpage.c:381
void mask_page_content(Page page)
Definition: bufmask.c:119
BlockNumber btm_fastroot
Definition: nbtree.h:104
#define XLOG_BTREE_NEWROOT
Definition: nbtxlog.h:35
unsigned short uint16
Definition: c.h:366
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
#define BTREE_MAGIC
Definition: nbtree.h:133
static void btree_xlog_newroot(XLogReaderState *record)
Definition: nbtxlog.c:662
static void _bt_restore_meta(XLogReaderState *record, uint8 block_id)
Definition: nbtxlog.c:82
#define XLogRecGetData(decoder)
Definition: xlogreader.h:288
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3398
#define ERROR
Definition: elog.h:43
static void btree_xlog_reuse_page(XLogReaderState *record)
Definition: nbtxlog.c:702
float8 last_cleanup_num_heap_tuples
Definition: nbtxlog.h:55
Buffer XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id)
Definition: xlogutils.c:306
#define XLOG_BTREE_INSERT_LEAF
Definition: nbtxlog.h:26
OffsetNumber newitemoff
Definition: nbtxlog.h:114
#define BTreeTupleSetTopParent(itup, blkno)
Definition: nbtree.h:314
BTCycleId btpo_cycleid
Definition: nbtree.h:66
TransactionId oldest_btpo_xact
Definition: nbtxlog.h:54
#define BTPageGetMeta(p)
Definition: nbtree.h:113
BlockNumber btpo_prev
Definition: nbtree.h:58
static void _bt_clear_incomplete_split(XLogReaderState *record, uint8 block_id)
Definition: nbtxlog.c:138
static char * buf
Definition: pg_test_fsync.c:67
IndexTupleData * IndexTuple
Definition: itup.h:53
#define XLOG_BTREE_VACUUM
Definition: nbtxlog.h:37
static void btree_xlog_split(bool onleft, XLogReaderState *record)
Definition: nbtxlog.c:206
#define XLOG_BTREE_UNLINK_PAGE
Definition: nbtxlog.h:33
#define BTREE_NOVAC_VERSION
Definition: nbtree.h:136
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
#define XLogRecGetInfo(decoder)
Definition: xlogreader.h:284
#define BTREE_METAPAGE
Definition: nbtree.h:132
#define P_ISDELETED(opaque)
Definition: nbtree.h:192
uint32 version
Definition: nbtxlog.h:49
#define XLOG_BTREE_DELETE
Definition: nbtxlog.h:32
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:235
uint32 btm_fastlevel
Definition: nbtree.h:105
uint32 level
Definition: nbtree.h:62
bool XLogRecGetBlockTag(XLogReaderState *record, uint8 block_id, RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blknum)
Definition: xlogreader.c:1481
char * XLogRecGetBlockData(XLogReaderState *record, uint8 block_id, Size *len)
Definition: xlogreader.c:1505
#define XLOG_BTREE_REUSE_PAGE
Definition: nbtxlog.h:39
void mask_page_lsn_and_checksum(Page page)
Definition: bufmask.c:31
uint32 level
Definition: nbtxlog.h:112
#define XLOG_BTREE_MARK_PAGE_HALFDEAD
Definition: nbtxlog.h:36
OffsetNumber offnum
Definition: nbtxlog.h:70
struct IndexTupleData IndexTupleData
static void _bt_restore_page(Page page, char *from, int len)
Definition: nbtxlog.c:35
#define BufferGetPageSize(buffer)
Definition: bufmgr.h:146
BlockNumber btm_root
Definition: nbtree.h:102
#define InvalidOffsetNumber
Definition: off.h:26
#define XLOG_BTREE_SPLIT_R
Definition: nbtxlog.h:30
XLogRedoAction XLogReadBufferForRedo(XLogReaderState *record, uint8 block_id, Buffer *buf)
Definition: xlogutils.c:294
PageHeaderData * PageHeader
Definition: bufpage.h:166
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:738
#define XLR_INFO_MASK
Definition: xlogrecord.h:62
OffsetNumber firstright
Definition: nbtxlog.h:113
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:828
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
size_t Size
Definition: c.h:466
#define PageGetSpecialPointer(page)
Definition: bufpage.h:326
float8 btm_last_cleanup_num_heap_tuples
Definition: nbtree.h:109
#define MAXALIGN(LEN)
Definition: c.h:691
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
static void btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
Definition: nbtxlog.c:550
#define XLOG_BTREE_INSERT_UPPER
Definition: nbtxlog.h:27
#define P_HIKEY
Definition: nbtree.h:218
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2623
#define MaxIndexTuplesPerPage
Definition: itup.h:145
static void btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record)
Definition: nbtxlog.c:470
XLogRedoAction XLogReadBufferForRedoExtended(XLogReaderState *record, uint8 block_id, ReadBufferMode mode, bool get_cleanup_lock, Buffer *buf)
Definition: xlogutils.c:331
uint32 fastlevel
Definition: nbtxlog.h:53
uint32 btm_level
Definition: nbtree.h:103
#define elog(elevel,...)
Definition: elog.h:228
uint32 level
Definition: nbtxlog.h:51
int i
void _bt_pageinit(Page page, Size size)
Definition: nbtpage.c:924
void ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode node)
Definition: standby.c:294
#define XLOG_BTREE_SPLIT_L
Definition: nbtxlog.h:29
BlockNumber fastroot
Definition: nbtxlog.h:52
#define XLOG_BTREE_UNLINK_PAGE_META
Definition: nbtxlog.h:34
TransactionId latestRemovedXid
Definition: nbtxlog.h:147
TransactionId btm_oldest_btpo_xact
Definition: nbtree.h:107
unsigned short t_info
Definition: itup.h:49
uint16 btpo_flags
Definition: nbtree.h:65
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
uint32 ndeleted
Definition: nbtxlog.h:162
int Buffer
Definition: buf.h:23
#define XLOG_BTREE_META_CLEANUP
Definition: nbtxlog.h:41
void mask_lp_flags(Page page)
Definition: bufmask.c:95
void btree_mask(char *pagedata, BlockNumber blkno)
Definition: nbtxlog.c:776
#define BTP_HAS_GARBAGE
Definition: nbtree.h:78
#define PageGetItem(page, itemId)
Definition: bufpage.h:340
Pointer Page
Definition: bufpage.h:78
#define IndexTupleSize(itup)
Definition: itup.h:71
#define P_ISLEAF(opaque)
Definition: nbtree.h:190
static void btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record)
Definition: nbtxlog.c:159