PostgreSQL Source Code  git master
nbtxlog.h
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * nbtxlog.h
4  * header file for postgres btree xlog routines
5  *
6  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * src/include/access/nbtxlog.h
10  *
11  *-------------------------------------------------------------------------
12  */
13 #ifndef NBTXLOG_H
14 #define NBTXLOG_H
15 
16 #include "access/xlogreader.h"
17 #include "lib/stringinfo.h"
18 #include "storage/off.h"
19 
20 /*
21  * XLOG records for btree operations
22  *
23  * XLOG allows us to store some information in the high 4 bits of the log
24  * record's xl_info field
25  */
26 #define XLOG_BTREE_INSERT_LEAF 0x00 /* add index tuple without split */
27 #define XLOG_BTREE_INSERT_UPPER 0x10 /* same, on a non-leaf page */
28 #define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */
29 #define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */
30 #define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */
31 /* 0x50 and 0x60 are unused */
32 #define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */
33 #define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */
34 #define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */
35 #define XLOG_BTREE_NEWROOT 0xA0 /* new root page */
36 #define XLOG_BTREE_MARK_PAGE_HALFDEAD 0xB0 /* mark a leaf as half-dead */
37 #define XLOG_BTREE_VACUUM 0xC0 /* delete entries on a page during
38  * vacuum */
39 #define XLOG_BTREE_REUSE_PAGE 0xD0 /* old page is about to be reused from
40  * FSM */
41 #define XLOG_BTREE_META_CLEANUP 0xE0 /* update cleanup-related data in the
42  * metapage */
43 
44 /*
45  * All that we need to regenerate the meta-data page
46  */
47 typedef struct xl_btree_metadata
48 {
57 
58 /*
59  * This is what we need to know about simple (without split) insert.
60  *
61  * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
62  * Note that INSERT_META implies it's not a leaf page.
63  *
64  * Backup Blk 0: original page (data contains the inserted tuple)
65  * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META
66  * Backup Blk 2: xl_btree_metadata, if INSERT_META
67  */
68 typedef struct xl_btree_insert
69 {
72 
73 #define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber))
74 
75 /*
76  * On insert with split, we save all the items going into the right sibling
77  * so that we can restore it completely from the log record. This way takes
78  * less xlog space than the normal approach, because if we did it standardly,
79  * XLogInsert would almost always think the right page is new and store its
80  * whole page image. The left page, however, is handled in the normal
81  * incremental-update fashion.
82  *
83  * Note: XLOG_BTREE_SPLIT_L and XLOG_BTREE_SPLIT_R share this data record.
84  * There are two variants to indicate whether the inserted tuple went into the
85  * left or right split page (and thus, whether the new item is stored or not).
86  * We always log the left page high key because suffix truncation can generate
87  * a new leaf high key using user-defined code. This is also necessary on
88  * internal pages, since the first right item that the left page's high key
89  * was based on will have been truncated to zero attributes in the right page
90  * (the original is unavailable from the right page).
91  *
92  * Backup Blk 0: original page / new left page
93  *
94  * The left page's data portion contains the new item, if it's the _L variant.
95  * An IndexTuple representing the high key of the left page must follow with
96  * either variant.
97  *
98  * Backup Blk 1: new right page
99  *
100  * The right page's data portion contains the right page's tuples in the form
101  * used by _bt_restore_page. This includes the new item, if it's the _R
102  * variant. The right page's tuples also include the right page's high key
103  * with either variant (moved from the left/original page during the split),
104  * unless the split happened to be of the rightmost page on its level, where
105  * there is no high key for the new right page.
106  *
107  * Backup Blk 2: next block (orig page's rightlink), if any
108  * Backup Blk 3: child's left sibling, if non-leaf split
109  */
110 typedef struct xl_btree_split
111 {
112  uint32 level; /* tree level of page being split */
113  OffsetNumber firstright; /* first item moved to right page */
114  OffsetNumber newitemoff; /* new item's offset (useful for _L variant) */
116 
117 #define SizeOfBtreeSplit (offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber))
118 
119 /*
120  * This is what we need to know about delete of individual leaf index tuples.
121  * The WAL record can represent deletion of any number of index tuples on a
122  * single index page when *not* executed by VACUUM.
123  *
124  * Backup Blk 0: index page
125  */
126 typedef struct xl_btree_delete
127 {
129  int nitems;
130 
131  /* TARGET OFFSET NUMBERS FOLLOW AT THE END */
133 
134 #define SizeOfBtreeDelete (offsetof(xl_btree_delete, nitems) + sizeof(int))
135 
136 /*
137  * This is what we need to know about page reuse within btree.
138  */
139 typedef struct xl_btree_reuse_page
140 {
145 
146 #define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page))
147 
148 /*
149  * This is what we need to know about vacuum of individual leaf index tuples.
150  * The WAL record can represent deletion of any number of index tuples on a
151  * single index page when executed by VACUUM.
152  *
153  * For MVCC scans, lastBlockVacuumed will be set to InvalidBlockNumber.
154  * For non-MVCC index scans there is an additional correctness requirement
155  * for applying these changes during recovery, which is that we must do one
156  * of these two things for every block in the index:
157  * * lock the block for cleanup and apply any required changes
158  * * EnsureBlockUnpinned()
159  * The purpose of this is to ensure that no index scans started before we
160  * finish scanning the index are still running by the time we begin to remove
161  * heap tuples.
162  *
163  * Any changes to any one block are registered on just one WAL record. All
164  * blocks that we need to run EnsureBlockUnpinned() are listed as a block range
165  * starting from the last block vacuumed through until this one. Individual
166  * block numbers aren't given.
167  *
168  * Note that the *last* WAL record in any vacuum of an index is allowed to
169  * have a zero length array of offsets. Earlier records must have at least one.
170  */
171 typedef struct xl_btree_vacuum
172 {
174 
175  /* TARGET OFFSET NUMBERS FOLLOW */
177 
178 #define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber))
179 
180 /*
181  * This is what we need to know about marking an empty branch for deletion.
182  * The target identifies the tuple removed from the parent page (note that we
183  * remove this tuple's downlink and the *following* tuple's key). Note that
184  * the leaf page is empty, so we don't need to store its content --- it is
185  * just reinitialized during recovery using the rest of the fields.
186  *
187  * Backup Blk 0: leaf block
188  * Backup Blk 1: top parent
189  */
191 {
192  OffsetNumber poffset; /* deleted tuple id in parent page */
193 
194  /* information needed to recreate the leaf page: */
195  BlockNumber leafblk; /* leaf block ultimately being deleted */
196  BlockNumber leftblk; /* leaf block's left sibling, if any */
197  BlockNumber rightblk; /* leaf block's right sibling */
198  BlockNumber topparent; /* topmost internal page in the branch */
200 
201 #define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber))
202 
203 /*
204  * This is what we need to know about deletion of a btree page. Note we do
205  * not store any content for the deleted page --- it is just rewritten as empty
206  * during recovery, apart from resetting the btpo.xact.
207  *
208  * Backup Blk 0: target block being deleted
209  * Backup Blk 1: target block's left sibling, if any
210  * Backup Blk 2: target block's right sibling
211  * Backup Blk 3: leaf block (if different from target)
212  * Backup Blk 4: metapage (if rightsib becomes new fast root)
213  */
214 typedef struct xl_btree_unlink_page
215 {
216  BlockNumber leftsib; /* target block's left sibling, if any */
217  BlockNumber rightsib; /* target block's right sibling */
218 
219  /*
220  * Information needed to recreate the leaf page, when target is an
221  * internal page.
222  */
225  BlockNumber topparent; /* next child down in the branch */
226 
227  TransactionId btpo_xact; /* value of btpo.xact for use in recovery */
228  /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */
230 
231 #define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, btpo_xact) + sizeof(TransactionId))
232 
233 /*
234  * New root log record. There are zero tuples if this is to establish an
235  * empty root, or two if it is the result of splitting an old root.
236  *
237  * Note that although this implies rewriting the metadata page, we don't need
238  * an xl_btree_metadata record --- the rootblk and level are sufficient.
239  *
240  * Backup Blk 0: new root page (2 tuples as payload, if splitting old root)
241  * Backup Blk 1: left child (if splitting an old root)
242  * Backup Blk 2: metapage
243  */
244 typedef struct xl_btree_newroot
245 {
246  BlockNumber rootblk; /* location of new root (redundant with blk 0) */
247  uint32 level; /* its tree level */
249 
250 #define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32))
251 
252 
253 /*
254  * prototypes for functions in nbtxlog.c (btree_redo, btree_mask) and nbtdesc.c (btree_desc, btree_identify)
255  */
256 extern void btree_redo(XLogReaderState *record);
257 extern void btree_desc(StringInfo buf, XLogReaderState *record);
258 extern const char *btree_identify(uint8 info);
259 extern void btree_mask(char *pagedata, BlockNumber blkno);
260 
261 #endif /* NBTXLOG_H */
BlockNumber lastBlockVacuumed
Definition: nbtxlog.h:173
void btree_mask(char *pagedata, BlockNumber blkno)
Definition: nbtxlog.c:872
BlockNumber rootblk
Definition: nbtxlog.h:246
TransactionId latestRemovedXid
Definition: nbtxlog.h:128
void btree_desc(StringInfo buf, XLogReaderState *record)
Definition: nbtdesc.c:20
uint32 TransactionId
Definition: c.h:507
struct xl_btree_split xl_btree_split
struct xl_btree_reuse_page xl_btree_reuse_page
struct xl_btree_metadata xl_btree_metadata
BlockNumber root
Definition: nbtxlog.h:50
unsigned char uint8
Definition: c.h:356
RelFileNode node
Definition: nbtxlog.h:141
uint32 level
Definition: nbtxlog.h:247
uint32 BlockNumber
Definition: block.h:31
struct xl_btree_newroot xl_btree_newroot
struct xl_btree_insert xl_btree_insert
uint16 OffsetNumber
Definition: off.h:24
struct xl_btree_delete xl_btree_delete
BlockNumber block
Definition: nbtxlog.h:142
double float8
Definition: c.h:491
float8 last_cleanup_num_heap_tuples
Definition: nbtxlog.h:55
const char * btree_identify(uint8 info)
Definition: nbtdesc.c:113
OffsetNumber newitemoff
Definition: nbtxlog.h:114
TransactionId oldest_btpo_xact
Definition: nbtxlog.h:54
struct xl_btree_mark_page_halfdead xl_btree_mark_page_halfdead
static char * buf
Definition: pg_test_fsync.c:68
unsigned int uint32
Definition: c.h:358
struct xl_btree_vacuum xl_btree_vacuum
uint32 version
Definition: nbtxlog.h:49
uint32 level
Definition: nbtxlog.h:112
OffsetNumber offnum
Definition: nbtxlog.h:70
OffsetNumber firstright
Definition: nbtxlog.h:113
uint32 fastlevel
Definition: nbtxlog.h:53
struct xl_btree_unlink_page xl_btree_unlink_page
uint32 level
Definition: nbtxlog.h:51
BlockNumber fastroot
Definition: nbtxlog.h:52
TransactionId latestRemovedXid
Definition: nbtxlog.h:143
void btree_redo(XLogReaderState *record)
Definition: nbtxlog.c:820