PostgreSQL Source Code  git master
nbtxlog.h
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * nbtxlog.h
4  * header file for postgres btree xlog routines
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * src/include/access/nbtxlog.h
10  *
11  *-------------------------------------------------------------------------
12  */
13 #ifndef NBTXLOG_H
14 #define NBTXLOG_H
15 
16 #include "access/transam.h"
17 #include "access/xlogreader.h"
18 #include "lib/stringinfo.h"
19 #include "storage/off.h"
20 
21 /*
22  * XLOG records for btree operations
23  *
24  * XLOG allows to store some information in high 4 bits of log
25  * record xl_info field
26  */
27 #define XLOG_BTREE_INSERT_LEAF 0x00 /* add index tuple without split */
28 #define XLOG_BTREE_INSERT_UPPER 0x10 /* same, on a non-leaf page */
29 #define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */
30 #define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */
31 #define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */
32 #define XLOG_BTREE_INSERT_POST 0x50 /* add index tuple with posting split */
33 #define XLOG_BTREE_DEDUP 0x60 /* deduplicate tuples for a page */
34 #define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */
35 #define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */
36 #define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */
37 #define XLOG_BTREE_NEWROOT 0xA0 /* new root page */
38 #define XLOG_BTREE_MARK_PAGE_HALFDEAD 0xB0 /* mark a leaf as half-dead */
39 #define XLOG_BTREE_VACUUM 0xC0 /* delete entries on a page during
40  * vacuum */
41 #define XLOG_BTREE_REUSE_PAGE 0xD0 /* old page is about to be reused from
42  * FSM */
43 #define XLOG_BTREE_META_CLEANUP 0xE0 /* update cleanup-related data in the
44  * metapage */
45 
46 /*
47  * All that we need to regenerate the meta-data page
48  */
49 typedef struct xl_btree_metadata
50 {
57  bool allequalimage;
59 
60 /*
61  * This is what we need to know about simple (without split) insert.
62  *
63  * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META, and
64  * INSERT_POST. Note that INSERT_META and INSERT_UPPER implies it's not a
65  * leaf page, while INSERT_POST and INSERT_LEAF imply that it must be a leaf
66  * page.
67  *
68  * Backup Blk 0: original page
69  * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META
70  * Backup Blk 2: xl_btree_metadata, if INSERT_META
71  *
72  * Note: The new tuple is actually the "original" new item in the posting
73  * list split insert case (i.e. the INSERT_POST case). A split offset for
74  * the posting list is logged before the original new item. Recovery needs
75  * both, since it must do an in-place update of the existing posting list
76  * that was split as an extra step. Also, recovery generates a "final"
77  * newitem. See _bt_swap_posting() for details on posting list splits.
78  */
79 typedef struct xl_btree_insert
80 {
82 
83  /* POSTING SPLIT OFFSET FOLLOWS (INSERT_POST case) */
84  /* NEW TUPLE ALWAYS FOLLOWS AT THE END */
86 
87 #define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber))
88 
89 /*
90  * On insert with split, we save all the items going into the right sibling
91  * so that we can restore it completely from the log record. This way takes
92  * less xlog space than the normal approach, because if we did it standardly,
93  * XLogInsert would almost always think the right page is new and store its
94  * whole page image. The left page, however, is handled in the normal
95  * incremental-update fashion.
96  *
97  * Note: XLOG_BTREE_SPLIT_L and XLOG_BTREE_SPLIT_R share this data record.
98  * There are two variants to indicate whether the inserted tuple went into the
99  * left or right split page (and thus, whether the new item is stored or not).
100  * We always log the left page high key because suffix truncation can generate
101  * a new leaf high key using user-defined code. This is also necessary on
102  * internal pages, since the firstright item that the left page's high key was
103  * based on will have been truncated to zero attributes in the right page (the
104  * separator key is unavailable from the right page).
105  *
106  * Backup Blk 0: original page / new left page
107  *
108  * The left page's data portion contains the new item, if it's the _L variant.
109  * _R variant split records generally do not have a newitem (_R variant leaf
110  * page split records that must deal with a posting list split will include an
111  * explicit newitem, though it is never used on the right page -- it is
112  * actually an orignewitem needed to update existing posting list). The new
113  * high key of the left/original page appears last of all (and must always be
114  * present).
115  *
116  * Page split records that need the REDO routine to deal with a posting list
117  * split directly will have an explicit newitem, which is actually an
118  * orignewitem (the newitem as it was before the posting list split, not
119  * after). A posting list split always has a newitem that comes immediately
120  * after the posting list being split (which would have overlapped with
121  * orignewitem prior to split). Usually REDO must deal with posting list
122  * splits with an _L variant page split record, and usually both the new
123  * posting list and the final newitem go on the left page (the existing
124  * posting list will be inserted instead of the old, and the final newitem
125  * will be inserted next to that). However, _R variant split records will
126  * include an orignewitem when the split point for the page happens to have a
127  * lastleft tuple that is also the posting list being split (leaving newitem
128  * as the page split's firstright tuple). The existence of this corner case
129  * does not change the basic fact about newitem/orignewitem for the REDO
130  * routine: it is always state used for the left page alone. (This is why the
131  * record's postingoff field isn't a reliable indicator of whether or not a
132  * posting list split occurred during the page split; a non-zero value merely
133  * indicates that the REDO routine must reconstruct a new posting list tuple
134  * that is needed for the left page.)
135  *
136  * This posting list split handling is equivalent to the xl_btree_insert REDO
137  * routine's INSERT_POST handling. While the details are more complicated
138  * here, the concept and goals are exactly the same. See _bt_swap_posting()
139  * for details on posting list splits.
140  *
141  * Backup Blk 1: new right page
142  *
143  * The right page's data portion contains the right page's tuples in the form
144  * used by _bt_restore_page. This includes the new item, if it's the _R
145  * variant. The right page's tuples also include the right page's high key
146  * with either variant (moved from the left/original page during the split),
147  * unless the split happened to be of the rightmost page on its level, where
148  * there is no high key for new right page.
149  *
150  * Backup Blk 2: next block (orig page's rightlink), if any
151  * Backup Blk 3: child's left sibling, if non-leaf split
152  */
153 typedef struct xl_btree_split
154 {
155  uint32 level; /* tree level of page being split */
156  OffsetNumber firstrightoff; /* first origpage item on rightpage */
157  OffsetNumber newitemoff; /* new item's offset */
158  uint16 postingoff; /* offset inside orig posting tuple */
160 
161 #define SizeOfBtreeSplit (offsetof(xl_btree_split, postingoff) + sizeof(uint16))
162 
163 /*
164  * When page is deduplicated, consecutive groups of tuples with equal keys are
165  * merged together into posting list tuples.
166  *
167  * The WAL record represents a deduplication pass for a leaf page. An array
168  * of BTDedupInterval structs follows.
169  */
170 typedef struct xl_btree_dedup
171 {
173 
174  /* DEDUPLICATION INTERVALS FOLLOW */
176 
177 #define SizeOfBtreeDedup (offsetof(xl_btree_dedup, nintervals) + sizeof(uint16))
178 
179 /*
180  * This is what we need to know about page reuse within btree. This record
181  * only exists to generate a conflict point for Hot Standby.
182  *
183  * Note that we must include a RelFileLocator in the record because we don't
184  * actually register the buffer with the record.
185  */
186 typedef struct xl_btree_reuse_page
187 {
191  bool isCatalogRel; /* to handle recovery conflict during logical
192  * decoding on standby */
194 
195 #define SizeOfBtreeReusePage (offsetof(xl_btree_reuse_page, isCatalogRel) + sizeof(bool))
196 
197 /*
198  * xl_btree_vacuum and xl_btree_delete records describe deletion of index
199  * tuples on a leaf page. The former variant is used by VACUUM, while the
200  * latter variant is used by the ad-hoc deletions that sometimes take place
201  * when btinsert() is called.
202  *
203  * The records are very similar. The only difference is that xl_btree_delete
204  * have snapshotConflictHorizon/isCatalogRel fields for recovery conflicts.
205  * (VACUUM operations can just rely on earlier conflicts generated during
206  * pruning of the table whose TIDs the to-be-deleted index tuples point to.
207  * There are also small differences between each REDO routine that we don't go
208  * into here.)
209  *
210  * xl_btree_vacuum and xl_btree_delete both represent deletion of any number
211  * of index tuples on a single leaf page using page offset numbers. Both also
212  * support "updates" of index tuples, which is how deletes of a subset of TIDs
213  * contained in an existing posting list tuple are implemented.
214  *
215  * Updated posting list tuples are represented using xl_btree_update metadata.
216  * The REDO routines each use the xl_btree_update entries (plus each
217  * corresponding original index tuple from the target leaf page) to generate
218  * the final updated tuple.
219  *
220  * Updates are only used when there will be some remaining TIDs left by the
221  * REDO routine. Otherwise the posting list tuple just gets deleted outright.
222  */
223 typedef struct xl_btree_vacuum
224 {
227 
228  /*----
229  * In payload of blk 0 :
230  * - DELETED TARGET OFFSET NUMBERS
231  * - UPDATED TARGET OFFSET NUMBERS
232  * - UPDATED TUPLES METADATA (xl_btree_update) ITEMS
233  *----
234  */
236 
237 #define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16))
238 
239 typedef struct xl_btree_delete
240 {
244  bool isCatalogRel; /* to handle recovery conflict during logical
245  * decoding on standby */
246 
247  /*----
248  * In payload of blk 0 :
249  * - DELETED TARGET OFFSET NUMBERS
250  * - UPDATED TARGET OFFSET NUMBERS
251  * - UPDATED TUPLES METADATA (xl_btree_update) ITEMS
252  *----
253  */
255 
256 #define SizeOfBtreeDelete (offsetof(xl_btree_delete, isCatalogRel) + sizeof(bool))
257 
258 /*
259  * The offsets that appear in xl_btree_update metadata are offsets into the
260  * original posting list from tuple, not page offset numbers. These are
261  * 0-based. The page offset number for the original posting list tuple comes
262  * from the main xl_btree_vacuum/xl_btree_delete record.
263  */
264 typedef struct xl_btree_update
265 {
267 
268  /* POSTING LIST uint16 OFFSETS TO A DELETED TID FOLLOW */
270 
271 #define SizeOfBtreeUpdate (offsetof(xl_btree_update, ndeletedtids) + sizeof(uint16))
272 
273 /*
274  * This is what we need to know about marking an empty subtree for deletion.
275  * The target identifies the tuple removed from the parent page (note that we
276  * remove this tuple's downlink and the *following* tuple's key). Note that
277  * the leaf page is empty, so we don't need to store its content --- it is
278  * just reinitialized during recovery using the rest of the fields.
279  *
280  * Backup Blk 0: leaf block
281  * Backup Blk 1: top parent
282  */
283 typedef struct xl_btree_mark_page_halfdead
284 {
285  OffsetNumber poffset; /* deleted tuple id in parent page */
286 
287  /* information needed to recreate the leaf page: */
288  BlockNumber leafblk; /* leaf block ultimately being deleted */
289  BlockNumber leftblk; /* leaf block's left sibling, if any */
290  BlockNumber rightblk; /* leaf block's right sibling */
291  BlockNumber topparent; /* topmost internal page in the subtree */
293 
294 #define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber))
295 
296 /*
297  * This is what we need to know about deletion of a btree page. Note that we
298  * only leave behind a small amount of bookkeeping information in deleted
299  * pages (deleted pages must be kept around as tombstones for a while). It is
300  * convenient for the REDO routine to regenerate its target page from scratch.
301  * This is why WAL record describes certain details that are actually directly
302  * available from the target page.
303  *
304  * Backup Blk 0: target block being deleted
305  * Backup Blk 1: target block's left sibling, if any
306  * Backup Blk 2: target block's right sibling
307  * Backup Blk 3: leaf block (if different from target)
308  * Backup Blk 4: metapage (if rightsib becomes new fast root)
309  */
310 typedef struct xl_btree_unlink_page
311 {
312  BlockNumber leftsib; /* target block's left sibling, if any */
313  BlockNumber rightsib; /* target block's right sibling */
314  uint32 level; /* target block's level */
315  FullTransactionId safexid; /* target block's BTPageSetDeleted() XID */
316 
317  /*
318  * Information needed to recreate a half-dead leaf page with correct
319  * topparent link. The fields are only used when deletion operation's
320  * target page is an internal page. REDO routine creates half-dead page
321  * from scratch to keep things simple (this is the same convenient
322  * approach used for the target page itself).
323  */
326  BlockNumber leaftopparent; /* next child down in the subtree */
327 
328  /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */
330 
331 #define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, leaftopparent) + sizeof(BlockNumber))
332 
333 /*
334  * New root log record. There are zero tuples if this is to establish an
335  * empty root, or two if it is the result of splitting an old root.
336  *
337  * Note that although this implies rewriting the metadata page, we don't need
338  * an xl_btree_metadata record --- the rootblk and level are sufficient.
339  *
340  * Backup Blk 0: new root page (2 tuples as payload, if splitting old root)
341  * Backup Blk 1: left child (if splitting an old root)
342  * Backup Blk 2: metapage
343  */
344 typedef struct xl_btree_newroot
345 {
346  BlockNumber rootblk; /* location of new root (redundant with blk 0) */
347  uint32 level; /* its tree level */
349 
350 #define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32))
351 
352 
353 /*
354  * prototypes for functions in nbtxlog.c
355  */
356 extern void btree_redo(XLogReaderState *record);
357 extern void btree_xlog_startup(void);
358 extern void btree_xlog_cleanup(void);
359 extern void btree_mask(char *pagedata, BlockNumber blkno);
360 
361 /*
362  * prototypes for functions in nbtdesc.c
363  */
364 extern void btree_desc(StringInfo buf, XLogReaderState *record);
365 extern const char *btree_identify(uint8 info);
366 
367 #endif /* NBTXLOG_H */
uint32 BlockNumber
Definition: block.h:31
unsigned short uint16
Definition: c.h:508
unsigned int uint32
Definition: c.h:509
unsigned char uint8
Definition: c.h:507
uint32 TransactionId
Definition: c.h:655
void btree_redo(XLogReaderState *record)
Definition: nbtxlog.c:1014
struct xl_btree_delete xl_btree_delete
struct xl_btree_reuse_page xl_btree_reuse_page
struct xl_btree_update xl_btree_update
const char * btree_identify(uint8 info)
Definition: nbtdesc.c:139
struct xl_btree_mark_page_halfdead xl_btree_mark_page_halfdead
struct xl_btree_dedup xl_btree_dedup
struct xl_btree_unlink_page xl_btree_unlink_page
void btree_xlog_cleanup(void)
Definition: nbtxlog.c:1081
struct xl_btree_split xl_btree_split
struct xl_btree_vacuum xl_btree_vacuum
struct xl_btree_metadata xl_btree_metadata
void btree_mask(char *pagedata, BlockNumber blkno)
Definition: nbtxlog.c:1091
struct xl_btree_newroot xl_btree_newroot
struct xl_btree_insert xl_btree_insert
void btree_desc(StringInfo buf, XLogReaderState *record)
Definition: nbtdesc.c:24
void btree_xlog_startup(void)
Definition: nbtxlog.c:1073
uint16 OffsetNumber
Definition: off.h:24
static char * buf
Definition: pg_test_fsync.c:73
uint16 nintervals
Definition: nbtxlog.h:169
TransactionId snapshotConflictHorizon
Definition: nbtxlog.h:238
bool isCatalogRel
Definition: nbtxlog.h:241
uint16 ndeleted
Definition: nbtxlog.h:239
uint16 nupdated
Definition: nbtxlog.h:240
OffsetNumber offnum
Definition: nbtxlog.h:78
uint32 level
Definition: nbtxlog.h:50
uint32 version
Definition: nbtxlog.h:48
bool allequalimage
Definition: nbtxlog.h:54
BlockNumber fastroot
Definition: nbtxlog.h:51
uint32 fastlevel
Definition: nbtxlog.h:52
BlockNumber root
Definition: nbtxlog.h:49
uint32 last_cleanup_num_delpages
Definition: nbtxlog.h:53
uint32 level
Definition: nbtxlog.h:344
BlockNumber rootblk
Definition: nbtxlog.h:343
FullTransactionId snapshotConflictHorizon
Definition: nbtxlog.h:187
RelFileLocator locator
Definition: nbtxlog.h:185
BlockNumber block
Definition: nbtxlog.h:186
uint16 postingoff
Definition: nbtxlog.h:155
OffsetNumber firstrightoff
Definition: nbtxlog.h:153
uint32 level
Definition: nbtxlog.h:152
OffsetNumber newitemoff
Definition: nbtxlog.h:154
uint16 ndeletedtids
Definition: nbtxlog.h:263
uint16 ndeleted
Definition: nbtxlog.h:222
uint16 nupdated
Definition: nbtxlog.h:223