PostgreSQL Source Code
git master
Loading...
Searching...
No Matches
nbtxlog.h
Go to the documentation of this file.
1
/*-------------------------------------------------------------------------
2
*
3
* nbtxlog.h
4
* header file for postgres btree xlog routines
5
*
6
* Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7
* Portions Copyright (c) 1994, Regents of the University of California
8
*
9
* src/include/access/nbtxlog.h
10
*
11
*-------------------------------------------------------------------------
12
*/
13
#ifndef NBTXLOG_H
14
#define NBTXLOG_H
15
16
#include "
access/transam.h
"
17
#include "
access/xlogreader.h
"
18
#include "
lib/stringinfo.h
"
19
#include "
storage/off.h
"
20
21
/*
22
* XLOG records for btree operations
23
*
24
* XLOG allows to store some information in high 4 bits of log
25
* record xl_info field
26
*/
27
#define XLOG_BTREE_INSERT_LEAF 0x00
/* add index tuple without split */
28
#define XLOG_BTREE_INSERT_UPPER 0x10
/* same, on a non-leaf page */
29
#define XLOG_BTREE_INSERT_META 0x20
/* same, plus update metapage */
30
#define XLOG_BTREE_SPLIT_L 0x30
/* add index tuple with split */
31
#define XLOG_BTREE_SPLIT_R 0x40
/* as above, new item on right */
32
#define XLOG_BTREE_INSERT_POST 0x50
/* add index tuple with posting split */
33
#define XLOG_BTREE_DEDUP 0x60
/* deduplicate tuples for a page */
34
#define XLOG_BTREE_DELETE 0x70
/* delete leaf index tuples for a page */
35
#define XLOG_BTREE_UNLINK_PAGE 0x80
/* delete a half-dead page */
36
#define XLOG_BTREE_UNLINK_PAGE_META 0x90
/* same, and update metapage */
37
#define XLOG_BTREE_NEWROOT 0xA0
/* new root page */
38
#define XLOG_BTREE_MARK_PAGE_HALFDEAD 0xB0
/* mark a leaf as half-dead */
39
#define XLOG_BTREE_VACUUM 0xC0
/* delete entries on a page during
40
* vacuum */
41
#define XLOG_BTREE_REUSE_PAGE 0xD0
/* old page is about to be reused from
42
* FSM */
43
#define XLOG_BTREE_META_CLEANUP 0xE0
/* update cleanup-related data in the
44
* metapage */
45
46
/*
47
* All that we need to regenerate the meta-data page
48
*/
49
typedef
struct
xl_btree_metadata
50
{
51
uint32
version
;
52
BlockNumber
root
;
53
uint32
level
;
54
BlockNumber
fastroot
;
55
uint32
fastlevel
;
56
uint32
last_cleanup_num_delpages
;
57
bool
allequalimage
;
58
}
xl_btree_metadata
;
59
60
/*
61
* This is what we need to know about simple (without split) insert.
62
*
63
* This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META, and
64
* INSERT_POST. Note that INSERT_META and INSERT_UPPER implies it's not a
65
* leaf page, while INSERT_POST and INSERT_LEAF imply that it must be a leaf
66
* page.
67
*
68
* Backup Blk 0: original page
69
* Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META
70
* Backup Blk 2: xl_btree_metadata, if INSERT_META
71
*
72
* Note: The new tuple is actually the "original" new item in the posting
73
* list split insert case (i.e. the INSERT_POST case). A split offset for
74
* the posting list is logged before the original new item. Recovery needs
75
* both, since it must do an in-place update of the existing posting list
76
* that was split as an extra step. Also, recovery generates a "final"
77
* newitem. See _bt_swap_posting() for details on posting list splits.
78
*/
79
typedef
struct
xl_btree_insert
80
{
81
OffsetNumber
offnum
;
82
83
/* POSTING SPLIT OFFSET FOLLOWS (INSERT_POST case) */
84
/* NEW TUPLE ALWAYS FOLLOWS AT THE END */
85
}
xl_btree_insert
;
86
87
#define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber))
88
89
/*
90
* On insert with split, we save all the items going into the right sibling
91
* so that we can restore it completely from the log record. This way takes
92
* less xlog space than the normal approach, because if we did it standardly,
93
* XLogInsert would almost always think the right page is new and store its
94
* whole page image. The left page, however, is handled in the normal
95
* incremental-update fashion.
96
*
97
* Note: XLOG_BTREE_SPLIT_L and XLOG_BTREE_SPLIT_R share this data record.
98
* There are two variants to indicate whether the inserted tuple went into the
99
* left or right split page (and thus, whether the new item is stored or not).
100
* We always log the left page high key because suffix truncation can generate
101
* a new leaf high key using user-defined code. This is also necessary on
102
* internal pages, since the firstright item that the left page's high key was
103
* based on will have been truncated to zero attributes in the right page (the
104
* separator key is unavailable from the right page).
105
*
106
* Backup Blk 0: original page / new left page
107
*
108
* The left page's data portion contains the new item, if it's the _L variant.
109
* _R variant split records generally do not have a newitem (_R variant leaf
110
* page split records that must deal with a posting list split will include an
111
* explicit newitem, though it is never used on the right page -- it is
112
* actually an orignewitem needed to update existing posting list). The new
113
* high key of the left/original page appears last of all (and must always be
114
* present).
115
*
116
* Page split records that need the REDO routine to deal with a posting list
117
* split directly will have an explicit newitem, which is actually an
118
* orignewitem (the newitem as it was before the posting list split, not
119
* after). A posting list split always has a newitem that comes immediately
120
* after the posting list being split (which would have overlapped with
121
* orignewitem prior to split). Usually REDO must deal with posting list
122
* splits with an _L variant page split record, and usually both the new
123
* posting list and the final newitem go on the left page (the existing
124
* posting list will be inserted instead of the old, and the final newitem
125
* will be inserted next to that). However, _R variant split records will
126
* include an orignewitem when the split point for the page happens to have a
127
* lastleft tuple that is also the posting list being split (leaving newitem
128
* as the page split's firstright tuple). The existence of this corner case
129
* does not change the basic fact about newitem/orignewitem for the REDO
130
* routine: it is always state used for the left page alone. (This is why the
131
* record's postingoff field isn't a reliable indicator of whether or not a
132
* posting list split occurred during the page split; a non-zero value merely
133
* indicates that the REDO routine must reconstruct a new posting list tuple
134
* that is needed for the left page.)
135
*
136
* This posting list split handling is equivalent to the xl_btree_insert REDO
137
* routine's INSERT_POST handling. While the details are more complicated
138
* here, the concept and goals are exactly the same. See _bt_swap_posting()
139
* for details on posting list splits.
140
*
141
* Backup Blk 1: new right page
142
*
143
* The right page's data portion contains the right page's tuples in the form
144
* used by _bt_restore_page. This includes the new item, if it's the _R
145
* variant. The right page's tuples also include the right page's high key
146
* with either variant (moved from the left/original page during the split),
147
* unless the split happened to be of the rightmost page on its level, where
148
* there is no high key for new right page.
149
*
150
* Backup Blk 2: next block (orig page's rightlink), if any
151
* Backup Blk 3: child's left sibling, if non-leaf split
152
*/
153
typedef
struct
xl_btree_split
154
{
155
uint32
level
;
/* tree level of page being split */
156
OffsetNumber
firstrightoff
;
/* first origpage item on rightpage */
157
OffsetNumber
newitemoff
;
/* new item's offset */
158
uint16
postingoff
;
/* offset inside orig posting tuple */
159
}
xl_btree_split
;
160
161
#define SizeOfBtreeSplit (offsetof(xl_btree_split, postingoff) + sizeof(uint16))
162
163
/*
164
* When page is deduplicated, consecutive groups of tuples with equal keys are
165
* merged together into posting list tuples.
166
*
167
* The WAL record represents a deduplication pass for a leaf page. An array
168
* of BTDedupInterval structs follows.
169
*/
170
typedef
struct
xl_btree_dedup
171
{
172
uint16
nintervals
;
173
174
/* DEDUPLICATION INTERVALS FOLLOW */
175
}
xl_btree_dedup
;
176
177
#define SizeOfBtreeDedup (offsetof(xl_btree_dedup, nintervals) + sizeof(uint16))
178
179
/*
180
* This is what we need to know about page reuse within btree. This record
181
* only exists to generate a conflict point for Hot Standby.
182
*
183
* Note that we must include a RelFileLocator in the record because we don't
184
* actually register the buffer with the record.
185
*/
186
typedef
struct
xl_btree_reuse_page
187
{
188
RelFileLocator
locator
;
189
BlockNumber
block
;
190
FullTransactionId
snapshotConflictHorizon
;
191
bool
isCatalogRel
;
/* to handle recovery conflict during logical
192
* decoding on standby */
193
}
xl_btree_reuse_page
;
194
195
#define SizeOfBtreeReusePage (offsetof(xl_btree_reuse_page, isCatalogRel) + sizeof(bool))
196
197
/*
198
* xl_btree_vacuum and xl_btree_delete records describe deletion of index
199
* tuples on a leaf page. The former variant is used by VACUUM, while the
200
* latter variant is used by the ad-hoc deletions that sometimes take place
201
* when btinsert() is called.
202
*
203
* The records are very similar. The only difference is that xl_btree_delete
204
* have snapshotConflictHorizon/isCatalogRel fields for recovery conflicts.
205
* (VACUUM operations can just rely on earlier conflicts generated during
206
* pruning of the table whose TIDs the to-be-deleted index tuples point to.
207
* There are also small differences between each REDO routine that we don't go
208
* into here.)
209
*
210
* xl_btree_vacuum and xl_btree_delete both represent deletion of any number
211
* of index tuples on a single leaf page using page offset numbers. Both also
212
* support "updates" of index tuples, which is how deletes of a subset of TIDs
213
* contained in an existing posting list tuple are implemented.
214
*
215
* Updated posting list tuples are represented using xl_btree_update metadata.
216
* The REDO routines each use the xl_btree_update entries (plus each
217
* corresponding original index tuple from the target leaf page) to generate
218
* the final updated tuple.
219
*
220
* Updates are only used when there will be some remaining TIDs left by the
221
* REDO routine. Otherwise the posting list tuple just gets deleted outright.
222
*/
223
typedef
struct
xl_btree_vacuum
224
{
225
uint16
ndeleted
;
226
uint16
nupdated
;
227
228
/*----
229
* In payload of blk 0 :
230
* - DELETED TARGET OFFSET NUMBERS
231
* - UPDATED TARGET OFFSET NUMBERS
232
* - UPDATED TUPLES METADATA (xl_btree_update) ITEMS
233
*----
234
*/
235
}
xl_btree_vacuum
;
236
237
#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16))
238
239
typedef
struct
xl_btree_delete
240
{
241
TransactionId
snapshotConflictHorizon
;
242
uint16
ndeleted
;
243
uint16
nupdated
;
244
bool
isCatalogRel
;
/* to handle recovery conflict during logical
245
* decoding on standby */
246
247
/*----
248
* In payload of blk 0 :
249
* - DELETED TARGET OFFSET NUMBERS
250
* - UPDATED TARGET OFFSET NUMBERS
251
* - UPDATED TUPLES METADATA (xl_btree_update) ITEMS
252
*----
253
*/
254
}
xl_btree_delete
;
255
256
#define SizeOfBtreeDelete (offsetof(xl_btree_delete, isCatalogRel) + sizeof(bool))
257
258
/*
259
* The offsets that appear in xl_btree_update metadata are offsets into the
260
* original posting list from tuple, not page offset numbers. These are
261
* 0-based. The page offset number for the original posting list tuple comes
262
* from the main xl_btree_vacuum/xl_btree_delete record.
263
*/
264
typedef
struct
xl_btree_update
265
{
266
uint16
ndeletedtids
;
267
268
/* POSTING LIST uint16 OFFSETS TO A DELETED TID FOLLOW */
269
}
xl_btree_update
;
270
271
#define SizeOfBtreeUpdate (offsetof(xl_btree_update, ndeletedtids) + sizeof(uint16))
272
273
/*
274
* This is what we need to know about marking an empty subtree for deletion.
275
* The target identifies the tuple removed from the parent page (note that we
276
* remove this tuple's downlink and the *following* tuple's key). Note that
277
* the leaf page is empty, so we don't need to store its content --- it is
278
* just reinitialized during recovery using the rest of the fields.
279
*
280
* Backup Blk 0: leaf block
281
* Backup Blk 1: top parent
282
*/
283
typedef
struct
xl_btree_mark_page_halfdead
284
{
285
OffsetNumber
poffset
;
/* deleted tuple id in parent page */
286
287
/* information needed to recreate the leaf page: */
288
BlockNumber
leafblk
;
/* leaf block ultimately being deleted */
289
BlockNumber
leftblk
;
/* leaf block's left sibling, if any */
290
BlockNumber
rightblk
;
/* leaf block's right sibling */
291
BlockNumber
topparent
;
/* topmost internal page in the subtree */
292
}
xl_btree_mark_page_halfdead
;
293
294
#define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber))
295
296
/*
297
* This is what we need to know about deletion of a btree page. Note that we
298
* only leave behind a small amount of bookkeeping information in deleted
299
* pages (deleted pages must be kept around as tombstones for a while). It is
300
* convenient for the REDO routine to regenerate its target page from scratch.
301
* This is why WAL record describes certain details that are actually directly
302
* available from the target page.
303
*
304
* Backup Blk 0: target block being deleted
305
* Backup Blk 1: target block's left sibling, if any
306
* Backup Blk 2: target block's right sibling
307
* Backup Blk 3: leaf block (if different from target)
308
* Backup Blk 4: metapage (if rightsib becomes new fast root)
309
*/
310
typedef
struct
xl_btree_unlink_page
311
{
312
BlockNumber
leftsib
;
/* target block's left sibling, if any */
313
BlockNumber
rightsib
;
/* target block's right sibling */
314
uint32
level
;
/* target block's level */
315
FullTransactionId
safexid
;
/* target block's BTPageSetDeleted() XID */
316
317
/*
318
* Information needed to recreate a half-dead leaf page with correct
319
* topparent link. The fields are only used when deletion operation's
320
* target page is an internal page. REDO routine creates half-dead page
321
* from scratch to keep things simple (this is the same convenient
322
* approach used for the target page itself).
323
*/
324
BlockNumber
leafleftsib
;
325
BlockNumber
leafrightsib
;
326
BlockNumber
leaftopparent
;
/* next child down in the subtree */
327
328
/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */
329
}
xl_btree_unlink_page
;
330
331
#define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, leaftopparent) + sizeof(BlockNumber))
332
333
/*
334
* New root log record. There are zero tuples if this is to establish an
335
* empty root, or two if it is the result of splitting an old root.
336
*
337
* Note that although this implies rewriting the metadata page, we don't need
338
* an xl_btree_metadata record --- the rootblk and level are sufficient.
339
*
340
* Backup Blk 0: new root page (2 tuples as payload, if splitting old root)
341
* Backup Blk 1: left child (if splitting an old root)
342
* Backup Blk 2: metapage
343
*/
344
typedef
struct
xl_btree_newroot
345
{
346
BlockNumber
rootblk
;
/* location of new root (redundant with blk 0) */
347
uint32
level
;
/* its tree level */
348
}
xl_btree_newroot
;
349
350
#define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32))
351
352
353
/*
354
* prototypes for functions in nbtxlog.c
355
*/
356
extern
void
btree_redo
(
XLogReaderState
*record);
357
extern
void
btree_xlog_startup
(
void
);
358
extern
void
btree_xlog_cleanup
(
void
);
359
extern
void
btree_mask
(
char
*
pagedata
,
BlockNumber
blkno);
360
361
/*
362
* prototypes for functions in nbtdesc.c
363
*/
364
extern
void
btree_desc
(
StringInfo
buf
,
XLogReaderState
*record);
365
extern
const
char
*
btree_identify
(
uint8
info);
366
367
#endif
/* NBTXLOG_H */
BlockNumber
uint32 BlockNumber
Definition
block.h:31
uint8
uint8_t uint8
Definition
c.h:544
uint16
uint16_t uint16
Definition
c.h:545
uint32
uint32_t uint32
Definition
c.h:546
TransactionId
uint32 TransactionId
Definition
c.h:666
btree_redo
void btree_redo(XLogReaderState *record)
Definition
nbtxlog.c:1004
btree_xlog_cleanup
void btree_xlog_cleanup(void)
Definition
nbtxlog.c:1071
btree_identify
const char * btree_identify(uint8 info)
Definition
nbtdesc.c:139
btree_mask
void btree_mask(char *pagedata, BlockNumber blkno)
Definition
nbtxlog.c:1081
btree_desc
void btree_desc(StringInfo buf, XLogReaderState *record)
Definition
nbtdesc.c:24
btree_xlog_startup
void btree_xlog_startup(void)
Definition
nbtxlog.c:1063
off.h
OffsetNumber
uint16 OffsetNumber
Definition
off.h:24
buf
static char buf[DEFAULT_XLOG_SEG_SIZE]
Definition
pg_test_fsync.c:71
fb
static int fb(int x)
Definition
preproc-init.c:92
stringinfo.h
FullTransactionId
Definition
transam.h:66
RelFileLocator
Definition
relfilelocator.h:59
StringInfoData
Definition
stringinfo.h:47
XLogReaderState
Definition
xlogreader.h:175
xl_btree_dedup
Definition
nbtxlog.h:168
xl_btree_dedup::nintervals
uint16 nintervals
Definition
nbtxlog.h:169
xl_btree_delete
Definition
nbtxlog.h:237
xl_btree_delete::snapshotConflictHorizon
TransactionId snapshotConflictHorizon
Definition
nbtxlog.h:238
xl_btree_delete::isCatalogRel
bool isCatalogRel
Definition
nbtxlog.h:241
xl_btree_delete::ndeleted
uint16 ndeleted
Definition
nbtxlog.h:239
xl_btree_delete::nupdated
uint16 nupdated
Definition
nbtxlog.h:240
xl_btree_insert
Definition
nbtxlog.h:77
xl_btree_insert::offnum
OffsetNumber offnum
Definition
nbtxlog.h:78
xl_btree_mark_page_halfdead
Definition
nbtxlog.h:281
xl_btree_mark_page_halfdead::poffset
OffsetNumber poffset
Definition
nbtxlog.h:282
xl_btree_mark_page_halfdead::leafblk
BlockNumber leafblk
Definition
nbtxlog.h:285
xl_btree_mark_page_halfdead::leftblk
BlockNumber leftblk
Definition
nbtxlog.h:286
xl_btree_mark_page_halfdead::topparent
BlockNumber topparent
Definition
nbtxlog.h:288
xl_btree_mark_page_halfdead::rightblk
BlockNumber rightblk
Definition
nbtxlog.h:287
xl_btree_metadata
Definition
nbtxlog.h:47
xl_btree_metadata::level
uint32 level
Definition
nbtxlog.h:50
xl_btree_metadata::version
uint32 version
Definition
nbtxlog.h:48
xl_btree_metadata::allequalimage
bool allequalimage
Definition
nbtxlog.h:54
xl_btree_metadata::fastroot
BlockNumber fastroot
Definition
nbtxlog.h:51
xl_btree_metadata::fastlevel
uint32 fastlevel
Definition
nbtxlog.h:52
xl_btree_metadata::root
BlockNumber root
Definition
nbtxlog.h:49
xl_btree_metadata::last_cleanup_num_delpages
uint32 last_cleanup_num_delpages
Definition
nbtxlog.h:53
xl_btree_newroot
Definition
nbtxlog.h:342
xl_btree_newroot::level
uint32 level
Definition
nbtxlog.h:344
xl_btree_newroot::rootblk
BlockNumber rootblk
Definition
nbtxlog.h:343
xl_btree_reuse_page
Definition
nbtxlog.h:184
xl_btree_reuse_page::snapshotConflictHorizon
FullTransactionId snapshotConflictHorizon
Definition
nbtxlog.h:187
xl_btree_reuse_page::locator
RelFileLocator locator
Definition
nbtxlog.h:185
xl_btree_reuse_page::block
BlockNumber block
Definition
nbtxlog.h:186
xl_btree_reuse_page::isCatalogRel
bool isCatalogRel
Definition
nbtxlog.h:188
xl_btree_split
Definition
nbtxlog.h:151
xl_btree_split::postingoff
uint16 postingoff
Definition
nbtxlog.h:155
xl_btree_split::firstrightoff
OffsetNumber firstrightoff
Definition
nbtxlog.h:153
xl_btree_split::level
uint32 level
Definition
nbtxlog.h:152
xl_btree_split::newitemoff
OffsetNumber newitemoff
Definition
nbtxlog.h:154
xl_btree_unlink_page
Definition
nbtxlog.h:308
xl_btree_unlink_page::leftsib
BlockNumber leftsib
Definition
nbtxlog.h:309
xl_btree_unlink_page::safexid
FullTransactionId safexid
Definition
nbtxlog.h:312
xl_btree_unlink_page::rightsib
BlockNumber rightsib
Definition
nbtxlog.h:310
xl_btree_unlink_page::leaftopparent
BlockNumber leaftopparent
Definition
nbtxlog.h:323
xl_btree_unlink_page::leafleftsib
BlockNumber leafleftsib
Definition
nbtxlog.h:321
xl_btree_unlink_page::level
uint32 level
Definition
nbtxlog.h:311
xl_btree_unlink_page::leafrightsib
BlockNumber leafrightsib
Definition
nbtxlog.h:322
xl_btree_update
Definition
nbtxlog.h:262
xl_btree_update::ndeletedtids
uint16 ndeletedtids
Definition
nbtxlog.h:263
xl_btree_vacuum
Definition
nbtxlog.h:221
xl_btree_vacuum::ndeleted
uint16 ndeleted
Definition
nbtxlog.h:222
xl_btree_vacuum::nupdated
uint16 nupdated
Definition
nbtxlog.h:223
transam.h
xlogreader.h
src
include
access
nbtxlog.h
Generated on Sun Feb 1 2026 06:13:15 for PostgreSQL Source Code by
1.9.8