PostgreSQL Source Code git master
blkreftable.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * blkreftable.c
4 * Block reference tables.
5 *
6 * A block reference table is used to keep track of which blocks have
7 * been modified by WAL records within a certain LSN range.
8 *
9 * For each relation fork, we keep track of all blocks that have appeared
10 * in block references in the WAL. We also keep track of the "limit block",
11 * which is the smallest relation length in blocks known to have occurred
12 * during that range of WAL records. This should be set to 0 if the relation
13 * fork is created or destroyed, and to the post-truncation length if
14 * truncated.
15 *
16 * Whenever we set the limit block, we also forget about any modified blocks
17 * beyond that point. Those blocks don't exist any more. Such blocks can
18 * later be marked as modified again; if that happens, it means the relation
19 * was re-extended.
20 *
21 * Portions Copyright (c) 2010-2025, PostgreSQL Global Development Group
22 *
23 * src/common/blkreftable.c
24 *
25 *-------------------------------------------------------------------------
26 */
27
28
29#ifndef FRONTEND
30#include "postgres.h"
31#else
32#include "postgres_fe.h"
33#endif
34
35#ifdef FRONTEND
36#include "common/logging.h"
37#endif
38
39#include "common/blkreftable.h"
40#include "common/hashfn.h"
41#include "port/pg_crc32c.h"
42
43/*
44 * A block reference table keeps track of the status of each relation
45 * fork individually.
46 */
47typedef struct BlockRefTableKey
48{
52
53/*
54 * We could need to store data either for a relation in which only a
55 * tiny fraction of the blocks have been modified or for a relation in
56 * which nearly every block has been modified, and we want a
57 * space-efficient representation in both cases. To accomplish this,
58 * we divide the relation into chunks of 2^16 blocks and choose between
59 * an array representation and a bitmap representation for each chunk.
60 *
61 * When the number of modified blocks in a given chunk is small, we
62 * essentially store an array of block numbers, but we need not store the
63 * entire block number: instead, we store each block number as a 2-byte
64 * offset from the start of the chunk.
65 *
66 * When the number of modified blocks in a given chunk is large, we switch
67 * to a bitmap representation.
68 *
69 * These same basic representational choices are used both when a block
70 * reference table is stored in memory and when it is serialized to disk.
71 *
72 * In the in-memory representation, we initially allocate each chunk with
73 * space for a number of entries given by INITIAL_ENTRIES_PER_CHUNK and
74 * increase that as necessary until we reach MAX_ENTRIES_PER_CHUNK.
75 * Any chunk whose allocated size reaches MAX_ENTRIES_PER_CHUNK is converted
76 * to a bitmap, and thus never needs to grow further.
77 */
78#define BLOCKS_PER_CHUNK (1 << 16)
79#define BLOCKS_PER_ENTRY (BITS_PER_BYTE * sizeof(uint16))
80#define MAX_ENTRIES_PER_CHUNK (BLOCKS_PER_CHUNK / BLOCKS_PER_ENTRY)
81#define INITIAL_ENTRIES_PER_CHUNK 16
83
84/*
85 * State for one relation fork.
86 *
87 * 'rlocator' and 'forknum' identify the relation fork to which this entry
88 * pertains.
89 *
90 * 'limit_block' is the shortest known length of the relation in blocks
91 * within the LSN range covered by a particular block reference table.
92 * It should be set to 0 if the relation fork is created or dropped. If the
93 * relation fork is truncated, it should be set to the number of blocks that
94 * remain after truncation.
95 *
96 * 'nchunks' is the allocated length of each of the three arrays that follow.
97 * We can only represent the status of block numbers less than nchunks *
98 * BLOCKS_PER_CHUNK.
99 *
100 * 'chunk_size' is an array storing the allocated size of each chunk.
101 *
102 * 'chunk_usage' is an array storing the number of elements used in each
103 * chunk. If that value is less than MAX_ENTRIES_PER_CHUNK, the corresponding
104 * chunk is used as an array; else the corresponding chunk is used as a bitmap.
105 * When used as a bitmap, the least significant bit of the first array element
106 * is the status of the lowest-numbered block covered by this chunk.
107 *
108 * 'chunk_data' is the array of chunks.
109 */
111{
114 char status;
119};
120
121/* Declare and define a hash table over type BlockRefTableEntry. */
122#define SH_PREFIX blockreftable
123#define SH_ELEMENT_TYPE BlockRefTableEntry
124#define SH_KEY_TYPE BlockRefTableKey
125#define SH_KEY key
126#define SH_HASH_KEY(tb, key) \
127 hash_bytes((const unsigned char *) &key, sizeof(BlockRefTableKey))
128#define SH_EQUAL(tb, a, b) (memcmp(&a, &b, sizeof(BlockRefTableKey)) == 0)
129#define SH_SCOPE static inline
130#ifdef FRONTEND
131#define SH_RAW_ALLOCATOR pg_malloc0
132#endif
133#define SH_DEFINE
134#define SH_DECLARE
135#include "lib/simplehash.h"
136
137/*
138 * A block reference table is basically just the hash table, but we don't
139 * want to expose that to outside callers.
140 *
141 * We keep track of the memory context in use explicitly too, so that it's
142 * easy to place all of our allocations in the same context.
143 */
145{
146 blockreftable_hash *hash;
147#ifndef FRONTEND
149#endif
150};
151
152/*
153 * On-disk serialization format for block reference table entries.
154 */
156{
162
163/*
164 * Buffer size, so that we avoid doing many small I/Os.
165 */
#define BUFSIZE 65536
167
168/*
169 * Ad-hoc buffer for file I/O.
170 */
172{
176 int used;
180
181/*
182 * State for keeping track of progress while incrementally reading a block
183 * table reference file from disk.
184 *
185 * total_chunks means the number of chunks for the RelFileLocator/ForkNumber
186 * combination that is currently being read, and consumed_chunks is the number
187 * of those that have been read. (We always read all the information for
188 * a single chunk at one time, so we don't need to be able to represent the
189 * state where a chunk has been partially read.)
190 *
191 * chunk_size is the array of chunk sizes. The length is given by total_chunks.
192 *
193 * chunk_data holds the current chunk.
194 *
195 * chunk_position helps us figure out how much progress we've made in returning
196 * the block numbers for the current chunk to the caller. If the chunk is a
197 * bitmap, it's the number of bits we've scanned; otherwise, it's the number
198 * of chunk entries we've scanned.
199 */
201{
211};
212
213/*
214 * State for keeping track of progress while incrementally writing a block
215 * reference table file to disk.
216 */
218{
220};
221
222/* Function prototypes. */
223static int BlockRefTableComparator(const void *a, const void *b);
224static void BlockRefTableFlush(BlockRefTableBuffer *buffer);
225static void BlockRefTableRead(BlockRefTableReader *reader, void *data,
226 int length);
227static void BlockRefTableWrite(BlockRefTableBuffer *buffer, void *data,
228 int length);
230
231/*
232 * Create an empty block reference table.
233 */
236{
237 BlockRefTable *brtab = palloc(sizeof(BlockRefTable));
238
239 /*
240 * Even completely empty database has a few hundred relation forks, so it
241 * seems best to size the hash on the assumption that we're going to have
242 * at least a few thousand entries.
243 */
244#ifdef FRONTEND
245 brtab->hash = blockreftable_create(4096, NULL);
246#else
247 brtab->mcxt = CurrentMemoryContext;
248 brtab->hash = blockreftable_create(brtab->mcxt, 4096, NULL);
249#endif
250
251 return brtab;
252}
253
254/*
255 * Set the "limit block" for a relation fork and forget any modified blocks
256 * with equal or higher block numbers.
257 *
258 * The "limit block" is the shortest known length of the relation within the
259 * range of WAL records covered by this block reference table.
260 */
261void
263 const RelFileLocator *rlocator,
264 ForkNumber forknum,
265 BlockNumber limit_block)
266{
267 BlockRefTableEntry *brtentry;
268 BlockRefTableKey key = {{0}}; /* make sure any padding is zero */
269 bool found;
270
271 memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
272 key.forknum = forknum;
273 brtentry = blockreftable_insert(brtab->hash, key, &found);
274
275 if (!found)
276 {
277 /*
278 * We have no existing data about this relation fork, so just record
279 * the limit_block value supplied by the caller, and make sure other
280 * parts of the entry are properly initialized.
281 */
282 brtentry->limit_block = limit_block;
283 brtentry->nchunks = 0;
284 brtentry->chunk_size = NULL;
285 brtentry->chunk_usage = NULL;
286 brtentry->chunk_data = NULL;
287 return;
288 }
289
290 BlockRefTableEntrySetLimitBlock(brtentry, limit_block);
291}
292
293/*
294 * Mark a block in a given relation fork as known to have been modified.
295 */
296void
298 const RelFileLocator *rlocator,
299 ForkNumber forknum,
300 BlockNumber blknum)
301{
302 BlockRefTableEntry *brtentry;
303 BlockRefTableKey key = {{0}}; /* make sure any padding is zero */
304 bool found;
305#ifndef FRONTEND
306 MemoryContext oldcontext = MemoryContextSwitchTo(brtab->mcxt);
307#endif
308
309 memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
310 key.forknum = forknum;
311 brtentry = blockreftable_insert(brtab->hash, key, &found);
312
313 if (!found)
314 {
315 /*
316 * We want to set the initial limit block value to something higher
317 * than any legal block number. InvalidBlockNumber fits the bill.
318 */
320 brtentry->nchunks = 0;
321 brtentry->chunk_size = NULL;
322 brtentry->chunk_usage = NULL;
323 brtentry->chunk_data = NULL;
324 }
325
326 BlockRefTableEntryMarkBlockModified(brtentry, forknum, blknum);
327
328#ifndef FRONTEND
329 MemoryContextSwitchTo(oldcontext);
330#endif
331}
332
333/*
334 * Get an entry from a block reference table.
335 *
336 * If the entry does not exist, this function returns NULL. Otherwise, it
337 * returns the entry and sets *limit_block to the value from the entry.
338 */
341 ForkNumber forknum, BlockNumber *limit_block)
342{
343 BlockRefTableKey key = {{0}}; /* make sure any padding is zero */
344 BlockRefTableEntry *entry;
345
346 Assert(limit_block != NULL);
347
348 memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
349 key.forknum = forknum;
350 entry = blockreftable_lookup(brtab->hash, key);
351
352 if (entry != NULL)
353 *limit_block = entry->limit_block;
354
355 return entry;
356}
357
358/*
359 * Get block numbers from a table entry.
360 *
361 * 'blocks' must point to enough space to hold at least 'nblocks' block
362 * numbers, and any block numbers we manage to get will be written there.
363 * The return value is the number of block numbers actually written.
364 *
365 * We do not return block numbers unless they are greater than or equal to
366 * start_blkno and strictly less than stop_blkno.
367 */
368int
370 BlockNumber start_blkno,
371 BlockNumber stop_blkno,
372 BlockNumber *blocks,
373 int nblocks)
374{
375 uint32 start_chunkno;
376 uint32 stop_chunkno;
377 uint32 chunkno;
378 int nresults = 0;
379
380 Assert(entry != NULL);
381
382 /*
383 * Figure out which chunks could potentially contain blocks of interest.
384 *
385 * We need to be careful about overflow here, because stop_blkno could be
386 * InvalidBlockNumber or something very close to it.
387 */
388 start_chunkno = start_blkno / BLOCKS_PER_CHUNK;
389 stop_chunkno = stop_blkno / BLOCKS_PER_CHUNK;
390 if ((stop_blkno % BLOCKS_PER_CHUNK) != 0)
391 ++stop_chunkno;
392 if (stop_chunkno > entry->nchunks)
393 stop_chunkno = entry->nchunks;
394
395 /*
396 * Loop over chunks.
397 */
398 for (chunkno = start_chunkno; chunkno < stop_chunkno; ++chunkno)
399 {
400 uint16 chunk_usage = entry->chunk_usage[chunkno];
401 BlockRefTableChunk chunk_data = entry->chunk_data[chunkno];
402 unsigned start_offset = 0;
403 unsigned stop_offset = BLOCKS_PER_CHUNK;
404
405 /*
406 * If the start and/or stop block number falls within this chunk, the
407 * whole chunk may not be of interest. Figure out which portion we
408 * care about, if it's not the whole thing.
409 */
410 if (chunkno == start_chunkno)
411 start_offset = start_blkno % BLOCKS_PER_CHUNK;
412 if (chunkno == stop_chunkno - 1)
413 {
414 Assert(stop_blkno > chunkno * BLOCKS_PER_CHUNK);
415 stop_offset = stop_blkno - (chunkno * BLOCKS_PER_CHUNK);
416 Assert(stop_offset <= BLOCKS_PER_CHUNK);
417 }
418
419 /*
420 * Handling differs depending on whether this is an array of offsets
421 * or a bitmap.
422 */
423 if (chunk_usage == MAX_ENTRIES_PER_CHUNK)
424 {
425 unsigned i;
426
427 /* It's a bitmap, so test every relevant bit. */
428 for (i = start_offset; i < stop_offset; ++i)
429 {
430 uint16 w = chunk_data[i / BLOCKS_PER_ENTRY];
431
432 if ((w & (1 << (i % BLOCKS_PER_ENTRY))) != 0)
433 {
434 BlockNumber blkno = chunkno * BLOCKS_PER_CHUNK + i;
435
436 blocks[nresults++] = blkno;
437
438 /* Early exit if we run out of output space. */
439 if (nresults == nblocks)
440 return nresults;
441 }
442 }
443 }
444 else
445 {
446 unsigned i;
447
448 /* It's an array of offsets, so check each one. */
449 for (i = 0; i < chunk_usage; ++i)
450 {
451 uint16 offset = chunk_data[i];
452
453 if (offset >= start_offset && offset < stop_offset)
454 {
455 BlockNumber blkno = chunkno * BLOCKS_PER_CHUNK + offset;
456
457 blocks[nresults++] = blkno;
458
459 /* Early exit if we run out of output space. */
460 if (nresults == nblocks)
461 return nresults;
462 }
463 }
464 }
465 }
466
467 return nresults;
468}
469
470/*
471 * Serialize a block reference table to a file.
472 */
473void
475 io_callback_fn write_callback,
476 void *write_callback_arg)
477{
478 BlockRefTableSerializedEntry *sdata = NULL;
479 BlockRefTableBuffer buffer;
481
482 /* Prepare buffer. */
483 memset(&buffer, 0, sizeof(BlockRefTableBuffer));
484 buffer.io_callback = write_callback;
485 buffer.io_callback_arg = write_callback_arg;
486 INIT_CRC32C(buffer.crc);
487
488 /* Write magic number. */
489 BlockRefTableWrite(&buffer, &magic, sizeof(uint32));
490
491 /* Write the entries, assuming there are some. */
492 if (brtab->hash->members > 0)
493 {
494 unsigned i = 0;
495 blockreftable_iterator it;
496 BlockRefTableEntry *brtentry;
497
498 /* Extract entries into serializable format and sort them. */
499 sdata =
500 palloc(brtab->hash->members * sizeof(BlockRefTableSerializedEntry));
501 blockreftable_start_iterate(brtab->hash, &it);
502 while ((brtentry = blockreftable_iterate(brtab->hash, &it)) != NULL)
503 {
504 BlockRefTableSerializedEntry *sentry = &sdata[i++];
505
506 sentry->rlocator = brtentry->key.rlocator;
507 sentry->forknum = brtentry->key.forknum;
508 sentry->limit_block = brtentry->limit_block;
509 sentry->nchunks = brtentry->nchunks;
510
511 /* trim trailing zero entries */
512 while (sentry->nchunks > 0 &&
513 brtentry->chunk_usage[sentry->nchunks - 1] == 0)
514 sentry->nchunks--;
515 }
516 Assert(i == brtab->hash->members);
517 qsort(sdata, i, sizeof(BlockRefTableSerializedEntry),
519
520 /* Loop over entries in sorted order and serialize each one. */
521 for (i = 0; i < brtab->hash->members; ++i)
522 {
523 BlockRefTableSerializedEntry *sentry = &sdata[i];
524 BlockRefTableKey key = {{0}}; /* make sure any padding is zero */
525 unsigned j;
526
527 /* Write the serialized entry itself. */
528 BlockRefTableWrite(&buffer, sentry,
530
531 /* Look up the original entry so we can access the chunks. */
532 memcpy(&key.rlocator, &sentry->rlocator, sizeof(RelFileLocator));
533 key.forknum = sentry->forknum;
534 brtentry = blockreftable_lookup(brtab->hash, key);
535 Assert(brtentry != NULL);
536
537 /* Write the untruncated portion of the chunk length array. */
538 if (sentry->nchunks != 0)
539 BlockRefTableWrite(&buffer, brtentry->chunk_usage,
540 sentry->nchunks * sizeof(uint16));
541
542 /* Write the contents of each chunk. */
543 for (j = 0; j < brtentry->nchunks; ++j)
544 {
545 if (brtentry->chunk_usage[j] == 0)
546 continue;
547 BlockRefTableWrite(&buffer, brtentry->chunk_data[j],
548 brtentry->chunk_usage[j] * sizeof(uint16));
549 }
550 }
551 }
552
553 /* Write out appropriate terminator and CRC and flush buffer. */
555}
556
557/*
558 * Prepare to incrementally read a block reference table file.
559 *
560 * 'read_callback' is a function that can be called to read data from the
561 * underlying file (or other data source) into our internal buffer.
562 *
563 * 'read_callback_arg' is an opaque argument to be passed to read_callback.
564 *
565 * 'error_filename' is the filename that should be included in error messages
566 * if the file is found to be malformed. The value is not copied, so the
567 * caller should ensure that it remains valid until done with this
568 * BlockRefTableReader.
569 *
570 * 'error_callback' is a function to be called if the file is found to be
571 * malformed. This is not used for I/O errors, which must be handled internally
572 * by read_callback.
573 *
574 * 'error_callback_arg' is an opaque argument to be passed to error_callback.
575 */
578 void *read_callback_arg,
579 char *error_filename,
580 report_error_fn error_callback,
581 void *error_callback_arg)
582{
583 BlockRefTableReader *reader;
584 uint32 magic;
585
586 /* Initialize data structure. */
587 reader = palloc0(sizeof(BlockRefTableReader));
588 reader->buffer.io_callback = read_callback;
589 reader->buffer.io_callback_arg = read_callback_arg;
590 reader->error_filename = error_filename;
591 reader->error_callback = error_callback;
592 reader->error_callback_arg = error_callback_arg;
593 INIT_CRC32C(reader->buffer.crc);
594
595 /* Verify magic number. */
596 BlockRefTableRead(reader, &magic, sizeof(uint32));
597 if (magic != BLOCKREFTABLE_MAGIC)
598 error_callback(error_callback_arg,
599 "file \"%s\" has wrong magic number: expected %u, found %u",
600 error_filename,
601 BLOCKREFTABLE_MAGIC, magic);
602
603 return reader;
604}
605
606/*
607 * Read next relation fork covered by this block reference table file.
608 *
609 * After calling this function, you must call BlockRefTableReaderGetBlocks
610 * until it returns 0 before calling it again.
611 */
612bool
614 RelFileLocator *rlocator,
615 ForkNumber *forknum,
616 BlockNumber *limit_block)
617{
619 BlockRefTableSerializedEntry zentry = {{0}};
620
621 /*
622 * Sanity check: caller must read all blocks from all chunks before moving
623 * on to the next relation.
624 */
625 Assert(reader->total_chunks == reader->consumed_chunks);
626
627 /* Read serialized entry. */
628 BlockRefTableRead(reader, &sentry,
630
631 /*
632 * If we just read the sentinel entry indicating that we've reached the
633 * end, read and check the CRC.
634 */
635 if (memcmp(&sentry, &zentry, sizeof(BlockRefTableSerializedEntry)) == 0)
636 {
637 pg_crc32c expected_crc;
638 pg_crc32c actual_crc;
639
640 /*
641 * We want to know the CRC of the file excluding the 4-byte CRC
642 * itself, so copy the current value of the CRC accumulator before
643 * reading those bytes, and use the copy to finalize the calculation.
644 */
645 expected_crc = reader->buffer.crc;
646 FIN_CRC32C(expected_crc);
647
648 /* Now we can read the actual value. */
649 BlockRefTableRead(reader, &actual_crc, sizeof(pg_crc32c));
650
651 /* Throw an error if there is a mismatch. */
652 if (!EQ_CRC32C(expected_crc, actual_crc))
653 reader->error_callback(reader->error_callback_arg,
654 "file \"%s\" has wrong checksum: expected %08X, found %08X",
655 reader->error_filename, expected_crc, actual_crc);
656
657 return false;
658 }
659
660 /* Read chunk size array. */
661 if (reader->chunk_size != NULL)
662 pfree(reader->chunk_size);
663 reader->chunk_size = palloc(sentry.nchunks * sizeof(uint16));
664 BlockRefTableRead(reader, reader->chunk_size,
665 sentry.nchunks * sizeof(uint16));
666
667 /* Set up for chunk scan. */
668 reader->total_chunks = sentry.nchunks;
669 reader->consumed_chunks = 0;
670
671 /* Return data to caller. */
672 memcpy(rlocator, &sentry.rlocator, sizeof(RelFileLocator));
673 *forknum = sentry.forknum;
674 *limit_block = sentry.limit_block;
675 return true;
676}
677
678/*
679 * Get modified blocks associated with the relation fork returned by
680 * the most recent call to BlockRefTableReaderNextRelation.
681 *
682 * On return, block numbers will be written into the 'blocks' array, whose
683 * length should be passed via 'nblocks'. The return value is the number of
684 * entries actually written into the 'blocks' array, which may be less than
685 * 'nblocks' if we run out of modified blocks in the relation fork before
686 * we run out of room in the array.
687 */
688unsigned
690 BlockNumber *blocks,
691 int nblocks)
692{
693 unsigned blocks_found = 0;
694
695 /* Must provide space for at least one block number to be returned. */
696 Assert(nblocks > 0);
697
698 /* Loop collecting blocks to return to caller. */
699 for (;;)
700 {
701 uint16 next_chunk_size;
702
703 /*
704 * If we've read at least one chunk, maybe it contains some block
705 * numbers that could satisfy caller's request.
706 */
707 if (reader->consumed_chunks > 0)
708 {
709 uint32 chunkno = reader->consumed_chunks - 1;
710 uint16 chunk_size = reader->chunk_size[chunkno];
711
712 if (chunk_size == MAX_ENTRIES_PER_CHUNK)
713 {
714 /* Bitmap format, so search for bits that are set. */
715 while (reader->chunk_position < BLOCKS_PER_CHUNK &&
716 blocks_found < nblocks)
717 {
718 uint16 chunkoffset = reader->chunk_position;
719 uint16 w;
720
721 w = reader->chunk_data[chunkoffset / BLOCKS_PER_ENTRY];
722 if ((w & (1u << (chunkoffset % BLOCKS_PER_ENTRY))) != 0)
723 blocks[blocks_found++] =
724 chunkno * BLOCKS_PER_CHUNK + chunkoffset;
725 ++reader->chunk_position;
726 }
727 }
728 else
729 {
730 /* Not in bitmap format, so each entry is a 2-byte offset. */
731 while (reader->chunk_position < chunk_size &&
732 blocks_found < nblocks)
733 {
734 blocks[blocks_found++] = chunkno * BLOCKS_PER_CHUNK
735 + reader->chunk_data[reader->chunk_position];
736 ++reader->chunk_position;
737 }
738 }
739 }
740
741 /* We found enough blocks, so we're done. */
742 if (blocks_found >= nblocks)
743 break;
744
745 /*
746 * We didn't find enough blocks, so we must need the next chunk. If
747 * there are none left, though, then we're done anyway.
748 */
749 if (reader->consumed_chunks == reader->total_chunks)
750 break;
751
752 /*
753 * Read data for next chunk and reset scan position to beginning of
754 * chunk. Note that the next chunk might be empty, in which case we
755 * consume the chunk without actually consuming any bytes from the
756 * underlying file.
757 */
758 next_chunk_size = reader->chunk_size[reader->consumed_chunks];
759 if (next_chunk_size > 0)
760 BlockRefTableRead(reader, reader->chunk_data,
761 next_chunk_size * sizeof(uint16));
762 ++reader->consumed_chunks;
763 reader->chunk_position = 0;
764 }
765
766 return blocks_found;
767}
768
769/*
770 * Release memory used while reading a block reference table from a file.
771 */
772void
774{
775 if (reader->chunk_size != NULL)
776 {
777 pfree(reader->chunk_size);
778 reader->chunk_size = NULL;
779 }
780 pfree(reader);
781}
782
783/*
784 * Prepare to write a block reference table file incrementally.
785 *
786 * Caller must be able to supply BlockRefTableEntry objects sorted in the
787 * appropriate order.
788 */
791 void *write_callback_arg)
792{
793 BlockRefTableWriter *writer;
795
796 /* Prepare buffer and CRC check and save callbacks. */
797 writer = palloc0(sizeof(BlockRefTableWriter));
798 writer->buffer.io_callback = write_callback;
799 writer->buffer.io_callback_arg = write_callback_arg;
800 INIT_CRC32C(writer->buffer.crc);
801
802 /* Write magic number. */
803 BlockRefTableWrite(&writer->buffer, &magic, sizeof(uint32));
804
805 return writer;
806}
807
808/*
809 * Append one entry to a block reference table file.
810 *
811 * Note that entries must be written in the proper order, that is, sorted by
812 * tablespace, then database, then relfilenumber, then fork number. Caller
813 * is responsible for supplying data in the correct order. If that seems hard,
814 * use an in-memory BlockRefTable instead.
815 */
816void
818{
820 unsigned j;
821
822 /* Convert to serialized entry format. */
823 sentry.rlocator = entry->key.rlocator;
824 sentry.forknum = entry->key.forknum;
825 sentry.limit_block = entry->limit_block;
826 sentry.nchunks = entry->nchunks;
827
828 /* Trim trailing zero entries. */
829 while (sentry.nchunks > 0 && entry->chunk_usage[sentry.nchunks - 1] == 0)
830 sentry.nchunks--;
831
832 /* Write the serialized entry itself. */
833 BlockRefTableWrite(&writer->buffer, &sentry,
835
836 /* Write the untruncated portion of the chunk length array. */
837 if (sentry.nchunks != 0)
838 BlockRefTableWrite(&writer->buffer, entry->chunk_usage,
839 sentry.nchunks * sizeof(uint16));
840
841 /* Write the contents of each chunk. */
842 for (j = 0; j < entry->nchunks; ++j)
843 {
844 if (entry->chunk_usage[j] == 0)
845 continue;
846 BlockRefTableWrite(&writer->buffer, entry->chunk_data[j],
847 entry->chunk_usage[j] * sizeof(uint16));
848 }
849}
850
851/*
852 * Finalize an incremental write of a block reference table file.
853 */
854void
856{
858 pfree(writer);
859}
860
861/*
862 * Allocate a standalone BlockRefTableEntry.
863 *
864 * When we're manipulating a full in-memory BlockRefTable, the entries are
865 * part of the hash table and are allocated by simplehash. This routine is
866 * used by callers that want to write out a BlockRefTable to a file without
867 * needing to store the whole thing in memory at once.
868 *
869 * Entries allocated by this function can be manipulated using the functions
870 * BlockRefTableEntrySetLimitBlock and BlockRefTableEntryMarkBlockModified
871 * and then written using BlockRefTableWriteEntry and freed using
872 * BlockRefTableFreeEntry.
873 */
876{
878
879 memcpy(&entry->key.rlocator, &rlocator, sizeof(RelFileLocator));
880 entry->key.forknum = forknum;
882
883 return entry;
884}
885
886/*
887 * Update a BlockRefTableEntry with a new value for the "limit block" and
888 * forget any equal-or-higher-numbered modified blocks.
889 *
890 * The "limit block" is the shortest known length of the relation within the
891 * range of WAL records covered by this block reference table.
892 */
893void
895 BlockNumber limit_block)
896{
897 unsigned chunkno;
898 unsigned limit_chunkno;
899 unsigned limit_chunkoffset;
900 BlockRefTableChunk limit_chunk;
901
902 /* If we already have an equal or lower limit block, do nothing. */
903 if (limit_block >= entry->limit_block)
904 return;
905
906 /* Record the new limit block value. */
907 entry->limit_block = limit_block;
908
909 /*
910 * Figure out which chunk would store the state of the new limit block,
911 * and which offset within that chunk.
912 */
913 limit_chunkno = limit_block / BLOCKS_PER_CHUNK;
914 limit_chunkoffset = limit_block % BLOCKS_PER_CHUNK;
915
916 /*
917 * If the number of chunks is not large enough for any blocks with equal
918 * or higher block numbers to exist, then there is nothing further to do.
919 */
920 if (limit_chunkno >= entry->nchunks)
921 return;
922
923 /* Discard entire contents of any higher-numbered chunks. */
924 for (chunkno = limit_chunkno + 1; chunkno < entry->nchunks; ++chunkno)
925 entry->chunk_usage[chunkno] = 0;
926
927 /*
928 * Next, we need to discard any offsets within the chunk that would
929 * contain the limit_block. We must handle this differently depending on
930 * whether the chunk that would contain limit_block is a bitmap or an
931 * array of offsets.
932 */
933 limit_chunk = entry->chunk_data[limit_chunkno];
934 if (entry->chunk_usage[limit_chunkno] == MAX_ENTRIES_PER_CHUNK)
935 {
936 unsigned chunkoffset;
937
938 /* It's a bitmap. Unset bits. */
939 for (chunkoffset = limit_chunkoffset; chunkoffset < BLOCKS_PER_CHUNK;
940 ++chunkoffset)
941 limit_chunk[chunkoffset / BLOCKS_PER_ENTRY] &=
942 ~(1 << (chunkoffset % BLOCKS_PER_ENTRY));
943 }
944 else
945 {
946 unsigned i,
947 j = 0;
948
949 /* It's an offset array. Filter out large offsets. */
950 for (i = 0; i < entry->chunk_usage[limit_chunkno]; ++i)
951 {
952 Assert(j <= i);
953 if (limit_chunk[i] < limit_chunkoffset)
954 limit_chunk[j++] = limit_chunk[i];
955 }
956 Assert(j <= entry->chunk_usage[limit_chunkno]);
957 entry->chunk_usage[limit_chunkno] = j;
958 }
959}
960
961/*
962 * Mark a block in a given BlockRefTableEntry as known to have been modified.
963 */
964void
966 ForkNumber forknum,
967 BlockNumber blknum)
968{
969 unsigned chunkno;
970 unsigned chunkoffset;
971 unsigned i;
972
973 /*
974 * Which chunk should store the state of this block? And what is the
975 * offset of this block relative to the start of that chunk?
976 */
977 chunkno = blknum / BLOCKS_PER_CHUNK;
978 chunkoffset = blknum % BLOCKS_PER_CHUNK;
979
980 /*
981 * If 'nchunks' isn't big enough for us to be able to represent the state
982 * of this block, we need to enlarge our arrays.
983 */
984 if (chunkno >= entry->nchunks)
985 {
986 unsigned max_chunks;
987 unsigned extra_chunks;
988
989 /*
990 * New array size is a power of 2, at least 16, big enough so that
991 * chunkno will be a valid array index.
992 */
993 max_chunks = Max(16, entry->nchunks);
994 while (max_chunks < chunkno + 1)
995 max_chunks *= 2;
996 extra_chunks = max_chunks - entry->nchunks;
997
998 if (entry->nchunks == 0)
999 {
1000 entry->chunk_size = palloc0(sizeof(uint16) * max_chunks);
1001 entry->chunk_usage = palloc0(sizeof(uint16) * max_chunks);
1002 entry->chunk_data =
1003 palloc0(sizeof(BlockRefTableChunk) * max_chunks);
1004 }
1005 else
1006 {
1007 entry->chunk_size = repalloc(entry->chunk_size,
1008 sizeof(uint16) * max_chunks);
1009 memset(&entry->chunk_size[entry->nchunks], 0,
1010 extra_chunks * sizeof(uint16));
1011 entry->chunk_usage = repalloc(entry->chunk_usage,
1012 sizeof(uint16) * max_chunks);
1013 memset(&entry->chunk_usage[entry->nchunks], 0,
1014 extra_chunks * sizeof(uint16));
1015 entry->chunk_data = repalloc(entry->chunk_data,
1016 sizeof(BlockRefTableChunk) * max_chunks);
1017 memset(&entry->chunk_data[entry->nchunks], 0,
1018 extra_chunks * sizeof(BlockRefTableChunk));
1019 }
1020 entry->nchunks = max_chunks;
1021 }
1022
1023 /*
1024 * If the chunk that covers this block number doesn't exist yet, create it
1025 * as an array and add the appropriate offset to it. We make it pretty
1026 * small initially, because there might only be 1 or a few block
1027 * references in this chunk and we don't want to use up too much memory.
1028 */
1029 if (entry->chunk_size[chunkno] == 0)
1030 {
1031 entry->chunk_data[chunkno] =
1033 entry->chunk_size[chunkno] = INITIAL_ENTRIES_PER_CHUNK;
1034 entry->chunk_data[chunkno][0] = chunkoffset;
1035 entry->chunk_usage[chunkno] = 1;
1036 return;
1037 }
1038
1039 /*
1040 * If the number of entries in this chunk is already maximum, it must be a
1041 * bitmap. Just set the appropriate bit.
1042 */
1043 if (entry->chunk_usage[chunkno] == MAX_ENTRIES_PER_CHUNK)
1044 {
1045 BlockRefTableChunk chunk = entry->chunk_data[chunkno];
1046
1047 chunk[chunkoffset / BLOCKS_PER_ENTRY] |=
1048 1 << (chunkoffset % BLOCKS_PER_ENTRY);
1049 return;
1050 }
1051
1052 /*
1053 * There is an existing chunk and it's in array format. Let's find out
1054 * whether it already has an entry for this block. If so, we do not need
1055 * to do anything.
1056 */
1057 for (i = 0; i < entry->chunk_usage[chunkno]; ++i)
1058 {
1059 if (entry->chunk_data[chunkno][i] == chunkoffset)
1060 return;
1061 }
1062
1063 /*
1064 * If the number of entries currently used is one less than the maximum,
1065 * it's time to convert to bitmap format.
1066 */
1067 if (entry->chunk_usage[chunkno] == MAX_ENTRIES_PER_CHUNK - 1)
1068 {
1069 BlockRefTableChunk newchunk;
1070 unsigned j;
1071
1072 /* Allocate a new chunk. */
1073 newchunk = palloc0(MAX_ENTRIES_PER_CHUNK * sizeof(uint16));
1074
1075 /* Set the bit for each existing entry. */
1076 for (j = 0; j < entry->chunk_usage[chunkno]; ++j)
1077 {
1078 unsigned coff = entry->chunk_data[chunkno][j];
1079
1080 newchunk[coff / BLOCKS_PER_ENTRY] |=
1081 1 << (coff % BLOCKS_PER_ENTRY);
1082 }
1083
1084 /* Set the bit for the new entry. */
1085 newchunk[chunkoffset / BLOCKS_PER_ENTRY] |=
1086 1 << (chunkoffset % BLOCKS_PER_ENTRY);
1087
1088 /* Swap the new chunk into place and update metadata. */
1089 pfree(entry->chunk_data[chunkno]);
1090 entry->chunk_data[chunkno] = newchunk;
1091 entry->chunk_size[chunkno] = MAX_ENTRIES_PER_CHUNK;
1092 entry->chunk_usage[chunkno] = MAX_ENTRIES_PER_CHUNK;
1093 return;
1094 }
1095
1096 /*
1097 * OK, we currently have an array, and we don't need to convert to a
1098 * bitmap, but we do need to add a new element. If there's not enough
1099 * room, we'll have to expand the array.
1100 */
1101 if (entry->chunk_usage[chunkno] == entry->chunk_size[chunkno])
1102 {
1103 unsigned newsize = entry->chunk_size[chunkno] * 2;
1104
1105 Assert(newsize <= MAX_ENTRIES_PER_CHUNK);
1106 entry->chunk_data[chunkno] = repalloc(entry->chunk_data[chunkno],
1107 newsize * sizeof(uint16));
1108 entry->chunk_size[chunkno] = newsize;
1109 }
1110
1111 /* Now we can add the new entry. */
1112 entry->chunk_data[chunkno][entry->chunk_usage[chunkno]] =
1113 chunkoffset;
1114 entry->chunk_usage[chunkno]++;
1115}
1116
1117/*
1118 * Release memory for a BlockRefTableEntry that was created by
1119 * CreateBlockRefTableEntry.
1120 */
1121void
1123{
1124 if (entry->chunk_size != NULL)
1125 {
1126 pfree(entry->chunk_size);
1127 entry->chunk_size = NULL;
1128 }
1129
1130 if (entry->chunk_usage != NULL)
1131 {
1132 pfree(entry->chunk_usage);
1133 entry->chunk_usage = NULL;
1134 }
1135
1136 if (entry->chunk_data != NULL)
1137 {
1138 pfree(entry->chunk_data);
1139 entry->chunk_data = NULL;
1140 }
1141
1142 pfree(entry);
1143}
1144
1145/*
1146 * Comparator for BlockRefTableSerializedEntry objects.
1147 *
1148 * We make the tablespace OID the first column of the sort key to match
1149 * the on-disk tree structure.
1150 */
1151static int
1152BlockRefTableComparator(const void *a, const void *b)
1153{
1155 const BlockRefTableSerializedEntry *sb = b;
1156
1157 if (sa->rlocator.spcOid > sb->rlocator.spcOid)
1158 return 1;
1159 if (sa->rlocator.spcOid < sb->rlocator.spcOid)
1160 return -1;
1161
1162 if (sa->rlocator.dbOid > sb->rlocator.dbOid)
1163 return 1;
1164 if (sa->rlocator.dbOid < sb->rlocator.dbOid)
1165 return -1;
1166
1167 if (sa->rlocator.relNumber > sb->rlocator.relNumber)
1168 return 1;
1169 if (sa->rlocator.relNumber < sb->rlocator.relNumber)
1170 return -1;
1171
1172 if (sa->forknum > sb->forknum)
1173 return 1;
1174 if (sa->forknum < sb->forknum)
1175 return -1;
1176
1177 return 0;
1178}
1179
1180/*
1181 * Flush any buffered data out of a BlockRefTableBuffer.
1182 */
1183static void
1185{
1186 buffer->io_callback(buffer->io_callback_arg, buffer->data, buffer->used);
1187 buffer->used = 0;
1188}
1189
1190/*
1191 * Read data from a BlockRefTableBuffer, and update the running CRC
1192 * calculation for the returned data (but not any data that we may have
1193 * buffered but not yet actually returned).
1194 */
1195static void
1196BlockRefTableRead(BlockRefTableReader *reader, void *data, int length)
1197{
1198 BlockRefTableBuffer *buffer = &reader->buffer;
1199
1200 /* Loop until read is fully satisfied. */
1201 while (length > 0)
1202 {
1203 if (buffer->cursor < buffer->used)
1204 {
1205 /*
1206 * If any buffered data is available, use that to satisfy as much
1207 * of the request as possible.
1208 */
1209 int bytes_to_copy = Min(length, buffer->used - buffer->cursor);
1210
1211 memcpy(data, &buffer->data[buffer->cursor], bytes_to_copy);
1212 COMP_CRC32C(buffer->crc, &buffer->data[buffer->cursor],
1213 bytes_to_copy);
1214 buffer->cursor += bytes_to_copy;
1215 data = ((char *) data) + bytes_to_copy;
1216 length -= bytes_to_copy;
1217 }
1218 else if (length >= BUFSIZE)
1219 {
1220 /*
1221 * If the request length is long, read directly into caller's
1222 * buffer.
1223 */
1224 int bytes_read;
1225
1226 bytes_read = buffer->io_callback(buffer->io_callback_arg,
1227 data, length);
1228 COMP_CRC32C(buffer->crc, data, bytes_read);
1229 data = ((char *) data) + bytes_read;
1230 length -= bytes_read;
1231
1232 /* If we didn't get anything, that's bad. */
1233 if (bytes_read == 0)
1234 reader->error_callback(reader->error_callback_arg,
1235 "file \"%s\" ends unexpectedly",
1236 reader->error_filename);
1237 }
1238 else
1239 {
1240 /*
1241 * Refill our buffer.
1242 */
1243 buffer->used = buffer->io_callback(buffer->io_callback_arg,
1244 buffer->data, BUFSIZE);
1245 buffer->cursor = 0;
1246
1247 /* If we didn't get anything, that's bad. */
1248 if (buffer->used == 0)
1249 reader->error_callback(reader->error_callback_arg,
1250 "file \"%s\" ends unexpectedly",
1251 reader->error_filename);
1252 }
1253 }
1254}
1255
1256/*
1257 * Supply data to a BlockRefTableBuffer for write to the underlying File,
1258 * and update the running CRC calculation for that data.
1259 */
1260static void
1262{
1263 /* Update running CRC calculation. */
1264 COMP_CRC32C(buffer->crc, data, length);
1265
1266 /* If the new data can't fit into the buffer, flush the buffer. */
1267 if (buffer->used + length > BUFSIZE)
1268 {
1269 buffer->io_callback(buffer->io_callback_arg, buffer->data,
1270 buffer->used);
1271 buffer->used = 0;
1272 }
1273
1274 /* If the new data would fill the buffer, or more, write it directly. */
1275 if (length >= BUFSIZE)
1276 {
1277 buffer->io_callback(buffer->io_callback_arg, data, length);
1278 return;
1279 }
1280
1281 /* Otherwise, copy the new data into the buffer. */
1282 memcpy(&buffer->data[buffer->used], data, length);
1283 buffer->used += length;
1284 Assert(buffer->used <= BUFSIZE);
1285}
1286
1287/*
1288 * Generate the sentinel and CRC required at the end of a block reference
1289 * table file and flush them out of our internal buffer.
1290 */
1291static void
1293{
1294 BlockRefTableSerializedEntry zentry = {{0}};
1295 pg_crc32c crc;
1296
1297 /* Write a sentinel indicating that there are no more entries. */
1298 BlockRefTableWrite(buffer, &zentry,
1300
1301 /*
1302 * Writing the checksum will perturb the ongoing checksum calculation, so
1303 * copy the state first and finalize the computation using the copy.
1304 */
1305 crc = buffer->crc;
1306 FIN_CRC32C(crc);
1307 BlockRefTableWrite(buffer, &crc, sizeof(pg_crc32c));
1308
1309 /* Flush any leftover data out of our buffer. */
1310 BlockRefTableFlush(buffer);
1311}
void BlockRefTableFreeEntry(BlockRefTableEntry *entry)
Definition: blkreftable.c:1122
BlockRefTableEntry * BlockRefTableGetEntry(BlockRefTable *brtab, const RelFileLocator *rlocator, ForkNumber forknum, BlockNumber *limit_block)
Definition: blkreftable.c:340
struct BlockRefTableSerializedEntry BlockRefTableSerializedEntry
bool BlockRefTableReaderNextRelation(BlockRefTableReader *reader, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *limit_block)
Definition: blkreftable.c:613
int BlockRefTableEntryGetBlocks(BlockRefTableEntry *entry, BlockNumber start_blkno, BlockNumber stop_blkno, BlockNumber *blocks, int nblocks)
Definition: blkreftable.c:369
void BlockRefTableMarkBlockModified(BlockRefTable *brtab, const RelFileLocator *rlocator, ForkNumber forknum, BlockNumber blknum)
Definition: blkreftable.c:297
BlockRefTableWriter * CreateBlockRefTableWriter(io_callback_fn write_callback, void *write_callback_arg)
Definition: blkreftable.c:790
#define BLOCKS_PER_CHUNK
Definition: blkreftable.c:78
void BlockRefTableWriteEntry(BlockRefTableWriter *writer, BlockRefTableEntry *entry)
Definition: blkreftable.c:817
static void BlockRefTableRead(BlockRefTableReader *reader, void *data, int length)
Definition: blkreftable.c:1196
void BlockRefTableEntryMarkBlockModified(BlockRefTableEntry *entry, ForkNumber forknum, BlockNumber blknum)
Definition: blkreftable.c:965
BlockRefTableReader * CreateBlockRefTableReader(io_callback_fn read_callback, void *read_callback_arg, char *error_filename, report_error_fn error_callback, void *error_callback_arg)
Definition: blkreftable.c:577
unsigned BlockRefTableReaderGetBlocks(BlockRefTableReader *reader, BlockNumber *blocks, int nblocks)
Definition: blkreftable.c:689
#define BLOCKS_PER_ENTRY
Definition: blkreftable.c:79
void BlockRefTableSetLimitBlock(BlockRefTable *brtab, const RelFileLocator *rlocator, ForkNumber forknum, BlockNumber limit_block)
Definition: blkreftable.c:262
void BlockRefTableEntrySetLimitBlock(BlockRefTableEntry *entry, BlockNumber limit_block)
Definition: blkreftable.c:894
BlockRefTable * CreateEmptyBlockRefTable(void)
Definition: blkreftable.c:235
BlockRefTableEntry * CreateBlockRefTableEntry(RelFileLocator rlocator, ForkNumber forknum)
Definition: blkreftable.c:875
struct BlockRefTableBuffer BlockRefTableBuffer
void WriteBlockRefTable(BlockRefTable *brtab, io_callback_fn write_callback, void *write_callback_arg)
Definition: blkreftable.c:474
static void BlockRefTableFileTerminate(BlockRefTableBuffer *buffer)
Definition: blkreftable.c:1292
void DestroyBlockRefTableReader(BlockRefTableReader *reader)
Definition: blkreftable.c:773
#define MAX_ENTRIES_PER_CHUNK
Definition: blkreftable.c:80
void DestroyBlockRefTableWriter(BlockRefTableWriter *writer)
Definition: blkreftable.c:855
#define INITIAL_ENTRIES_PER_CHUNK
Definition: blkreftable.c:81
#define BUFSIZE
Definition: blkreftable.c:166
uint16 * BlockRefTableChunk
Definition: blkreftable.c:82
static void BlockRefTableWrite(BlockRefTableBuffer *buffer, void *data, int length)
Definition: blkreftable.c:1261
static void BlockRefTableFlush(BlockRefTableBuffer *buffer)
Definition: blkreftable.c:1184
struct BlockRefTableKey BlockRefTableKey
static int BlockRefTableComparator(const void *a, const void *b)
Definition: blkreftable.c:1152
void(* report_error_fn)(void *callback_arg, char *msg,...) pg_attribute_printf(2
Definition: blkreftable.h:47
int(* io_callback_fn)(void *callback_arg, void *data, int length)
Definition: blkreftable.h:46
#define BLOCKREFTABLE_MAGIC
Definition: blkreftable.h:30
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
#define Min(x, y)
Definition: c.h:961
#define Max(x, y)
Definition: c.h:955
#define Assert(condition)
Definition: c.h:815
uint16_t uint16
Definition: c.h:487
uint32_t uint32
Definition: c.h:488
uint64 chunk
int b
Definition: isn.c:69
int a
Definition: isn.c:68
int j
Definition: isn.c:73
int i
Definition: isn.c:72
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:76
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1541
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc0(Size size)
Definition: mcxt.c:1347
void * palloc(Size size)
Definition: mcxt.c:1317
MemoryContext CurrentMemoryContext
Definition: mcxt.c:143
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:124
uint32 pg_crc32c
Definition: pg_crc32c.h:38
#define COMP_CRC32C(crc, data, len)
Definition: pg_crc32c.h:98
#define EQ_CRC32C(c1, c2)
Definition: pg_crc32c.h:42
#define INIT_CRC32C(crc)
Definition: pg_crc32c.h:41
#define FIN_CRC32C(crc)
Definition: pg_crc32c.h:103
const void * data
return crc
#define qsort(a, b, c, d)
Definition: port.h:474
ForkNumber
Definition: relpath.h:56
io_callback_fn io_callback
Definition: blkreftable.c:173
char data[BUFSIZE]
Definition: blkreftable.c:175
BlockRefTableKey key
Definition: blkreftable.c:112
uint16 * chunk_usage
Definition: blkreftable.c:117
BlockNumber limit_block
Definition: blkreftable.c:113
BlockRefTableChunk * chunk_data
Definition: blkreftable.c:118
RelFileLocator rlocator
Definition: blkreftable.c:49
ForkNumber forknum
Definition: blkreftable.c:50
BlockRefTableBuffer buffer
Definition: blkreftable.c:202
uint16 chunk_data[MAX_ENTRIES_PER_CHUNK]
Definition: blkreftable.c:209
report_error_fn error_callback
Definition: blkreftable.c:204
BlockRefTableBuffer buffer
Definition: blkreftable.c:219
blockreftable_hash * hash
Definition: blkreftable.c:146
MemoryContext mcxt
Definition: blkreftable.c:148
RelFileNumber relNumber