bulk_write.c
/*-------------------------------------------------------------------------
 *
 * bulk_write.c
 *	  Efficiently and reliably populate a new relation
 *
 * The assumption is that no other backends access the relation while we are
 * loading it, so we can take some shortcuts. Do not mix operations through
 * the regular buffer manager and the bulk loading interface!
 *
 * We bypass the buffer manager to avoid the locking overhead, and call
 * smgrextend() directly. A downside is that the pages will need to be
 * re-read into shared buffers on first use after the build finishes. That's
 * usually a good tradeoff for large relations, and for small relations, the
 * overhead isn't very significant compared to creating the relation in the
 * first place.
 *
 * The pages are WAL-logged if needed. To save on WAL header overhead, we
 * WAL-log several pages in one record.
 *
 * One tricky point is that because we bypass the buffer manager, we need to
 * register the relation for fsyncing at the next checkpoint ourselves, and
 * make sure that the relation is correctly fsync'd by us or the checkpointer
 * even if a checkpoint happens concurrently.
 *
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/storage/smgr/bulk_write.c
 *
 *-------------------------------------------------------------------------
 */
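/*
 * Typical usage, as a sketch (illustrative only; "rel" stands for a relation
 * the caller has opened and locked, and "nblocks" for the number of pages to
 * build):
 *
 *		BulkWriteState *bulkstate = smgr_bulk_start_rel(rel, MAIN_FORKNUM);
 *
 *		for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
 *		{
 *			BulkWriteBuffer buf = smgr_bulk_get_buf(bulkstate);
 *
 *			PageInit((Page) buf->data, BLCKSZ, 0);
 *			... fill the page contents ...
 *			smgr_bulk_write(bulkstate, blkno, buf, true);
 *		}
 *		smgr_bulk_finish(bulkstate);
 */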
#include "postgres.h"

#include "access/xloginsert.h"
#include "access/xlogrecord.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "storage/bulk_write.h"
#include "storage/proc.h"
#include "storage/smgr.h"
#include "utils/rel.h"

#define MAX_PENDING_WRITES	XLR_MAX_BLOCK_ID

static const PGIOAlignedBlock zero_buffer = {{0}};	/* worth BLCKSZ */
typedef struct PendingWrite
{
	BulkWriteBuffer buf;
	BlockNumber blkno;
	bool		page_std;
} PendingWrite;

/*
 * Bulk writer state for one relation fork.
 */
struct BulkWriteState
{
	/* Information about the target relation we're writing */
	SMgrRelation smgr;
	ForkNumber	forknum;
	bool		use_wal;

	/* We keep several writes queued, and WAL-log them in batches */
	int			npending;
	PendingWrite pending_writes[MAX_PENDING_WRITES];

	/* Current size of the relation */
	BlockNumber pages_written;

	/* The RedoRecPtr at the time that the bulk operation started */
	XLogRecPtr	start_RedoRecPtr;

	MemoryContext memcxt;
};
static void smgr_bulk_flush(BulkWriteState *bulkstate);
/*
 * Start a bulk write operation on a relation fork.
 */
BulkWriteState *
smgr_bulk_start_rel(Relation rel, ForkNumber forknum)
{
	return smgr_bulk_start_smgr(RelationGetSmgr(rel),
								forknum,
								RelationNeedsWAL(rel) || forknum == INIT_FORKNUM);
}
/*
 * Start a bulk write operation on a relation fork.
 *
 * This is like smgr_bulk_start_rel, but can be used without a relcache entry.
 */
BulkWriteState *
smgr_bulk_start_smgr(SMgrRelation smgr, ForkNumber forknum, bool use_wal)
{
	BulkWriteState *state;

	state = palloc(sizeof(BulkWriteState));
	state->smgr = smgr;
	state->forknum = forknum;
	state->use_wal = use_wal;

	state->npending = 0;
	state->pages_written = 0;

	state->start_RedoRecPtr = GetRedoRecPtr();

	/*
	 * Remember the memory context. We will use it to allocate all the
	 * buffers later.
	 */
	state->memcxt = CurrentMemoryContext;

	return state;
}
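/*
 * For example, a caller with no relcache entry at hand might do something
 * like the following (a sketch; "rlocator" is assumed to identify storage
 * that has already been created with smgrcreate()):
 *
 *		SMgrRelation reln = smgropen(rlocator, INVALID_PROC_NUMBER);
 *		BulkWriteState *bulkstate = smgr_bulk_start_smgr(reln, INIT_FORKNUM, true);
 */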
/*
 * Finish bulk write operation.
 *
 * This WAL-logs and flushes any remaining pending writes to disk, and fsyncs
 * the relation if needed.
 */
void
smgr_bulk_finish(BulkWriteState *bulkstate)
{
	/* WAL-log and flush any remaining pages */
	smgr_bulk_flush(bulkstate);

	/*
	 * Fsync the relation, or register it for the next checkpoint, if
	 * necessary.
	 */
	if (SmgrIsTemp(bulkstate->smgr))
	{
		/* Temporary relations don't need to be fsync'd, ever */
	}
	else if (!bulkstate->use_wal)
	{
		/*----------
		 * This is either an unlogged relation, or a permanent relation but
		 * we skipped WAL-logging because wal_level=minimal:
		 *
		 * A) Unlogged relation
		 *
		 *    Unlogged relations will go away on crash, but they need to be
		 *    fsync'd on a clean shutdown. It's sufficient to call
		 *    smgrregistersync(), which ensures that the checkpointer will
		 *    flush the relation at the shutdown checkpoint. (It will flush
		 *    it on the next online checkpoint too, which is not strictly
		 *    necessary.)
		 *
		 *    Note that the init-fork of an unlogged relation is not
		 *    considered unlogged for our purposes. It's treated like a
		 *    regular permanent relation. The callers will pass use_wal=true
		 *    for the init fork.
		 *
		 * B) Permanent relation, WAL-logging skipped because
		 *    wal_level=minimal
		 *
		 *    This is a new relation, and we didn't WAL-log the pages as we
		 *    wrote them, but they need to be fsync'd before commit.
		 *
		 *    We don't need to do that here, however. The fsync() is done at
		 *    commit, by smgrDoPendingSyncs() (*).
		 *
		 *    (*) smgrDoPendingSyncs() might decide to WAL-log the whole
		 *    relation at commit instead of fsyncing it, if the relation was
		 *    very small, but that is smgrDoPendingSyncs()'s responsibility
		 *    in any case.
		 *
		 * We cannot distinguish the two cases here, so conservatively assume
		 * it's an unlogged relation. A permanent relation with
		 * wal_level=minimal would require no action at all, see above.
		 */
		smgrregistersync(bulkstate->smgr, bulkstate->forknum);
	}
	else
	{
		/*
		 * Permanent relation, WAL-logged normally.
		 *
		 * We already WAL-logged all the pages, so they will be replayed from
		 * WAL on crash. However, when we wrote out the pages, we passed
		 * skipFsync=true to avoid the overhead of registering all the writes
		 * with the checkpointer. Register the whole relation now.
		 *
		 * There is one hole in that idea: if a checkpoint occurred while we
		 * were writing the pages, it already missed fsyncing the pages we
		 * had written before the checkpoint started. A crash later on would
		 * replay the WAL starting from that checkpoint, and therefore
		 * wouldn't replay our earlier WAL records. So if a checkpoint
		 * started after the bulk write began, fsync the files now.
		 */

		/*
		 * Prevent a checkpoint from starting between the GetRedoRecPtr() and
		 * smgrregistersync() calls.
		 */
		Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
		MyProc->delayChkptFlags |= DELAY_CHKPT_START;

		if (bulkstate->start_RedoRecPtr != GetRedoRecPtr())
		{
			/*
			 * A checkpoint occurred and it didn't know about our writes, so
			 * fsync() the relation ourselves.
			 */
			MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
			smgrimmedsync(bulkstate->smgr, bulkstate->forknum);
			elog(DEBUG1, "flushed relation because a checkpoint occurred concurrently");
		}
		else
		{
			smgrregistersync(bulkstate->smgr, bulkstate->forknum);
			MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
		}
	}
}
static int
buffer_cmp(const void *a, const void *b)
{
	const PendingWrite *bufa = (const PendingWrite *) a;
	const PendingWrite *bufb = (const PendingWrite *) b;

	/* We should not see duplicate writes of the same block */
	Assert(bufa->blkno != bufb->blkno);
	if (bufa->blkno > bufb->blkno)
		return 1;
	else
		return -1;
}
/*
 * Finish all the pending writes.
 */
static void
smgr_bulk_flush(BulkWriteState *bulkstate)
{
	int			npending = bulkstate->npending;
	PendingWrite *pending_writes = bulkstate->pending_writes;

	if (npending == 0)
		return;

	if (npending > 1)
		qsort(pending_writes, npending, sizeof(PendingWrite), buffer_cmp);

	if (bulkstate->use_wal)
	{
		BlockNumber blknos[MAX_PENDING_WRITES];
		Page		pages[MAX_PENDING_WRITES];
		bool		page_std = true;

		for (int i = 0; i < npending; i++)
		{
			blknos[i] = pending_writes[i].blkno;
			pages[i] = pending_writes[i].buf->data;

			/*
			 * If any of the pages use !page_std, we log them all as such.
			 * That's a bit wasteful, but in practice, a mix of standard and
			 * non-standard page layouts is rare. None of the built-in AMs
			 * does that.
			 */
			if (!pending_writes[i].page_std)
				page_std = false;
		}
		log_newpages(&bulkstate->smgr->smgr_rlocator.locator, bulkstate->forknum,
					 npending, blknos, pages, page_std);
	}

	for (int i = 0; i < npending; i++)
	{
		BlockNumber blkno = pending_writes[i].blkno;
		Page		page = pending_writes[i].buf->data;

		PageSetChecksumInplace(page, blkno);

		if (blkno >= bulkstate->pages_written)
		{
			/*
			 * If we have to write pages nonsequentially, fill in the space
			 * with zeroes until we come back and overwrite. This is not
			 * logically necessary on standard Unix filesystems (unwritten
			 * space will read as zeroes anyway), but it should help to
			 * avoid fragmentation. The dummy pages aren't WAL-logged,
			 * though.
			 */
			while (blkno > bulkstate->pages_written)
			{
				/* don't set checksum for all-zero page */
				smgrextend(bulkstate->smgr, bulkstate->forknum,
						   bulkstate->pages_written++,
						   &zero_buffer,
						   true);
			}

			smgrextend(bulkstate->smgr, bulkstate->forknum, blkno, page, true);
			bulkstate->pages_written = pending_writes[i].blkno + 1;
		}
		else
			smgrwrite(bulkstate->smgr, bulkstate->forknum, blkno, page, true);
		pfree(page);
	}

	bulkstate->npending = 0;
}
/*
 * Queue write of 'buf'.
 *
 * NB: this takes ownership of 'buf'!
 *
 * You are only allowed to write a given block once as part of one bulk write
 * operation.
 */
void
smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std)
{
	PendingWrite *w;

	w = &bulkstate->pending_writes[bulkstate->npending++];
	w->buf = buf;
	w->blkno = blocknum;
	w->page_std = page_std;

	if (bulkstate->npending == MAX_PENDING_WRITES)
		smgr_bulk_flush(bulkstate);
}
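/*
 * Note on batching: MAX_PENDING_WRITES is XLR_MAX_BLOCK_ID (32 as of this
 * writing), so with WAL-logging enabled, each batch hands at most that many
 * pages to log_newpages() at once.
 */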
/*
 * Allocate a new buffer which can later be written with smgr_bulk_write().
 *
 * There is no function to free the buffer. When you pass it to
 * smgr_bulk_write(), it takes ownership and frees it when it's no longer
 * needed.
 *
 * This is currently implemented as a simple palloc, but could be implemented
 * using a ring buffer or larger chunks in the future, so don't rely on it.
 */
BulkWriteBuffer
smgr_bulk_get_buf(BulkWriteState *bulkstate)
{
	return MemoryContextAllocAligned(bulkstate->memcxt, BLCKSZ, PG_IO_ALIGN_SIZE, 0);
}