PostgreSQL Source Code git master
Loading...
Searching...
No Matches
read_stream.c File Reference
#include "postgres.h"
#include "miscadmin.h"
#include "executor/instrument_node.h"
#include "storage/aio.h"
#include "storage/fd.h"
#include "storage/smgr.h"
#include "storage/read_stream.h"
#include "utils/memdebug.h"
#include "utils/rel.h"
#include "utils/spccache.h"
Include dependency graph for read_stream.c:

Go to the source code of this file.

Data Structures

struct  InProgressIO
 
struct  ReadStream
 

Typedefs

typedef struct InProgressIO InProgressIO
 

Functions

static void * get_per_buffer_data (ReadStream *stream, int16 buffer_index)
 
BlockNumber block_range_read_stream_cb (ReadStream *stream, void *callback_private_data, void *per_buffer_data)
 
static void read_stream_count_prefetch (ReadStream *stream)
 
static void read_stream_count_io (ReadStream *stream, int nblocks, int in_progress)
 
static void read_stream_count_wait (ReadStream *stream)
 
void read_stream_enable_stats (ReadStream *stream, IOStats *stats)
 
static BlockNumber read_stream_get_block (ReadStream *stream, void *per_buffer_data)
 
static void read_stream_unget_block (ReadStream *stream, BlockNumber blocknum)
 
static bool read_stream_start_pending_read (ReadStream *stream)
 
static bool read_stream_should_look_ahead (ReadStream *stream)
 
static bool read_stream_should_issue_now (ReadStream *stream)
 
static void read_stream_look_ahead (ReadStream *stream)
 
static ReadStream * read_stream_begin_impl (int flags, BufferAccessStrategy strategy, Relation rel, SMgrRelation smgr, char persistence, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
 
ReadStream * read_stream_begin_relation (int flags, BufferAccessStrategy strategy, Relation rel, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
 
ReadStream * read_stream_begin_smgr_relation (int flags, BufferAccessStrategy strategy, SMgrRelation smgr, char smgr_persistence, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
 
Buffer read_stream_next_buffer (ReadStream *stream, void **per_buffer_data)
 
BlockNumber read_stream_next_block (ReadStream *stream, BufferAccessStrategy *strategy)
 
BlockNumber read_stream_pause (ReadStream *stream)
 
void read_stream_resume (ReadStream *stream)
 
void read_stream_reset (ReadStream *stream)
 
void read_stream_end (ReadStream *stream)
 

Typedef Documentation

◆ InProgressIO

Function Documentation

◆ block_range_read_stream_cb()

◆ get_per_buffer_data()

static void * get_per_buffer_data ( ReadStream * stream,
int16  buffer_index 
)
inlinestatic

Definition at line 172 of file read_stream.c.

173{
174 return (char *) stream->per_buffer_data +
175 stream->per_buffer_data_size * buffer_index;
176}
void * per_buffer_data
size_t per_buffer_data_size

References ReadStream::per_buffer_data, and ReadStream::per_buffer_data_size.

Referenced by read_stream_look_ahead(), and read_stream_next_buffer().

◆ read_stream_begin_impl()

static ReadStream * read_stream_begin_impl ( int  flags,
BufferAccessStrategy  strategy,
Relation  rel,
SMgrRelation  smgr,
char  persistence,
ForkNumber  forknum,
ReadStreamBlockNumberCB  callback,
void * callback_private_data,
size_t  per_buffer_data_size 
)
static

Definition at line 759 of file read_stream.c.

768{
769 ReadStream *stream;
770 size_t size;
771 int16 queue_size;
773 int max_ios;
775 uint32 max_pinned_buffers;
777 Oid tablespace_id;
778
779 /*
780 * Reject attempts to read non-local temporary relations; we would be
781 * likely to get wrong data since we have no visibility into the owning
782 * session's local buffers.
783 */
784 if (rel && RELATION_IS_OTHER_TEMP(rel))
787 errmsg("cannot access temporary tables of other sessions")));
788
789 /*
790 * Decide how many I/Os we will allow to run at the same time. This
791 * number also affects how far we look ahead for opportunities to start
792 * more I/Os.
793 */
794 tablespace_id = smgr->smgr_rlocator.locator.spcOid;
795 if (!OidIsValid(MyDatabaseId) ||
796 (rel && IsCatalogRelation(rel)) ||
798 {
799 /*
800 * Avoid circularity while trying to look up tablespace settings or
801 * before spccache.c is ready.
802 */
803 max_ios = effective_io_concurrency;
804 }
805 else if (flags & READ_STREAM_MAINTENANCE)
806 max_ios = get_tablespace_maintenance_io_concurrency(tablespace_id);
807 else
808 max_ios = get_tablespace_io_concurrency(tablespace_id);
809
810 /* Cap to INT16_MAX to avoid overflowing below */
811 max_ios = Min(max_ios, PG_INT16_MAX);
812
813 /*
814 * If starting a multi-block I/O near the end of the queue, we might
815 * temporarily need extra space for overflowing buffers before they are
816 * moved to regular circular position. This is the maximum extra space we
817 * could need.
818 */
820
821 /*
822 * Choose the maximum number of buffers we're prepared to pin. We try to
823 * pin fewer if we can, though. We add one so that we can make progress
824 * even if max_ios is set to 0 (see also further down). For max_ios > 0,
825 * this also allows an extra full I/O's worth of buffers: after an I/O
826 * finishes we don't want to have to wait for its buffers to be consumed
827 * before starting a new one.
828 *
829 * Be careful not to allow int16 to overflow. That is possible with the
830 * current GUC range limits, so this is an artificial limit of ~32k
831 * buffers and we'd need to adjust the types to exceed that. We also have
832 * to allow for the spare entry and the overflow space.
833 */
834 max_pinned_buffers = (max_ios + 1) * io_combine_limit;
835 max_pinned_buffers = Min(max_pinned_buffers,
837
838 /* Give the strategy a chance to limit the number of buffers we pin. */
840 max_pinned_buffers = Min(strategy_pin_limit, max_pinned_buffers);
841
842 /*
843 * Also limit our queue to the maximum number of pins we could ever be
844 * allowed to acquire according to the buffer manager. We may not really
845 * be able to use them all due to other pins held by this backend, but
846 * we'll check that later in read_stream_start_pending_read().
847 */
848 if (SmgrIsTemp(smgr))
850 else
852 max_pinned_buffers = Min(max_pinned_buffers, max_possible_buffer_limit);
853
854 /*
855 * The limit might be zero on a system configured with too few buffers for
856 * the number of connections. We need at least one to make progress.
857 */
858 max_pinned_buffers = Max(1, max_pinned_buffers);
859
860 /*
861 * We need one extra entry for buffers and per-buffer data, because users
862 * of per-buffer data have access to the object until the next call to
863 * read_stream_next_buffer(), so we need a gap between the head and tail
864 * of the queue so that we don't clobber it.
865 */
866 queue_size = max_pinned_buffers + 1;
867
868 /*
869 * Allocate the object, the buffers, the ios and per_buffer_data space in
870 * one big chunk. Though we have queue_size buffers, we want to be able
871 * to assume that all the buffers for a single read are contiguous (i.e.
872 * don't wrap around halfway through), so we allow temporary overflows of
873 * up to the maximum possible overflow size.
874 */
875 size = offsetof(ReadStream, buffers);
876 size += sizeof(Buffer) * (queue_size + queue_overflow);
877 size += sizeof(InProgressIO) * Max(1, max_ios);
878 size += per_buffer_data_size * queue_size;
879 size += MAXIMUM_ALIGNOF * 2;
880 stream = (ReadStream *) palloc(size);
881 memset(stream, 0, offsetof(ReadStream, buffers));
882 stream->ios = (InProgressIO *)
883 MAXALIGN(&stream->buffers[queue_size + queue_overflow]);
884 if (per_buffer_data_size > 0)
885 stream->per_buffer_data = (void *)
886 MAXALIGN(&stream->ios[Max(1, max_ios)]);
887
888 stream->sync_mode = io_method == IOMETHOD_SYNC;
889 stream->batch_mode = flags & READ_STREAM_USE_BATCHING;
890
891#ifdef USE_PREFETCH
892
893 /*
894 * Read-ahead advice simulating asynchronous I/O with synchronous calls.
895 * Issue advice only if AIO is not used, direct I/O isn't enabled, the
896 * caller hasn't promised sequential access (overriding our detection
897 * heuristics), and max_ios hasn't been set to zero.
898 */
899 if (stream->sync_mode &&
901 (flags & READ_STREAM_SEQUENTIAL) == 0 &&
902 max_ios > 0)
903 stream->advice_enabled = true;
904#endif
905
906 /*
907 * Setting max_ios to zero disables AIO and advice-based pseudo AIO, but
908 * we still need to allocate space to combine and run one I/O. Bump it up
909 * to one, and remember to ask for synchronous I/O only.
910 */
911 if (max_ios == 0)
912 {
913 max_ios = 1;
915 }
916
917 /*
918 * Capture stable values for these two GUC-derived numbers for the
919 * lifetime of this stream, so we don't have to worry about the GUCs
920 * changing underneath us beyond this point.
921 */
922 stream->max_ios = max_ios;
924
925 stream->per_buffer_data_size = per_buffer_data_size;
926 stream->max_pinned_buffers = max_pinned_buffers;
927 stream->queue_size = queue_size;
928 stream->callback = callback;
929 stream->callback_private_data = callback_private_data;
933 stream->temporary = SmgrIsTemp(smgr);
934 stream->distance_decay_holdoff = 0;
935
936 /*
937 * Skip the initial ramp-up phase if the caller says we're going to be
938 * reading the whole relation. This way we start out assuming we'll be
939 * doing full io_combine_limit sized reads.
940 */
941 if (flags & READ_STREAM_FULL)
942 {
943 stream->readahead_distance = Min(max_pinned_buffers, stream->io_combine_limit);
944 stream->combine_distance = Min(max_pinned_buffers, stream->io_combine_limit);
945 }
946 else
947 {
948 stream->readahead_distance = 1;
949 stream->combine_distance = 1;
950 }
953
954 /*
955 * Since we always access the same relation, we can initialize parts of
956 * the ReadBuffersOperation objects and leave them that way, to avoid
957 * wasting CPU cycles writing to them for each read.
958 */
959 for (int i = 0; i < max_ios; ++i)
960 {
961 stream->ios[i].op.rel = rel;
962 stream->ios[i].op.smgr = smgr;
963 stream->ios[i].op.persistence = persistence;
964 stream->ios[i].op.forknum = forknum;
965 stream->ios[i].op.strategy = strategy;
966 }
967
968 return stream;
969}
int io_method
Definition aio.c:74
@ IOMETHOD_SYNC
Definition aio.h:34
int Buffer
Definition buf.h:23
int effective_io_concurrency
Definition bufmgr.c:200
int io_combine_limit
Definition bufmgr.c:215
uint32 GetPinLimit(void)
Definition bufmgr.c:2695
#define READ_BUFFERS_SYNCHRONOUSLY
Definition bufmgr.h:128
#define Min(x, y)
Definition c.h:1091
#define MAXALIGN(LEN)
Definition c.h:896
#define Max(x, y)
Definition c.h:1085
int16_t int16
Definition c.h:619
uint32_t uint32
Definition c.h:624
#define PG_INT16_MAX
Definition c.h:670
#define OidIsValid(objectId)
Definition c.h:858
bool IsCatalogRelation(Relation relation)
Definition catalog.c:104
bool IsCatalogRelationOid(Oid relid)
Definition catalog.c:121
int errcode(int sqlerrcode)
Definition elog.c:875
#define ERROR
Definition elog.h:40
#define ereport(elevel,...)
Definition elog.h:152
int io_direct_flags
Definition fd.c:172
#define IO_DIRECT_DATA
Definition fd.h:54
int GetAccessStrategyPinLimit(BufferAccessStrategy strategy)
Definition freelist.c:574
Oid MyDatabaseId
Definition globals.c:96
int i
Definition isn.c:77
uint32 GetLocalPinLimit(void)
Definition localbuf.c:308
void * palloc(Size size)
Definition mcxt.c:1390
static char * errmsg
unsigned int Oid
static int fb(int x)
#define READ_STREAM_MAINTENANCE
Definition read_stream.h:28
#define READ_STREAM_USE_BATCHING
Definition read_stream.h:64
#define READ_STREAM_FULL
Definition read_stream.h:43
#define READ_STREAM_SEQUENTIAL
Definition read_stream.h:36
#define RELATION_IS_OTHER_TEMP(relation)
Definition rel.h:678
#define SmgrIsTemp(smgr)
Definition smgr.h:74
int get_tablespace_io_concurrency(Oid spcid)
Definition spccache.c:216
int get_tablespace_maintenance_io_concurrency(Oid spcid)
Definition spccache.c:230
ReadBuffersOperation op
Definition read_stream.c:89
ForkNumber forknum
Definition bufmgr.h:137
SMgrRelation smgr
Definition bufmgr.h:135
BufferAccessStrategy strategy
Definition bufmgr.h:138
int16 io_combine_limit
Definition read_stream.c:98
uint16 distance_decay_holdoff
BlockNumber seq_until_processed
int16 max_ios
Definition read_stream.c:97
BlockNumber seq_blocknum
bool batch_mode
bool advice_enabled
int16 max_pinned_buffers
InProgressIO * ios
int16 combine_distance
int16 readahead_distance
int read_buffers_flags
int16 resume_readahead_distance
BlockNumber buffered_blocknum
int16 queue_size
int16 resume_combine_distance
ReadStreamBlockNumberCB callback
void * callback_private_data
Buffer buffers[FLEXIBLE_ARRAY_MEMBER]
RelFileLocator locator
RelFileNumber relNumber
RelFileLocatorBackend smgr_rlocator
Definition smgr.h:38
static void callback(struct sockaddr *addr, struct sockaddr *mask, void *unused)

References ReadStream::advice_enabled, ReadStream::batch_mode, ReadStream::buffered_blocknum, ReadStream::buffers, ReadStream::callback, callback(), ReadStream::callback_private_data, ReadStream::combine_distance, ReadStream::distance_decay_holdoff, effective_io_concurrency, ereport, errcode(), errmsg, ERROR, fb(), ReadBuffersOperation::forknum, get_tablespace_io_concurrency(), get_tablespace_maintenance_io_concurrency(), GetAccessStrategyPinLimit(), GetLocalPinLimit(), GetPinLimit(), i, InvalidBlockNumber, ReadStream::io_combine_limit, io_combine_limit, IO_DIRECT_DATA, io_direct_flags, io_method, IOMETHOD_SYNC, ReadStream::ios, IsCatalogRelation(), IsCatalogRelationOid(), RelFileLocatorBackend::locator, Max, ReadStream::max_ios, ReadStream::max_pinned_buffers, MAXALIGN, Min, MyDatabaseId, OidIsValid, InProgressIO::op, palloc(), ReadStream::per_buffer_data, ReadStream::per_buffer_data_size, ReadBuffersOperation::persistence, PG_INT16_MAX, ReadStream::queue_size, ReadStream::read_buffers_flags, READ_BUFFERS_SYNCHRONOUSLY, READ_STREAM_FULL, READ_STREAM_MAINTENANCE, READ_STREAM_SEQUENTIAL, READ_STREAM_USE_BATCHING, ReadStream::readahead_distance, ReadBuffersOperation::rel, RELATION_IS_OTHER_TEMP, RelFileLocator::relNumber, ReadStream::resume_combine_distance, ReadStream::resume_readahead_distance, ReadStream::seq_blocknum, ReadStream::seq_until_processed, ReadBuffersOperation::smgr, SMgrRelationData::smgr_rlocator, SmgrIsTemp, RelFileLocator::spcOid, ReadBuffersOperation::strategy, ReadStream::sync_mode, and ReadStream::temporary.

Referenced by read_stream_begin_relation(), and read_stream_begin_smgr_relation().

◆ read_stream_begin_relation()

ReadStream * read_stream_begin_relation ( int  flags,
BufferAccessStrategy  strategy,
Relation  rel,
ForkNumber  forknum,
ReadStreamBlockNumberCB  callback,
void * callback_private_data,
size_t  per_buffer_data_size 
)

Definition at line 976 of file read_stream.c.

983{
984 return read_stream_begin_impl(flags,
985 strategy,
986 rel,
987 RelationGetSmgr(rel),
988 rel->rd_rel->relpersistence,
989 forknum,
990 callback,
991 callback_private_data,
992 per_buffer_data_size);
993}
static ReadStream * read_stream_begin_impl(int flags, BufferAccessStrategy strategy, Relation rel, SMgrRelation smgr, char persistence, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
static SMgrRelation RelationGetSmgr(Relation rel)
Definition rel.h:578
Form_pg_class rd_rel
Definition rel.h:111

References callback(), RelationData::rd_rel, read_stream_begin_impl(), and RelationGetSmgr().

Referenced by acquire_sample_rows(), autoprewarm_database_main(), blbulkdelete(), blgetbitmap(), blvacuumcleanup(), brin_vacuum_scan(), btvacuumscan(), collect_corrupt_items(), collect_visibility_data(), ginvacuumcleanup(), gistvacuumscan(), hashbulkdelete(), heap_beginscan(), lazy_scan_heap(), lazy_vacuum_heap_rel(), pg_prewarm(), pgstathashindex(), pgstatindex_impl(), read_stream_for_blocks(), spgvacuumscan(), statapprox_heap(), and verify_heapam().

◆ read_stream_begin_smgr_relation()

ReadStream * read_stream_begin_smgr_relation ( int  flags,
BufferAccessStrategy  strategy,
SMgrRelation  smgr,
char  smgr_persistence,
ForkNumber  forknum,
ReadStreamBlockNumberCB  callback,
void * callback_private_data,
size_t  per_buffer_data_size 
)

Definition at line 1000 of file read_stream.c.

1008{
1009 return read_stream_begin_impl(flags,
1010 strategy,
1011 NULL,
1012 smgr,
1014 forknum,
1015 callback,
1016 callback_private_data,
1017 per_buffer_data_size);
1018}

References callback(), fb(), and read_stream_begin_impl().

Referenced by RelationCopyStorageUsingBuffer().

◆ read_stream_count_io()

static void read_stream_count_io ( ReadStream * stream,
int  nblocks,
int  in_progress 
)
inlinestatic

Definition at line 223 of file read_stream.c.

224{
225 IOStats *stats = stream->stats;
226
227 if (stats == NULL)
228 return;
229
230 stats->io_count++;
231 stats->io_nblocks += nblocks;
232 stats->io_in_progress += in_progress;
233}
uint64 io_count
uint64 io_in_progress
uint64 io_nblocks
IOStats * stats

References fb(), IOStats::io_count, IOStats::io_in_progress, IOStats::io_nblocks, and ReadStream::stats.

Referenced by read_stream_next_buffer(), and read_stream_start_pending_read().

◆ read_stream_count_prefetch()

static void read_stream_count_prefetch ( ReadStream * stream)
inlinestatic

Definition at line 203 of file read_stream.c.

204{
205 IOStats *stats = stream->stats;
206
207 if (stats == NULL)
208 return;
209
210 stats->prefetch_count++;
211 stats->distance_sum += stream->pinned_buffers;
212 if (stream->pinned_buffers > stats->distance_max)
213 stats->distance_max = stream->pinned_buffers;
214}
int16 distance_max
uint64 distance_sum
uint64 prefetch_count
int16 pinned_buffers

References IOStats::distance_max, IOStats::distance_sum, fb(), ReadStream::pinned_buffers, IOStats::prefetch_count, and ReadStream::stats.

Referenced by read_stream_next_buffer().

◆ read_stream_count_wait()

static void read_stream_count_wait ( ReadStream * stream)
inlinestatic

Definition at line 241 of file read_stream.c.

242{
243 IOStats *stats = stream->stats;
244
245 if (stats == NULL)
246 return;
247
248 stats->wait_count++;
249}
uint64 wait_count

References fb(), ReadStream::stats, and IOStats::wait_count.

Referenced by read_stream_next_buffer().

◆ read_stream_enable_stats()

void read_stream_enable_stats ( ReadStream * stream,
IOStats * stats 
)

Definition at line 255 of file read_stream.c.

256{
257 stream->stats = stats;
258 if (stream->stats)
259 stream->stats->distance_capacity = stream->max_pinned_buffers;
260}
int16 distance_capacity

References IOStats::distance_capacity, ReadStream::max_pinned_buffers, and ReadStream::stats.

Referenced by heap_beginscan().

◆ read_stream_end()

◆ read_stream_get_block()

static BlockNumber read_stream_get_block ( ReadStream * stream,
void * per_buffer_data 
)
inlinestatic

Definition at line 267 of file read_stream.c.

268{
269 BlockNumber blocknum;
270
271 blocknum = stream->buffered_blocknum;
272 if (blocknum != InvalidBlockNumber)
274 else
275 {
276 /*
277 * Tell Valgrind that the per-buffer data is undefined. That replaces
278 * the "noaccess" state that was set when the consumer moved past this
279 * entry last time around the queue, and should also catch callbacks
280 * that fail to initialize data that the buffer consumer later
281 * accesses. On the first go around, it is undefined already.
282 */
283 VALGRIND_MAKE_MEM_UNDEFINED(per_buffer_data,
284 stream->per_buffer_data_size);
285 blocknum = stream->callback(stream,
286 stream->callback_private_data,
287 per_buffer_data);
288 }
289
290 return blocknum;
291}
uint32 BlockNumber
Definition block.h:31
#define VALGRIND_MAKE_MEM_UNDEFINED(addr, size)
Definition memdebug.h:28

References ReadStream::buffered_blocknum, ReadStream::callback, ReadStream::callback_private_data, InvalidBlockNumber, ReadStream::per_buffer_data_size, and VALGRIND_MAKE_MEM_UNDEFINED.

Referenced by read_stream_look_ahead(), read_stream_next_block(), and read_stream_next_buffer().

◆ read_stream_look_ahead()

static void read_stream_look_ahead ( ReadStream * stream)
static

Definition at line 658 of file read_stream.c.

659{
660 /*
661 * Allow amortizing the cost of submitting IO over multiple IOs. This
662 * requires that we don't do any operations that could lead to a deadlock
663 * with staged-but-unsubmitted IO. The callback needs to opt-in to being
664 * careful.
665 */
666 if (stream->batch_mode)
668
669 while (read_stream_should_look_ahead(stream))
670 {
671 BlockNumber blocknum;
672 int16 buffer_index;
673 void *per_buffer_data;
674
676 {
678 continue;
679 }
680
681 /*
682 * See which block the callback wants next in the stream. We need to
683 * compute the index of the Nth block of the pending read including
684 * wrap-around, but we don't want to use the expensive % operator.
685 */
686 buffer_index = stream->next_buffer_index + stream->pending_read_nblocks;
687 if (buffer_index >= stream->queue_size)
688 buffer_index -= stream->queue_size;
689 Assert(buffer_index >= 0 && buffer_index < stream->queue_size);
690 per_buffer_data = get_per_buffer_data(stream, buffer_index);
691 blocknum = read_stream_get_block(stream, per_buffer_data);
692 if (blocknum == InvalidBlockNumber)
693 {
694 /* End of stream. */
695 stream->readahead_distance = 0;
696 stream->combine_distance = 0;
697 break;
698 }
699
700 /* Can we merge it with the pending read? */
701 if (stream->pending_read_nblocks > 0 &&
702 stream->pending_read_blocknum + stream->pending_read_nblocks == blocknum)
703 {
704 stream->pending_read_nblocks++;
705 continue;
706 }
707
708 /* We have to start the pending read before we can build another. */
709 while (stream->pending_read_nblocks > 0)
710 {
711 if (!read_stream_start_pending_read(stream) ||
712 stream->ios_in_progress == stream->max_ios)
713 {
714 /* We've hit the buffer or I/O limit. Rewind and stop here. */
715 read_stream_unget_block(stream, blocknum);
716 if (stream->batch_mode)
718 return;
719 }
720 }
721
722 /* This is the start of a new pending read. */
723 stream->pending_read_blocknum = blocknum;
724 stream->pending_read_nblocks = 1;
725 }
726
727 /*
728 * Check if the pending read should be issued now, or if we should give it
729 * another chance to grow to the full size.
730 *
731 * Note that the pending read can exceed the distance goal, if the latter
732 * was reduced after hitting the per-backend buffer limit.
733 */
736
737 /*
738 * There should always be something pinned when we leave this function,
739 * whether started by this call or not, unless we've hit the end of the
740 * stream. In the worst case we can always make progress one buffer at a
741 * time.
742 */
743 Assert(stream->pinned_buffers > 0 || stream->readahead_distance == 0);
744
745 if (stream->batch_mode)
747}
void pgaio_enter_batchmode(void)
Definition aio.c:1091
void pgaio_exit_batchmode(void)
Definition aio.c:1102
#define Assert(condition)
Definition c.h:943
static void * get_per_buffer_data(ReadStream *stream, int16 buffer_index)
static bool read_stream_start_pending_read(ReadStream *stream)
static bool read_stream_should_issue_now(ReadStream *stream)
static bool read_stream_should_look_ahead(ReadStream *stream)
static BlockNumber read_stream_get_block(ReadStream *stream, void *per_buffer_data)
static void read_stream_unget_block(ReadStream *stream, BlockNumber blocknum)
int16 ios_in_progress
Definition read_stream.c:99
BlockNumber pending_read_blocknum
int16 next_buffer_index
int16 pending_read_nblocks

References Assert, ReadStream::batch_mode, ReadStream::combine_distance, fb(), get_per_buffer_data(), InvalidBlockNumber, ReadStream::ios_in_progress, ReadStream::max_ios, ReadStream::next_buffer_index, ReadStream::pending_read_blocknum, ReadStream::pending_read_nblocks, pgaio_enter_batchmode(), pgaio_exit_batchmode(), ReadStream::pinned_buffers, ReadStream::queue_size, read_stream_get_block(), read_stream_should_issue_now(), read_stream_should_look_ahead(), read_stream_start_pending_read(), read_stream_unget_block(), and ReadStream::readahead_distance.

Referenced by read_stream_next_buffer().

◆ read_stream_next_block()

BlockNumber read_stream_next_block ( ReadStream * stream,
BufferAccessStrategy * strategy 
)

Definition at line 1377 of file read_stream.c.

1378{
1379 *strategy = stream->ios[0].op.strategy;
1380 return read_stream_get_block(stream, NULL);
1381}

References fb(), ReadStream::ios, InProgressIO::op, read_stream_get_block(), and ReadBuffersOperation::strategy.

◆ read_stream_next_buffer()

Buffer read_stream_next_buffer ( ReadStream * stream,
void **  per_buffer_data 
)

Definition at line 1030 of file read_stream.c.

1031{
1032 Buffer buffer;
1033 int16 oldest_buffer_index;
1034
1035#ifndef READ_STREAM_DISABLE_FAST_PATH
1036
1037 /*
1038 * A fast path for all-cached scans. This is the same as the usual
1039 * algorithm, but it is specialized for no I/O and no per-buffer data, so
1040 * we can skip the queue management code, stay in the same buffer slot and
1041 * use singular StartReadBuffer().
1042 */
1043 if (likely(stream->fast_path))
1044 {
1046
1047 /* Fast path assumptions. */
1048 Assert(stream->ios_in_progress == 0);
1049 Assert(stream->forwarded_buffers == 0);
1050 Assert(stream->pinned_buffers == 1);
1051 Assert(stream->readahead_distance == 1);
1052 Assert(stream->combine_distance == 1);
1053 Assert(stream->pending_read_nblocks == 0);
1054 Assert(stream->per_buffer_data_size == 0);
1056
1057 /* We're going to return the buffer we pinned last time. */
1058 oldest_buffer_index = stream->oldest_buffer_index;
1059 Assert((oldest_buffer_index + 1) % stream->queue_size ==
1060 stream->next_buffer_index);
1061 buffer = stream->buffers[oldest_buffer_index];
1062 Assert(buffer != InvalidBuffer);
1063
1064 /* Choose the next block to pin. */
1066
1068 {
1069 int flags = stream->read_buffers_flags;
1070
1071 if (stream->advice_enabled)
1073
1074 /*
1075 * While in fast-path, execute any IO that we might encounter
1076 * synchronously. Because we are, right now, only looking one
1077 * block ahead, dispatching any occasional IO to workers would
1078 * have the overhead of dispatching to workers, without any
1079 * realistic chance of the IO completing before we need it. We
1080 * will switch to non-synchronous IO after this.
1081 *
1082 * Arguably we should do so only for worker, as there's far less
1083 * dispatch overhead with io_uring. However, tests so far have not
1084 * shown a clear downside and additional io_method awareness here
1085 * seems not great from an abstraction POV.
1086 */
1088
1089 /*
1090 * Pin a buffer for the next call. Same buffer entry, and
1091 * arbitrary I/O entry (they're all free). We don't have to
1092 * adjust pinned_buffers because we're transferring one to caller
1093 * but pinning one more.
1094 *
1095 * In the fast path we don't need to check the pin limit. We're
1096 * always allowed at least one pin so that progress can be made,
1097 * and that's all we need here. Although two pins are momentarily
1098 * held at the same time, the model used here is that the stream
1099 * holds only one, and the other now belongs to the caller.
1100 */
1101 if (likely(!StartReadBuffer(&stream->ios[0].op,
1102 &stream->buffers[oldest_buffer_index],
1104 flags)))
1105 {
1106 /* Fast return. */
1108 return buffer;
1109 }
1110
1111 /* Next call must wait for I/O for the newly pinned buffer. */
1112 stream->oldest_io_index = 0;
1113 stream->next_io_index = stream->max_ios > 1 ? 1 : 0;
1114 stream->ios_in_progress = 1;
1115 stream->ios[0].buffer_index = oldest_buffer_index;
1116 stream->seq_blocknum = next_blocknum + 1;
1117
1118 /*
1119 * XXX: It might be worth triggering additional read-ahead here,
1120 * to avoid having to effectively do another synchronous IO for
1121 * the next block (if it were also a miss).
1122 */
1123
1124 /* update I/O stats */
1125 read_stream_count_io(stream, 1, stream->ios_in_progress);
1126
1127 /* update prefetch distance */
1129 }
1130 else
1131 {
1132 /* No more blocks, end of stream. */
1133 stream->readahead_distance = 0;
1134 stream->combine_distance = 0;
1135 stream->oldest_buffer_index = stream->next_buffer_index;
1136 stream->pinned_buffers = 0;
1137 stream->buffers[oldest_buffer_index] = InvalidBuffer;
1138 }
1139
1140 stream->fast_path = false;
1141 return buffer;
1142 }
1143#endif
1144
1145 if (unlikely(stream->pinned_buffers == 0))
1146 {
1147 Assert(stream->oldest_buffer_index == stream->next_buffer_index);
1148
1149 /* End of stream reached? */
1150 if (stream->readahead_distance == 0)
1151 return InvalidBuffer;
1152
1153 /*
1154 * The usual order of operations is that we look ahead at the bottom
1155 * of this function after potentially finishing an I/O and making
1156 * space for more, but if we're just starting up we'll need to crank
1157 * the handle to get started.
1158 */
1159 read_stream_look_ahead(stream);
1160
1161 /* End of stream reached? */
1162 if (stream->pinned_buffers == 0)
1163 {
1164 Assert(stream->readahead_distance == 0);
1165 return InvalidBuffer;
1166 }
1167 }
1168
1169 /* Grab the oldest pinned buffer and associated per-buffer data. */
1170 Assert(stream->pinned_buffers > 0);
1171 oldest_buffer_index = stream->oldest_buffer_index;
1172 Assert(oldest_buffer_index >= 0 &&
1174 buffer = stream->buffers[oldest_buffer_index];
1175 if (per_buffer_data)
1176 *per_buffer_data = get_per_buffer_data(stream, oldest_buffer_index);
1177
1178 Assert(BufferIsValid(buffer));
1179
1180 /* Do we have to wait for an associated I/O first? */
1181 if (stream->ios_in_progress > 0 &&
1182 stream->ios[stream->oldest_io_index].buffer_index == oldest_buffer_index)
1183 {
1184 int16 io_index = stream->oldest_io_index;
1185 bool needed_wait;
1186
1187 /* Sanity check that we still agree on the buffers. */
1188 Assert(stream->ios[io_index].op.buffers ==
1189 &stream->buffers[oldest_buffer_index]);
1190
1192
1193 Assert(stream->ios_in_progress > 0);
1194 stream->ios_in_progress--;
1195 if (++stream->oldest_io_index == stream->max_ios)
1196 stream->oldest_io_index = 0;
1197
1198 /*
1199 * If the IO was executed synchronously, we will never see
1200 * WaitReadBuffers() block. Treat it as if it did block. This is
1201 * particularly crucial when effective_io_concurrency=0 is used, as
1202 * all IO will be synchronous. Without treating synchronous IO as
1203 * having waited, we'd never allow the distance to get large enough to
1204 * allow for IO combining, resulting in bad performance.
1205 */
1207 needed_wait = true;
1208
1209 /* Count it as a wait if we need to wait for IO */
1210 if (needed_wait)
1211 read_stream_count_wait(stream);
1212
1213 /*
1214 * Have the read-ahead distance ramp up rapidly after we needed to
1215 * wait for IO. We only increase the read-ahead-distance when we
1216 * needed to wait, to avoid increasing the distance further than
1217 * necessary, as looking ahead too far can be costly, both due to the
1218 * cost of unnecessarily pinning many buffers and due to doing IOs
1219 * that may never be consumed if the stream is ended/reset before
1220 * completion.
1221 *
1222 * If we did not need to wait, the current distance was evidently
1223 * sufficient.
1224 *
1225 * NB: Must not increase the distance if we already reached the end of
1226 * the stream, as stream->readahead_distance == 0 is used to keep
1227 * track of having reached the end.
1228 */
1229 if (stream->readahead_distance > 0 && needed_wait)
1230 {
1231 /* wider temporary value, due to overflow risk */
1232 int32 readahead_distance;
1233
1234 readahead_distance = stream->readahead_distance * 2;
1235 readahead_distance = Min(readahead_distance, stream->max_pinned_buffers);
1236 stream->readahead_distance = readahead_distance;
1237 }
1238
1239 /*
1240 * As we needed IO, prevent distances from being reduced within our
1241 * maximum look-ahead window. This avoids collapsing distances too
1242 * quickly in workloads where most of the required blocks are cached,
1243 * but where the remaining IOs are a sufficient enough factor to cause
1244 * a substantial slowdown if executed synchronously.
1245 *
1246 * There are valid arguments for preventing decay for max_ios or for
1247 * max_pinned_buffers. But the argument for max_pinned_buffers seems
1248 * clearer - if we can't see any misses within the maximum look-ahead
1249 * distance, we can't do any useful read-ahead.
1250 */
1252
1253 /*
1254 * Whether we needed to wait or not, allow for more IO combining if we
1255 * needed to do IO. The reason to do so independent of needing to wait
1256 * is that when the data is resident in the kernel page cache, IO
1257 * combining reduces the syscall / dispatch overhead, making it
1258 * worthwhile regardless of needing to wait.
1259 *
1260 * It is also important with io_uring as it will never signal the need
1261 * to wait for reads if all the data is in the page cache. There are
1262 * heuristics to deal with that in method_io_uring.c, but they only
1263 * work when the IO gets large enough.
1264 */
1265 if (stream->combine_distance > 0 &&
1266 stream->combine_distance < stream->io_combine_limit)
1267 {
1268 /* wider temporary value, due to overflow risk */
1269 int32 combine_distance;
1270
1271 combine_distance = stream->combine_distance * 2;
1272 combine_distance = Min(combine_distance, stream->io_combine_limit);
1273 combine_distance = Min(combine_distance, stream->max_pinned_buffers);
1274 stream->combine_distance = combine_distance;
1275 }
1276
1277 /*
1278 * If we've reached the first block of a sequential region we're
1279 * issuing advice for, cancel that until the next jump. The kernel
1280 * will see the sequential preadv() pattern starting here.
1281 */
1282 if (stream->advice_enabled &&
1283 stream->ios[io_index].op.blocknum == stream->seq_until_processed)
1285 }
1286
1287 /*
1288 * We must zap this queue entry, or else it would appear as a forwarded
1289 * buffer. If it's potentially in the overflow zone (ie from a
1290 * multi-block I/O that wrapped around the queue), also zap the copy.
1291 */
1292 stream->buffers[oldest_buffer_index] = InvalidBuffer;
1294 stream->buffers[stream->queue_size + oldest_buffer_index] =
1296
1297#if defined(CLOBBER_FREED_MEMORY) || defined(USE_VALGRIND)
1298
1299 /*
1300 * The caller will get access to the per-buffer data, until the next call.
1301 * We wipe the one before, which is never occupied because queue_size
1302 * allowed one extra element. This will hopefully trip up client code
1303 * that is holding a dangling pointer to it.
1304 */
1305 if (stream->per_buffer_data)
1306 {
1307 void *per_buffer_data;
1308
1309 per_buffer_data = get_per_buffer_data(stream,
1310 oldest_buffer_index == 0 ?
1311 stream->queue_size - 1 :
1312 oldest_buffer_index - 1);
1313
1314#if defined(CLOBBER_FREED_MEMORY)
1315 /* This also tells Valgrind the memory is "noaccess". */
1316 wipe_mem(per_buffer_data, stream->per_buffer_data_size);
1317#elif defined(USE_VALGRIND)
1318 /* Tell it ourselves. */
1319 VALGRIND_MAKE_MEM_NOACCESS(per_buffer_data,
1320 stream->per_buffer_data_size);
1321#endif
1322 }
1323#endif
1324
1326
1327 /* Pin transferred to caller. */
1328 Assert(stream->pinned_buffers > 0);
1329 stream->pinned_buffers--;
1330
1331 /* Advance oldest buffer, with wrap-around. */
1332 stream->oldest_buffer_index++;
1333 if (stream->oldest_buffer_index == stream->queue_size)
1334 stream->oldest_buffer_index = 0;
1335
1336 /* Prepare for the next call. */
1337 read_stream_look_ahead(stream);
1338
1339#ifndef READ_STREAM_DISABLE_FAST_PATH
1340 /* See if we can take the fast path for all-cached scans next time. */
1341 if (stream->ios_in_progress == 0 &&
1342 stream->forwarded_buffers == 0 &&
1343 stream->pinned_buffers == 1 &&
1344 stream->readahead_distance == 1 &&
1345 stream->combine_distance == 1 &&
1346 stream->pending_read_nblocks == 0 &&
1347 stream->per_buffer_data_size == 0)
1348 {
1349 /*
1350 * The fast path spins on one buffer entry repeatedly instead of
1351 * rotating through the whole queue and clearing the entries behind
1352 * it. If the buffer it starts with happened to be forwarded between
1353 * StartReadBuffers() calls and also wrapped around the circular queue
1354 * partway through, then a copy also exists in the overflow zone, and
1355 * it won't clear it out as the regular path would. Do that now, so
1356 * it doesn't need code for that.
1357 */
1358 if (stream->oldest_buffer_index < stream->io_combine_limit - 1)
1359 stream->buffers[stream->queue_size + stream->oldest_buffer_index] =
1361
1362 stream->fast_path = true;
1363 }
1364#endif
1365
1366 return buffer;
1367}
#define InvalidBuffer
Definition buf.h:25
bool WaitReadBuffers(ReadBuffersOperation *operation)
Definition bufmgr.c:1759
bool StartReadBuffer(ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
Definition bufmgr.c:1637
#define READ_BUFFERS_ISSUE_ADVICE
Definition bufmgr.h:124
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:419
#define likely(x)
Definition c.h:437
int32_t int32
Definition c.h:620
#define unlikely(x)
Definition c.h:438
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition memdebug.h:27
static void read_stream_look_ahead(ReadStream *stream)
static void read_stream_count_wait(ReadStream *stream)
static void read_stream_count_io(ReadStream *stream, int nblocks, int in_progress)
static void read_stream_count_prefetch(ReadStream *stream)
int16 buffer_index
Definition read_stream.c:88
BlockNumber blocknum
Definition bufmgr.h:146
int16 oldest_buffer_index
int16 oldest_io_index
int16 initialized_buffers
int16 forwarded_buffers
int16 next_io_index

References ReadStream::advice_enabled, Assert, ReadBuffersOperation::blocknum, InProgressIO::buffer_index, BufferIsValid(), ReadStream::buffers, ReadBuffersOperation::buffers, ReadStream::combine_distance, ReadStream::distance_decay_holdoff, ReadStream::fast_path, fb(), ReadBuffersOperation::flags, ReadStream::forwarded_buffers, get_per_buffer_data(), ReadStream::initialized_buffers, InvalidBlockNumber, InvalidBuffer, ReadStream::io_combine_limit, io_combine_limit, ReadStream::ios, ReadStream::ios_in_progress, likely, ReadStream::max_ios, ReadStream::max_pinned_buffers, Min, ReadStream::next_buffer_index, ReadStream::next_io_index, ReadStream::oldest_buffer_index, ReadStream::oldest_io_index, InProgressIO::op, ReadStream::pending_read_nblocks, ReadStream::per_buffer_data, ReadStream::per_buffer_data_size, ReadStream::pinned_buffers, ReadStream::queue_size, ReadStream::read_buffers_flags, READ_BUFFERS_ISSUE_ADVICE, READ_BUFFERS_SYNCHRONOUSLY, read_stream_count_io(), read_stream_count_prefetch(), read_stream_count_wait(), read_stream_get_block(), read_stream_look_ahead(), ReadStream::readahead_distance, ReadStream::seq_blocknum, ReadStream::seq_until_processed, StartReadBuffer(), unlikely, VALGRIND_MAKE_MEM_NOACCESS, and WaitReadBuffers().

Referenced by autoprewarm_database_main(), BitmapHeapScanNextBlock(), blbulkdelete(), blgetbitmap(), blvacuumcleanup(), brin_vacuum_scan(), btvacuumscan(), collect_corrupt_items(), collect_visibility_data(), ginvacuumcleanup(), gistvacuumscan(), hashbulkdelete(), heap_fetch_next_buffer(), heapam_scan_analyze_next_block(), lazy_scan_heap(), lazy_vacuum_heap_rel(), pg_prewarm(), pgstathashindex(), pgstatindex_impl(), read_stream_for_blocks(), read_stream_reset(), RelationCopyStorageUsingBuffer(), spgvacuumscan(), statapprox_heap(), and verify_heapam().

◆ read_stream_pause()

◆ read_stream_reset()

void read_stream_reset ( ReadStream * stream)

Definition at line 1417 of file read_stream.c.

1418{
1419 int16 index;
1420 Buffer buffer;
1421
1422 /* Stop looking ahead. */
1423 stream->readahead_distance = 0;
1424 stream->combine_distance = 0;
1425
1426 /* Forget buffered block number and fast path state. */
1428 stream->fast_path = false;
1429
1430 /* Unpin anything that wasn't consumed. */
1431 while ((buffer = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
1432 ReleaseBuffer(buffer);
1433
1434 /* Unpin any unused forwarded buffers. */
1435 index = stream->next_buffer_index;
1436 while (index < stream->initialized_buffers &&
1437 (buffer = stream->buffers[index]) != InvalidBuffer)
1438 {
1439 Assert(stream->forwarded_buffers > 0);
1440 stream->forwarded_buffers--;
1441 ReleaseBuffer(buffer);
1442
1443 stream->buffers[index] = InvalidBuffer;
1445 stream->buffers[stream->queue_size + index] = InvalidBuffer;
1446
1447 if (++index == stream->queue_size)
1448 index = 0;
1449 }
1450
1451 Assert(stream->forwarded_buffers == 0);
1452 Assert(stream->pinned_buffers == 0);
1453 Assert(stream->ios_in_progress == 0);
1454
1455 /* Start off assuming data is cached. */
1456 stream->readahead_distance = 1;
1457 stream->combine_distance = 1;
1459 stream->resume_combine_distance = stream->combine_distance;
1460 stream->distance_decay_holdoff = 0;
1461}
void ReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5595
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
Definition type.h:97

References Assert, ReadStream::buffered_blocknum, ReadStream::buffers, ReadStream::combine_distance, ReadStream::distance_decay_holdoff, ReadStream::fast_path, fb(), ReadStream::forwarded_buffers, InvalidBlockNumber, InvalidBuffer, io_combine_limit, ReadStream::ios_in_progress, ReadStream::next_buffer_index, ReadStream::pinned_buffers, ReadStream::queue_size, read_stream_next_buffer(), ReadStream::readahead_distance, ReleaseBuffer(), ReadStream::resume_combine_distance, and ReadStream::resume_readahead_distance.

Referenced by btvacuumscan(), gistvacuumscan(), hashbulkdelete(), heap_fetch_next_buffer(), heap_rescan(), read_stream_end(), and spgvacuumscan().

◆ read_stream_resume()

◆ read_stream_should_issue_now()

static bool read_stream_should_issue_now ( ReadStream * stream)
inline static

Definition at line 619 of file read_stream.c.

620{
621 int16 pending_read_nblocks = stream->pending_read_nblocks;
622
623 /* there is no pending IO that could be issued */
624 if (pending_read_nblocks == 0)
625 return false;
626
627 /* never start more IOs than our cap */
628 if (stream->ios_in_progress >= stream->max_ios)
629 return false;
630
631 /*
632 * If the callback has signaled end-of-stream, start the pending read
633 * immediately. There is no further potential for IO combining.
634 */
635 if (stream->readahead_distance == 0)
636 return true;
637
638 /*
639 * If we've already reached combine_distance, there's no chance of growing
640 * the read further.
641 */
642 if (pending_read_nblocks >= stream->combine_distance)
643 return true;
644
645 /*
646 * If we currently have no reads in flight or prepared, issue the IO once
647 * we are not looking ahead further. This ensures there's always at least
648 * one IO prepared.
649 */
650 if (stream->pinned_buffers == 0 &&
652 return true;
653
654 return false;
655}

References ReadStream::combine_distance, ReadStream::ios_in_progress, ReadStream::max_ios, ReadStream::pending_read_nblocks, ReadStream::pinned_buffers, read_stream_should_look_ahead(), and ReadStream::readahead_distance.

Referenced by read_stream_look_ahead().

◆ read_stream_should_look_ahead()

static bool read_stream_should_look_ahead ( ReadStream * stream)
inline static

Definition at line 554 of file read_stream.c.

555{
556 /* If the callback has signaled end-of-stream, we're done */
557 if (stream->readahead_distance == 0)
558 return false;
559
560 /* never start more IOs than our cap */
561 if (stream->ios_in_progress >= stream->max_ios)
562 return false;
563
564 /*
565 * Allow looking further ahead if we are in the process of building a
566 * larger IO, the IO is not yet big enough, and we don't yet have IO in
567 * flight.
568 *
569 * We do so to allow building larger reads when readahead_distance is
570 * small (e.g. because the I/O subsystem is keeping up or
571 * effective_io_concurrency is small). That's a useful goal because larger
572 * reads are more CPU efficient than smaller reads, even if the system is
573 * not IO bound.
574 *
575 * The reason we do *not* do so when we already have a read prepared (i.e.
576 * why we check for pinned_buffers == 0) is once we are actually reading
577 * ahead, we don't need it:
578 *
579 * - We won't issue unnecessarily small reads as
580 * read_stream_should_issue_now() will return false until the IO is
581 * suitably sized. The issuance of the pending read will be delayed until
582 * enough buffers have been consumed.
583 *
584 * - If we are not reading ahead aggressively enough, future
585 * WaitReadBuffers() calls will return true, leading to readahead_distance
586 * being increased. After that more full-sized IOs can be issued.
587 *
588 * Furthermore, if we did not have the pinned_buffers == 0 condition, we
589 * might end up issuing I/O more aggressively than we need.
590 *
591 * Note that a return of true here can lead to exceeding the read-ahead
592 * limit, but we won't exceed the buffer pin limit (because pinned_buffers
593 * == 0 and combine_distance is capped by max_pinned_buffers).
594 */
595 if (stream->pending_read_nblocks > 0 &&
596 stream->pinned_buffers == 0 &&
597 stream->pending_read_nblocks < stream->combine_distance)
598 return true;
599
600 /*
601 * Don't start more read-ahead if that'd put us over the distance limit
602 * for doing read-ahead. As stream->readahead_distance is capped by
603 * max_pinned_buffers, this prevents us from looking ahead so far that it
604 * would put us over the pin limit.
605 */
606 if (stream->pinned_buffers + stream->pending_read_nblocks >= stream->readahead_distance)
607 return false;
608
609 return true;
610}

References ReadStream::combine_distance, ReadStream::ios_in_progress, ReadStream::max_ios, ReadStream::pending_read_nblocks, ReadStream::pinned_buffers, and ReadStream::readahead_distance.

Referenced by read_stream_look_ahead(), and read_stream_should_issue_now().

◆ read_stream_start_pending_read()

static bool read_stream_start_pending_read ( ReadStream * stream)
static

Definition at line 318 of file read_stream.c.

319{
320 bool need_wait;
322 int nblocks;
323 int flags;
324 int forwarded;
326 int16 overflow;
327 int16 buffer_index;
328 int buffer_limit;
329
330 /* This should only be called with a pending read. */
331 Assert(stream->pending_read_nblocks > 0);
332 Assert(stream->pending_read_nblocks <= stream->io_combine_limit);
333
334 /* We had better not exceed the per-stream buffer limit with this read. */
335 Assert(stream->pinned_buffers + stream->pending_read_nblocks <=
336 stream->max_pinned_buffers);
337
338#ifdef USE_ASSERT_CHECKING
339 /* We had better not be overwriting an existing pinned buffer. */
340 if (stream->pinned_buffers > 0)
341 Assert(stream->next_buffer_index != stream->oldest_buffer_index);
342 else
343 Assert(stream->next_buffer_index == stream->oldest_buffer_index);
344
345 /*
346 * Pinned buffers forwarded by a preceding StartReadBuffers() call that
347 * had to split the operation should match the leading blocks of this
348 * following StartReadBuffers() call.
349 */
351 for (int i = 0; i < stream->forwarded_buffers; ++i)
353 stream->pending_read_blocknum + i);
354
355 /*
356 * Check that we've cleared the queue/overflow entries corresponding to
357 * the rest of the blocks covered by this read, unless it's the first go
358 * around and we haven't even initialized them yet.
359 */
360 for (int i = stream->forwarded_buffers; i < stream->pending_read_nblocks; ++i)
361 Assert(stream->next_buffer_index + i >= stream->initialized_buffers ||
362 stream->buffers[stream->next_buffer_index + i] == InvalidBuffer);
363#endif
364
365 /* Do we need to issue read-ahead advice? */
366 flags = stream->read_buffers_flags;
367 if (stream->advice_enabled)
368 {
369 if (stream->pending_read_blocknum == stream->seq_blocknum)
370 {
371 /*
372 * Sequential: Issue advice until the preadv() calls have caught
373 * up with the first advice issued for this sequential region, and
374 * then stay out of the way of the kernel's own read-ahead.
375 */
378 }
379 else
380 {
381 /*
382 * Random jump: Note the starting location of a new potential
383 * sequential region and start issuing advice. Skip it this time
384 * if the preadv() follows immediately, eg first block in stream.
385 */
387 if (stream->pinned_buffers > 0)
389 }
390 }
391
392 /*
393 * How many more buffers is this backend allowed?
394 *
395 * Forwarded buffers are already pinned and map to the leading blocks of
396 * the pending read (the remaining portion of an earlier short read that
397 * we're about to continue). They are not counted in pinned_buffers, but
398 * they are counted as pins already held by this backend according to the
399 * buffer manager, so they must be added to the limit it grants us.
400 */
401 if (stream->temporary)
403 else
406
409
410 if (buffer_limit == 0 && stream->pinned_buffers == 0)
411 buffer_limit = 1; /* guarantee progress */
412
413 /* Does the per-backend limit affect this read? */
414 nblocks = stream->pending_read_nblocks;
415 if (buffer_limit < nblocks)
416 {
418
419 /* Shrink distance: no more look-ahead until buffers are released. */
421 if (stream->readahead_distance > new_distance)
423
424 /* Unless we have nothing to give the consumer, stop here. */
425 if (stream->pinned_buffers > 0)
426 return false;
427
428 /* A short read is required to make progress. */
429 nblocks = buffer_limit;
430 }
431
432 /*
433 * We say how many blocks we want to read, but it may be smaller on return
434 * if the buffer manager decides to shorten the read. Initialize buffers
435 * to InvalidBuffer (= not a forwarded buffer) as input on first use only,
436 * and keep the original nblocks number so we can check for forwarded
437 * buffers as output, below.
438 */
439 buffer_index = stream->next_buffer_index;
440 io_index = stream->next_io_index;
441 while (stream->initialized_buffers < buffer_index + nblocks)
442 stream->buffers[stream->initialized_buffers++] = InvalidBuffer;
443 requested_nblocks = nblocks;
445 &stream->buffers[buffer_index],
446 stream->pending_read_blocknum,
447 &nblocks,
448 flags);
449 stream->pinned_buffers += nblocks;
450
451 /* Remember whether we need to wait before returning this buffer. */
452 if (!need_wait)
453 {
454 /*
455 * If there currently is no IO in progress, and we have not needed to
456 * issue IO recently, decay the look-ahead distance. We detect if we
457 * had to issue IO recently by having a decay holdoff that's set to
458 * the max look-ahead distance whenever we need to do IO. This is
459 * important to ensure we eventually reach a high enough distance to
460 * perform IO asynchronously when starting out with a small look-ahead
461 * distance.
462 */
463 if (stream->ios_in_progress == 0)
464 {
465 if (stream->distance_decay_holdoff > 0)
466 stream->distance_decay_holdoff--;
467 else
468 {
469 if (stream->readahead_distance > 1)
470 stream->readahead_distance--;
471
472 /*
473 * For now we reduce the IO combine distance after
474 * sufficiently many buffer hits. There is no clear
475 * performance argument for doing so, but at the moment we
476 * need to do so to make the entrance into fast_path work
477 * correctly: We require combine_distance == 1 to enter
478 * fast-path, as without that condition we would wrongly
479 * re-enter fast-path when readahead_distance == 1 and
480 * pinned_buffers == 1, as we would not yet have prepared
481 * another IO in that situation.
482 */
483 if (stream->combine_distance > 1)
484 stream->combine_distance--;
485 }
486 }
487 }
488 else
489 {
490 /*
491 * Remember to call WaitReadBuffers() before returning head buffer.
492 * Look-ahead distance will be adjusted after waiting.
493 */
494 stream->ios[io_index].buffer_index = buffer_index;
495 if (++stream->next_io_index == stream->max_ios)
496 stream->next_io_index = 0;
497 Assert(stream->ios_in_progress < stream->max_ios);
498 stream->ios_in_progress++;
499 stream->seq_blocknum = stream->pending_read_blocknum + nblocks;
500
501 /* update I/O stats */
502 read_stream_count_io(stream, nblocks, stream->ios_in_progress);
503 }
504
505 /*
506 * How many pins were acquired but forwarded to the next call? These need
507 * to be passed to the next StartReadBuffers() call by leaving them
508 * exactly where they are in the queue, or released if the stream ends
509 * early. We need the number for accounting purposes, since they are not
510 * counted in stream->pinned_buffers but we already hold them.
511 */
512 forwarded = 0;
513 while (nblocks + forwarded < requested_nblocks &&
514 stream->buffers[buffer_index + nblocks + forwarded] != InvalidBuffer)
515 forwarded++;
517
518 /*
519 * We gave a contiguous range of buffer space to StartReadBuffers(), but
520 * we want it to wrap around at queue_size. Copy overflowing buffers to
521 * the front of the array where they'll be consumed, but also leave a copy
522 * in the overflow zone which the I/O operation has a pointer to (it needs
523 * a contiguous array). Both copies will be cleared when the buffers are
524 * handed to the consumer.
525 */
526 overflow = (buffer_index + nblocks + forwarded) - stream->queue_size;
527 if (overflow > 0)
528 {
529 Assert(overflow < stream->queue_size); /* can't overlap */
530 memcpy(&stream->buffers[0],
531 &stream->buffers[stream->queue_size],
532 sizeof(stream->buffers[0]) * overflow);
533 }
534
535 /* Compute location of start of next read, without using % operator. */
536 buffer_index += nblocks;
537 if (buffer_index >= stream->queue_size)
538 buffer_index -= stream->queue_size;
539 Assert(buffer_index >= 0 && buffer_index < stream->queue_size);
540 stream->next_buffer_index = buffer_index;
541
542 /* Adjust the pending read to cover the remaining portion, if any. */
543 stream->pending_read_blocknum += nblocks;
544 stream->pending_read_nblocks -= nblocks;
545
546 return true;
547}
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4455
bool StartReadBuffers(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
Definition bufmgr.c:1618
uint32 GetAdditionalPinLimit(void)
Definition bufmgr.c:2707
memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets))
uint32 GetAdditionalLocalPinLimit(void)
Definition localbuf.c:319

References ReadStream::advice_enabled, Assert, InProgressIO::buffer_index, BufferGetBlockNumber(), ReadStream::buffers, ReadStream::combine_distance, ReadStream::distance_decay_holdoff, fb(), ReadStream::forwarded_buffers, GetAdditionalLocalPinLimit(), GetAdditionalPinLimit(), i, ReadStream::initialized_buffers, InvalidBlockNumber, InvalidBuffer, ReadStream::io_combine_limit, ReadStream::ios, ReadStream::ios_in_progress, ReadStream::max_ios, ReadStream::max_pinned_buffers, memcpy(), Min, ReadStream::next_buffer_index, ReadStream::next_io_index, ReadStream::oldest_buffer_index, InProgressIO::op, ReadStream::pending_read_blocknum, ReadStream::pending_read_nblocks, PG_INT16_MAX, ReadStream::pinned_buffers, ReadStream::queue_size, ReadStream::read_buffers_flags, READ_BUFFERS_ISSUE_ADVICE, read_stream_count_io(), ReadStream::readahead_distance, ReadStream::seq_blocknum, ReadStream::seq_until_processed, StartReadBuffers(), and ReadStream::temporary.

Referenced by read_stream_look_ahead().

◆ read_stream_unget_block()

static void read_stream_unget_block ( ReadStream * stream,
BlockNumber  blocknum 
)
inline static

Definition at line 299 of file read_stream.c.

300{
301 /* We shouldn't ever unget more than one block. */
303 Assert(blocknum != InvalidBlockNumber);
304 stream->buffered_blocknum = blocknum;
305}

References Assert, ReadStream::buffered_blocknum, and InvalidBlockNumber.

Referenced by read_stream_look_ahead().