PostgreSQL Source Code git master
pgstat_io.c
Go to the documentation of this file.
1/* -------------------------------------------------------------------------
2 *
3 * pgstat_io.c
4 * Implementation of IO statistics.
5 *
6 * This file contains the implementation of IO statistics. It is kept separate
7 * from pgstat.c to enforce the line between the statistics access / storage
8 * implementation and the details about individual types of statistics.
9 *
10 * Copyright (c) 2021-2025, PostgreSQL Global Development Group
11 *
12 * IDENTIFICATION
13 * src/backend/utils/activity/pgstat_io.c
14 * -------------------------------------------------------------------------
15 */
16
17#include "postgres.h"
18
19#include "executor/instrument.h"
20#include "storage/bufmgr.h"
22
24static bool have_iostats = false;
25
26/*
27 * Check that stats have not been counted for any combination of IOObject,
28 * IOContext, and IOOp which are not tracked for the passed-in BackendType. If
29 * stats are tracked for this combination and IO times are non-zero, counts
30 * should be non-zero.
31 *
32 * The passed-in PgStat_BktypeIO must contain stats from the BackendType
33 * specified by the second parameter. Caller is responsible for locking the
34 * passed-in PgStat_BktypeIO, if needed.
35 */
36bool
38 BackendType bktype)
39{
40 for (int io_object = 0; io_object < IOOBJECT_NUM_TYPES; io_object++)
41 {
42 for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
43 {
44 for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
45 {
46 /* we do track it */
47 if (pgstat_tracks_io_op(bktype, io_object, io_context, io_op))
48 {
49 /* ensure that if IO times are non-zero, counts are > 0 */
50 if (backend_io->times[io_object][io_context][io_op] != 0 &&
51 backend_io->counts[io_object][io_context][io_op] <= 0)
52 return false;
53
54 continue;
55 }
56
57 /* we don't track it, and it is not 0 */
58 if (backend_io->counts[io_object][io_context][io_op] != 0)
59 return false;
60 }
61 }
62 }
63
64 return true;
65}
66
67void
68pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op,
69 uint32 cnt, uint64 bytes)
70{
71 Assert((unsigned int) io_object < IOOBJECT_NUM_TYPES);
72 Assert((unsigned int) io_context < IOCONTEXT_NUM_TYPES);
73 Assert(pgstat_is_ioop_tracked_in_bytes(io_op) || bytes == 0);
74 Assert(pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op));
75
76 PendingIOStats.counts[io_object][io_context][io_op] += cnt;
77 PendingIOStats.bytes[io_object][io_context][io_op] += bytes;
78
79 /* Add the per-backend counts */
80 pgstat_count_backend_io_op(io_object, io_context, io_op, cnt, bytes);
81
82 have_iostats = true;
83}
84
85/*
86 * Initialize the internal timing for an IO operation, depending on an
87 * IO timing GUC.
88 */
90pgstat_prepare_io_time(bool track_io_guc)
91{
92 instr_time io_start;
93
94 if (track_io_guc)
95 INSTR_TIME_SET_CURRENT(io_start);
96 else
97 {
98 /*
99 * There is no need to set io_start when an IO timing GUC is disabled,
100 * still initialize it to zero to avoid compiler warnings.
101 */
102 INSTR_TIME_SET_ZERO(io_start);
103 }
104
105 return io_start;
106}
107
108/*
109 * Like pgstat_count_io_op() except it also accumulates time.
110 */
111void
112pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
114{
115 if (track_io_timing)
116 {
117 instr_time io_time;
118
119 INSTR_TIME_SET_CURRENT(io_time);
121
122 if (io_op == IOOP_WRITE || io_op == IOOP_EXTEND)
123 {
125 if (io_object == IOOBJECT_RELATION)
127 else if (io_object == IOOBJECT_TEMP_RELATION)
129 }
130 else if (io_op == IOOP_READ)
131 {
133 if (io_object == IOOBJECT_RELATION)
135 else if (io_object == IOOBJECT_TEMP_RELATION)
137 }
138
139 INSTR_TIME_ADD(PendingIOStats.pending_times[io_object][io_context][io_op],
140 io_time);
141
142 /* Add the per-backend count */
143 pgstat_count_backend_io_op_time(io_object, io_context, io_op,
144 io_time);
145 }
146
147 pgstat_count_io_op(io_object, io_context, io_op, cnt, bytes);
148}
149
150PgStat_IO *
152{
154
155 return &pgStatLocal.snapshot.io;
156}
157
158/*
159 * Check if there any IO stats waiting for flush.
160 */
161bool
163{
164 return have_iostats;
165}
166
/*
 * Convenience wrapper around pgstat_io_flush_cb() for callers that do not
 * need its result.
 */
void
pgstat_flush_io(bool nowait)
{
	bool		not_flushed;

	/*
	 * pgstat_io_flush_cb() reports whether the flush had to be skipped; this
	 * wrapper intentionally discards that information.
	 */
	not_flushed = pgstat_io_flush_cb(nowait);
	(void) not_flushed;
}
175
176/*
177 * Flush out locally pending IO statistics
178 *
179 * If no stats have been recorded, this function returns false.
180 *
181 * If nowait is true, this function returns true if the lock could not be
182 * acquired. Otherwise, return false.
183 */
184bool
186{
187 LWLock *bktype_lock;
188 PgStat_BktypeIO *bktype_shstats;
189
190 if (!have_iostats)
191 return false;
192
193 bktype_lock = &pgStatLocal.shmem->io.locks[MyBackendType];
194 bktype_shstats =
196
197 if (!nowait)
198 LWLockAcquire(bktype_lock, LW_EXCLUSIVE);
199 else if (!LWLockConditionalAcquire(bktype_lock, LW_EXCLUSIVE))
200 return true;
201
202 for (int io_object = 0; io_object < IOOBJECT_NUM_TYPES; io_object++)
203 {
204 for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
205 {
206 for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
207 {
208 instr_time time;
209
210 bktype_shstats->counts[io_object][io_context][io_op] +=
211 PendingIOStats.counts[io_object][io_context][io_op];
212
213 bktype_shstats->bytes[io_object][io_context][io_op] +=
214 PendingIOStats.bytes[io_object][io_context][io_op];
215
216 time = PendingIOStats.pending_times[io_object][io_context][io_op];
217
218 bktype_shstats->times[io_object][io_context][io_op] +=
220 }
221 }
222 }
223
225
226 LWLockRelease(bktype_lock);
227
228 memset(&PendingIOStats, 0, sizeof(PendingIOStats));
229
230 have_iostats = false;
231
232 return false;
233}
234
235const char *
237{
238 switch (io_context)
239 {
241 return "bulkread";
243 return "bulkwrite";
244 case IOCONTEXT_NORMAL:
245 return "normal";
246 case IOCONTEXT_VACUUM:
247 return "vacuum";
248 }
249
250 elog(ERROR, "unrecognized IOContext value: %d", io_context);
252}
253
254const char *
256{
257 switch (io_object)
258 {
260 return "relation";
262 return "temp relation";
263 }
264
265 elog(ERROR, "unrecognized IOObject value: %d", io_object);
267}
268
269void
271{
272 PgStatShared_IO *stat_shmem = (PgStatShared_IO *) stats;
273
274 for (int i = 0; i < BACKEND_NUM_TYPES; i++)
276}
277
278void
280{
281 for (int i = 0; i < BACKEND_NUM_TYPES; i++)
282 {
283 LWLock *bktype_lock = &pgStatLocal.shmem->io.locks[i];
284 PgStat_BktypeIO *bktype_shstats = &pgStatLocal.shmem->io.stats.stats[i];
285
286 LWLockAcquire(bktype_lock, LW_EXCLUSIVE);
287
288 /*
289 * Use the lock in the first BackendType's PgStat_BktypeIO to protect
290 * the reset timestamp as well.
291 */
292 if (i == 0)
294
295 memset(bktype_shstats, 0, sizeof(*bktype_shstats));
296 LWLockRelease(bktype_lock);
297 }
298}
299
300void
302{
303 for (int i = 0; i < BACKEND_NUM_TYPES; i++)
304 {
305 LWLock *bktype_lock = &pgStatLocal.shmem->io.locks[i];
306 PgStat_BktypeIO *bktype_shstats = &pgStatLocal.shmem->io.stats.stats[i];
307 PgStat_BktypeIO *bktype_snap = &pgStatLocal.snapshot.io.stats[i];
308
309 LWLockAcquire(bktype_lock, LW_SHARED);
310
311 /*
312 * Use the lock in the first BackendType's PgStat_BktypeIO to protect
313 * the reset timestamp as well.
314 */
315 if (i == 0)
318
319 /* using struct assignment due to better type safety */
320 *bktype_snap = *bktype_shstats;
321 LWLockRelease(bktype_lock);
322 }
323}
324
325/*
326* IO statistics are not collected for all BackendTypes.
327*
328* The following BackendTypes do not participate in the cumulative stats
329* subsystem or do not perform IO on which we currently track:
330* - Dead-end backend because it is not connected to shared memory and
331* doesn't do any IO
332* - Syslogger because it is not connected to shared memory
333* - Archiver because most relevant archiving IO is delegated to a
334* specialized command or module
335* - WAL Receiver, WAL Writer, and WAL Summarizer IO are not tracked in
336* pg_stat_io for now
337*
338* Function returns true if BackendType participates in the cumulative stats
339* subsystem for IO and false if it does not.
340*
341* When adding a new BackendType, also consider adding relevant restrictions to
342* pgstat_tracks_io_object() and pgstat_tracks_io_op().
343*/
344bool
346{
347 /*
348 * List every type so that new backend types trigger a warning about
349 * needing to adjust this switch.
350 */
351 switch (bktype)
352 {
353 case B_INVALID:
355 case B_ARCHIVER:
356 case B_LOGGER:
357 case B_WAL_RECEIVER:
358 case B_WAL_WRITER:
359 case B_WAL_SUMMARIZER:
360 return false;
361
363 case B_AUTOVAC_WORKER:
364 case B_BACKEND:
365 case B_BG_WORKER:
366 case B_BG_WRITER:
367 case B_CHECKPOINTER:
370 case B_STARTUP:
371 case B_WAL_SENDER:
372 return true;
373 }
374
375 return false;
376}
377
378/*
379 * Some BackendTypes do not perform IO on certain IOObjects or in certain
380 * IOContexts. Some IOObjects are never operated on in some IOContexts. Check
381 * that the given BackendType is expected to do IO in the given IOContext and
382 * on the given IOObject and that the given IOObject is expected to be operated
383 * on in the given IOContext.
384 */
385bool
387 IOContext io_context)
388{
389 bool no_temp_rel;
390
391 /*
392 * Some BackendTypes should never track IO statistics.
393 */
394 if (!pgstat_tracks_io_bktype(bktype))
395 return false;
396
397 /*
398 * Currently, IO on temporary relations can only occur in the
399 * IOCONTEXT_NORMAL IOContext.
400 */
401 if (io_context != IOCONTEXT_NORMAL &&
402 io_object == IOOBJECT_TEMP_RELATION)
403 return false;
404
405 /*
406 * In core Postgres, only regular backends and WAL Sender processes
407 * executing queries will use local buffers and operate on temporary
408 * relations. Parallel workers will not use local buffers (see
409 * InitLocalBuffers()); however, extensions leveraging background workers
410 * have no such limitation, so track IO on IOOBJECT_TEMP_RELATION for
411 * BackendType B_BG_WORKER.
412 */
413 no_temp_rel = bktype == B_AUTOVAC_LAUNCHER || bktype == B_BG_WRITER ||
414 bktype == B_CHECKPOINTER || bktype == B_AUTOVAC_WORKER ||
415 bktype == B_STANDALONE_BACKEND || bktype == B_STARTUP;
416
417 if (no_temp_rel && io_context == IOCONTEXT_NORMAL &&
418 io_object == IOOBJECT_TEMP_RELATION)
419 return false;
420
421 /*
422 * Some BackendTypes do not currently perform any IO in certain
423 * IOContexts, and, while it may not be inherently incorrect for them to
424 * do so, excluding those rows from the view makes the view easier to use.
425 */
426 if ((bktype == B_CHECKPOINTER || bktype == B_BG_WRITER) &&
427 (io_context == IOCONTEXT_BULKREAD ||
428 io_context == IOCONTEXT_BULKWRITE ||
429 io_context == IOCONTEXT_VACUUM))
430 return false;
431
432 if (bktype == B_AUTOVAC_LAUNCHER && io_context == IOCONTEXT_VACUUM)
433 return false;
434
435 if ((bktype == B_AUTOVAC_WORKER || bktype == B_AUTOVAC_LAUNCHER) &&
436 io_context == IOCONTEXT_BULKWRITE)
437 return false;
438
439 return true;
440}
441
442/*
443 * Some BackendTypes will never do certain IOOps and some IOOps should not
444 * occur in certain IOContexts or on certain IOObjects. Check that the given
445 * IOOp is valid for the given BackendType in the given IOContext and on the
446 * given IOObject. Note that there are currently no cases of an IOOp being
447 * invalid for a particular BackendType only within a certain IOContext and/or
448 * only on a certain IOObject.
449 */
450bool
452 IOContext io_context, IOOp io_op)
453{
454 bool strategy_io_context;
455
456 /* if (io_context, io_object) will never collect stats, we're done */
457 if (!pgstat_tracks_io_object(bktype, io_object, io_context))
458 return false;
459
460 /*
461 * Some BackendTypes will not do certain IOOps.
462 */
463 if ((bktype == B_BG_WRITER || bktype == B_CHECKPOINTER) &&
464 (io_op == IOOP_READ || io_op == IOOP_EVICT || io_op == IOOP_HIT))
465 return false;
466
467 if ((bktype == B_AUTOVAC_LAUNCHER || bktype == B_BG_WRITER ||
468 bktype == B_CHECKPOINTER) && io_op == IOOP_EXTEND)
469 return false;
470
471 /*
472 * Temporary tables are not logged and thus do not require fsync'ing.
473 * Writeback is not requested for temporary tables.
474 */
475 if (io_object == IOOBJECT_TEMP_RELATION &&
476 (io_op == IOOP_FSYNC || io_op == IOOP_WRITEBACK))
477 return false;
478
479 /*
480 * Some IOOps are not valid in certain IOContexts and some IOOps are only
481 * valid in certain contexts.
482 */
483 if (io_context == IOCONTEXT_BULKREAD && io_op == IOOP_EXTEND)
484 return false;
485
486 strategy_io_context = io_context == IOCONTEXT_BULKREAD ||
487 io_context == IOCONTEXT_BULKWRITE || io_context == IOCONTEXT_VACUUM;
488
489 /*
490 * IOOP_REUSE is only relevant when a BufferAccessStrategy is in use.
491 */
492 if (!strategy_io_context && io_op == IOOP_REUSE)
493 return false;
494
495 /*
496 * IOOP_FSYNC IOOps done by a backend using a BufferAccessStrategy are
497 * counted in the IOCONTEXT_NORMAL IOContext. See comment in
498 * register_dirty_segment() for more details.
499 */
500 if (strategy_io_context && io_op == IOOP_FSYNC)
501 return false;
502
503
504 return true;
505}
bool track_io_timing
Definition: bufmgr.c:143
#define Assert(condition)
Definition: c.h:815
uint64_t uint64
Definition: c.h:489
#define pg_unreachable()
Definition: c.h:318
uint32_t uint32
Definition: c.h:488
int64 TimestampTz
Definition: timestamp.h:39
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define INSTR_TIME_SET_CURRENT(t)
Definition: instr_time.h:122
#define INSTR_TIME_ADD(x, y)
Definition: instr_time.h:178
#define INSTR_TIME_SUBTRACT(x, y)
Definition: instr_time.h:181
#define INSTR_TIME_GET_MICROSEC(t)
Definition: instr_time.h:194
#define INSTR_TIME_SET_ZERO(t)
Definition: instr_time.h:172
BufferUsage pgBufferUsage
Definition: instrument.c:20
int i
Definition: isn.c:72
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1168
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1781
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition: lwlock.c:707
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1339
@ LWTRANCHE_PGSTATS_DATA
Definition: lwlock.h:205
@ LW_SHARED
Definition: lwlock.h:115
@ LW_EXCLUSIVE
Definition: lwlock.h:114
#define BACKEND_NUM_TYPES
Definition: miscadmin.h:375
BackendType
Definition: miscadmin.h:337
@ B_WAL_SUMMARIZER
Definition: miscadmin.h:365
@ B_WAL_WRITER
Definition: miscadmin.h:366
@ B_WAL_RECEIVER
Definition: miscadmin.h:364
@ B_CHECKPOINTER
Definition: miscadmin.h:362
@ B_WAL_SENDER
Definition: miscadmin.h:346
@ B_LOGGER
Definition: miscadmin.h:372
@ B_STARTUP
Definition: miscadmin.h:363
@ B_BG_WORKER
Definition: miscadmin.h:345
@ B_INVALID
Definition: miscadmin.h:338
@ B_STANDALONE_BACKEND
Definition: miscadmin.h:349
@ B_BG_WRITER
Definition: miscadmin.h:361
@ B_BACKEND
Definition: miscadmin.h:341
@ B_ARCHIVER
Definition: miscadmin.h:360
@ B_AUTOVAC_LAUNCHER
Definition: miscadmin.h:343
@ B_SLOTSYNC_WORKER
Definition: miscadmin.h:347
@ B_DEAD_END_BACKEND
Definition: miscadmin.h:342
@ B_AUTOVAC_WORKER
Definition: miscadmin.h:344
BackendType MyBackendType
Definition: miscinit.c:64
static time_t start_time
Definition: pg_ctl.c:95
void pgstat_snapshot_fixed(PgStat_Kind kind)
Definition: pgstat.c:1079
PgStat_LocalState pgStatLocal
Definition: pgstat.c:213
#define pgstat_count_buffer_read_time(n)
Definition: pgstat.h:610
IOObject
Definition: pgstat.h:274
@ IOOBJECT_RELATION
Definition: pgstat.h:275
@ IOOBJECT_TEMP_RELATION
Definition: pgstat.h:276
#define pgstat_is_ioop_tracked_in_bytes(io_op)
Definition: pgstat.h:317
#define pgstat_count_buffer_write_time(n)
Definition: pgstat.h:612
#define IOOP_NUM_TYPES
Definition: pgstat.h:315
IOContext
Definition: pgstat.h:282
@ IOCONTEXT_NORMAL
Definition: pgstat.h:285
@ IOCONTEXT_VACUUM
Definition: pgstat.h:286
@ IOCONTEXT_BULKREAD
Definition: pgstat.h:283
@ IOCONTEXT_BULKWRITE
Definition: pgstat.h:284
#define IOCONTEXT_NUM_TYPES
Definition: pgstat.h:289
IOOp
Definition: pgstat.h:301
@ IOOP_EXTEND
Definition: pgstat.h:310
@ IOOP_FSYNC
Definition: pgstat.h:304
@ IOOP_READ
Definition: pgstat.h:311
@ IOOP_WRITEBACK
Definition: pgstat.h:307
@ IOOP_HIT
Definition: pgstat.h:305
@ IOOP_EVICT
Definition: pgstat.h:303
@ IOOP_REUSE
Definition: pgstat.h:306
@ IOOP_WRITE
Definition: pgstat.h:312
#define IOOBJECT_NUM_TYPES
Definition: pgstat.h:279
void pgstat_count_backend_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt, uint64 bytes)
void pgstat_count_backend_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time io_time)
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition: pgstat_io.c:90
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:68
void pgstat_flush_io(bool nowait)
Definition: pgstat_io.c:171
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:112
PgStat_IO * pgstat_fetch_stat_io(void)
Definition: pgstat_io.c:151
const char * pgstat_get_io_context_name(IOContext io_context)
Definition: pgstat_io.c:236
bool pgstat_tracks_io_bktype(BackendType bktype)
Definition: pgstat_io.c:345
const char * pgstat_get_io_object_name(IOObject io_object)
Definition: pgstat_io.c:255
bool pgstat_io_have_pending_cb(void)
Definition: pgstat_io.c:162
bool pgstat_io_flush_cb(bool nowait)
Definition: pgstat_io.c:185
void pgstat_io_reset_all_cb(TimestampTz ts)
Definition: pgstat_io.c:279
bool pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io, BackendType bktype)
Definition: pgstat_io.c:37
static PgStat_PendingIO PendingIOStats
Definition: pgstat_io.c:23
bool pgstat_tracks_io_op(BackendType bktype, IOObject io_object, IOContext io_context, IOOp io_op)
Definition: pgstat_io.c:451
static bool have_iostats
Definition: pgstat_io.c:24
void pgstat_io_snapshot_cb(void)
Definition: pgstat_io.c:301
void pgstat_io_init_shmem_cb(void *stats)
Definition: pgstat_io.c:270
bool pgstat_tracks_io_object(BackendType bktype, IOObject io_object, IOContext io_context)
Definition: pgstat_io.c:386
#define PGSTAT_KIND_IO
Definition: pgstat_kind.h:38
instr_time local_blk_read_time
Definition: instrument.h:38
instr_time shared_blk_read_time
Definition: instrument.h:36
instr_time shared_blk_write_time
Definition: instrument.h:37
instr_time local_blk_write_time
Definition: instrument.h:39
Definition: lwlock.h:42
LWLock locks[BACKEND_NUM_TYPES]
PgStat_Counter times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES]
Definition: pgstat.h:325
uint64 bytes[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES]
Definition: pgstat.h:323
PgStat_Counter counts[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES]
Definition: pgstat.h:324
PgStat_BktypeIO stats[BACKEND_NUM_TYPES]
Definition: pgstat.h:338
TimestampTz stat_reset_timestamp
Definition: pgstat.h:337
PgStat_Snapshot snapshot
PgStat_ShmemControl * shmem
PgStat_Counter counts[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES]
Definition: pgstat.h:331
uint64 bytes[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES]
Definition: pgstat.h:330
instr_time pending_times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES]
Definition: pgstat.h:332
PgStatShared_IO io