PostgreSQL Source Code git master
Loading...
Searching...
No Matches
instr_time.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * instr_time.c
4 * Non-inline parts of the portable high-precision interval timing
5 * implementation
6 *
7 * Portions Copyright (c) 2026, PostgreSQL Global Development Group
8 *
9 *
10 * IDENTIFICATION
11 * src/common/instr_time.c
12 *
13 *-------------------------------------------------------------------------
14 */
15#ifndef FRONTEND
16#include "postgres.h"
17#else
18#include "postgres_fe.h"
19#endif
20
21#include <math.h>
22
23#include "port/pg_cpu.h"
25
26/*
27 * Stores what the number of ticks needs to be multiplied with to end up
28 * with nanoseconds using integer math.
29 *
30 * In certain cases (TSC on x86-64, and QueryPerformanceCounter on Windows)
31 * the ticks to nanoseconds conversion requires floating point math because:
32 *
33 * sec = ticks / frequency_hz
34 * ns = ticks / frequency_hz * 1,000,000,000
35 * ns = ticks * (1,000,000,000 / frequency_hz)
36 * ns = ticks * (1,000,000 / frequency_khz) <-- now in kilohertz
37 *
38 * Here, the scaling factor is usually fractional, hence the floating point
39 * math. E.g. for a 2.5 GHz CPU it becomes 1,000,000 / 2,500,000 = 0.4.
40 *
41 * To be able to use integer math we work around the lack of precision. We
42 * first scale the integer up (left shift by TICKS_TO_NS_SHIFT) and after the
43 * multiplication by the number of ticks in pg_ticks_to_ns() we shift right by
44 * the same amount.
45 *
46 * We remember the maximum number of ticks that can be multiplied by the scale
47 * factor without overflowing so we can check via a * b > max <=> a > max / b.
48 *
49 * However, as this is meant for interval measurements, it is unlikely that the
50 * overflow path is actually taken in typical scenarios, since overflows would
51 * only occur for intervals longer than 6.5 days.
52 *
53 * Note we utilize unsigned integers even though ticks are stored as a signed
54 * value to encourage compilers to generate better assembly, since we can be
55 * sure these values are not negative.
56 *
57 * In all other cases we are using clock_gettime(), which uses nanoseconds
58 * as ticks. Hence, we set the multiplier to zero, which causes pg_ticks_to_ns
59 * to return the original value.
60 */
63bool timing_initialized = false;
65
66bool timing_tsc_enabled = false;
68
69static void set_ticks_per_ns(void);
70static void set_ticks_per_ns_system(void);
71
72#if PG_INSTR_TSC_CLOCK
73static bool tsc_use_by_default(void);
74static void set_ticks_per_ns_for_tsc(void);
75#endif
76
77/*
78 * Initializes timing infrastructure. Must be called before making any use
79 * of INSTR* macros.
80 */
81void
83{
85 return;
86
88 timing_initialized = true;
89}
90
91bool
93{
95
96#if PG_INSTR_TSC_CLOCK
98
99 switch (source)
100 {
103 break;
105 timing_tsc_enabled = false;
106 break;
108 /* Tell caller TSC is not usable */
110 return false;
111 timing_tsc_enabled = true;
112 break;
113 }
114#endif
115
118 return true;
119}
120
121static void
123{
124#if PG_INSTR_TSC_CLOCK
126 {
128 return;
129 }
130#endif
132}
133
134#ifndef WIN32
135
136static void
142
143#else /* WIN32 */
144
145/* GetTimerFrequency returns counts per second */
146static inline double
148{
150
152 return (double) f.QuadPart;
153}
154
155static void
157{
160}
161
162#endif /* WIN32 */
163
164/* TSC specific logic */
165
166#if PG_INSTR_TSC_CLOCK
167
168static void tsc_detect_frequency(void);
169
170/*
171 * Initialize the TSC clock source by determining its usability and frequency.
172 *
173 * This can be called multiple times without causing repeated work, as
174 * timing_tsc_frequency_khz will be set to 0 if a prior call determined the
175 * TSC is not usable. On EXEC_BACKEND (Windows), the TSC frequency may also be
176 * set by restore_backend_variables.
177 */
178void
180{
183}
184
185static void
187{
190}
191
192/*
193 * Detect the TSC frequency and whether RDTSCP is available on x86-64.
194 *
195 * This can't be reliably determined at compile time, since the
196 * availability of an "invariant" TSC (that is not affected by CPU
197 * frequency changes) is dependent on the CPU architecture. Additionally,
198 * there are cases where TSC availability is impacted by virtualization,
199 * where a simple cpuid feature check would not be enough.
200 */
201static void
203{
205
206 /* We require RDTSCP support and an invariant TSC, bail if not available */
208 return;
209
210 /* Determine speed at which the TSC advances */
213 return;
214
215 /*
216 * CPUID did not give us the TSC frequency. We can instead measure the
217 * frequency by comparing ticks against walltime in a calibration loop.
218 */
220}
221
222/*
223 * Decides whether to use the TSC clock source if the user did not specify it
224 * one way or the other, and it is available (checked separately).
225 *
226 * Inspired by the Linux kernel's clocksource watchdog disable logic as updated
227 * in 2021 to reflect the reliability of the TSC on Intel platforms, see
228 * check_system_tsc_reliable() in arch/x86/kernel/tsc.c, as well as discussion
229 * in https://lore.kernel.org/lkml/87eekfk8bd.fsf@nanos.tec.linutronix.de/
230 * and https://lore.kernel.org/lkml/87a6pimt1f.ffs@nanos.tec.linutronix.de/
231 * for reference.
232 *
233 * When tsc_detect_frequency determines the TSC is viable (invariant, etc.), and
234 * we're on an Intel platform (determined via TSC_ADJUST), we consider the TSC
235 * trustworthy by default, matching the Linux kernel.
236 *
237 * On other CPU platforms (e.g. AMD), or in some virtual machines, we don't have
238 * an easy way to determine the TSC's reliability. On Linux, we can check if
239 * TSC is the active clocksource, relying on the kernel having run its watchdog
240 * logic to monitor TSC correctness. On other platforms the user must explicitly
241 * enable it via GUC instead.
242 */
243static bool
245{
247 return true;
248
249#if defined(__linux__)
250 {
251 FILE *fp;
252 char buf[128];
253
254 fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r");
255 if (fp)
256 {
257 bool is_tsc = (fgets(buf, sizeof(buf), fp) != NULL &&
258 strcmp(buf, "tsc\n") == 0);
259
260 fclose(fp);
261 if (is_tsc)
262 return true;
263 }
264 }
265#endif
266
267 return false;
268}
269
270/*
271 * Calibrate the TSC frequency by comparing TSC ticks against walltime.
272 *
273 * Takes initial TSC and system clock snapshots, then loops, recomputing the
274 * frequency every TSC_CALIBRATION_SKIPS iterations from cumulative TSC
275 * ticks divided by elapsed time.
276 *
277 * Once the frequency estimate stabilizes (consecutive estimates agree), we
278 * consider it converged and the frequency in kHz is returned. If the iteration
279 * or time limit is reached without convergence, 0 is returned.
280 */
281#define TSC_CALIBRATION_MAX_NS (50 * NS_PER_MS)
282#define TSC_CALIBRATION_ITERATIONS 1000000
283#define TSC_CALIBRATION_SKIPS 100
284#define TSC_CALIBRATION_STABLE_CYCLES 10
285uint32
287{
290 double freq_khz = 0;
291 double prev_freq_khz = 0;
292 int stable_count = 0;
295
296 /*
297 * Frequency must be initialized to avoid recursion via
298 * pg_set_timing_clock_source.
299 */
301
302 /* Ensure INSTR_* calls below work on system time */
304
306
309
310 for (int i = 0; i < TSC_CALIBRATION_ITERATIONS; i++)
311 {
316
318
319 now_tsc = pg_rdtscp();
320
323
324 /* Safety: bail out if we've taken too long */
326 break;
327
329
330 /*
331 * Skip if TSC hasn't advanced, or we walked backwards for some
332 * reason.
333 */
334 if (now_tsc == prev_tsc || elapsed_ns <= 0 || elapsed_ticks <= 0)
335 continue;
336
337 /*
338 * We only measure frequency every TSC_CALIBRATION_SKIPS to avoid
339 * stabilizing based on just a handful of RDTSC instructions.
340 */
341 if (i % TSC_CALIBRATION_SKIPS != 0)
342 continue;
343
344 freq_khz = ((double) elapsed_ticks / elapsed_ns) * 1000 * 1000;
345
346 /*
347 * Once freq_khz is within 0.01% of prev_freq_khz, check if it stays
348 * that way. If it does for long enough, we've got a winner frequency.
349 */
350 if (prev_freq_khz != 0 && fabs(1 - freq_khz / prev_freq_khz) < 0.0001)
351 {
352 stable_count++;
354 break;
355 }
356 else
357 stable_count = 0;
358
361 }
362
363 /* Restore the previous clock source */
365
367 return 0; /* did not converge */
368
369 return (uint32) freq_khz;
370}
371
372#endif /* PG_INSTR_TSC_CLOCK */
#define Assert(condition)
Definition c.h:943
int64_t int64
Definition c.h:621
int32_t int32
Definition c.h:620
#define PG_INT64_MAX
Definition c.h:676
uint64_t uint64
Definition c.h:625
uint32_t uint32
Definition c.h:624
int timing_clock_source
Definition instr_time.c:64
static void set_ticks_per_ns_system(void)
Definition instr_time.c:137
uint64 max_ticks_no_overflow
Definition instr_time.c:62
static void set_ticks_per_ns(void)
Definition instr_time.c:122
void pg_initialize_timing(void)
Definition instr_time.c:82
int32 timing_tsc_frequency_khz
Definition instr_time.c:67
uint64 ticks_per_ns_scaled
Definition instr_time.c:61
bool pg_set_timing_clock_source(TimingClockSourceType source)
Definition instr_time.c:92
bool timing_initialized
Definition instr_time.c:63
bool timing_tsc_enabled
Definition instr_time.c:66
#define TICKS_TO_NS_SHIFT
Definition instr_time.h:89
#define INSTR_TIME_SET_CURRENT(t)
Definition instr_time.h:426
#define INSTR_TIME_GET_NANOSEC(t)
Definition instr_time.h:445
#define INSTR_TIME_SUBTRACT(x, y)
Definition instr_time.h:436
TimingClockSourceType
Definition instr_time.h:124
@ TIMING_CLOCK_SOURCE_SYSTEM
Definition instr_time.h:126
@ TIMING_CLOCK_SOURCE_AUTO
Definition instr_time.h:125
int i
Definition isn.c:77
static rewind_source * source
Definition pg_rewind.c:89
static char buf[DEFAULT_XLOG_SEG_SIZE]
static int fb(int x)
#define NS_PER_S
Definition uuid.c:31