PostgreSQL Source Code git master
Loading...
Searching...
No Matches
instr_time.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * instr_time.c
4 * Non-inline parts of the portable high-precision interval timing
5 * implementation
6 *
7 * Portions Copyright (c) 2026, PostgreSQL Global Development Group
8 *
9 *
10 * IDENTIFICATION
11 * src/common/instr_time.c
12 *
13 *-------------------------------------------------------------------------
14 */
15#ifndef FRONTEND
16#include "postgres.h"
17#else
18#include "postgres_fe.h"
19#endif
20
21#include <math.h>
22
23#include "port/pg_cpu.h"
25
26/*
27 * Stores what the number of ticks needs to be multiplied with to end up
28 * with nanoseconds using integer math.
29 *
30 * In certain cases (TSC on x86-64, and QueryPerformanceCounter on Windows)
31 * the ticks to nanoseconds conversion requires floating point math because:
32 *
33 * sec = ticks / frequency_hz
34 * ns = ticks / frequency_hz * 1,000,000,000
35 * ns = ticks * (1,000,000,000 / frequency_hz)
36 * ns = ticks * (1,000,000 / frequency_khz) <-- now in kilohertz
37 *
38 * Here, 'ns' is usually a floating point number. For example for a 2.5 GHz CPU
39 * the scaling factor becomes 1,000,000 / 2,500,000 = 0.4.
40 *
41 * To be able to use integer math we work around the lack of precision. We
42 * first scale the integer up (left shift by TICKS_TO_NS_SHIFT) and after the
43 * multiplication by the number of ticks in pg_ticks_to_ns() we shift right by
44 * the same amount.
45 *
46 * We remember the maximum number of ticks that can be multiplied by the scale
47 * factor without overflowing so we can check via a * b > max <=> a > max / b.
48 *
49 * However, as this is meant for interval measurements, it is unlikely that the
50 * overflow path is actually taken in typical scenarios, since overflows would
51 * only occur for intervals longer than 6.5 days.
52 *
53 * Note we utilize unsigned integers even though ticks are stored as a signed
54 * value to encourage compilers to generate better assembly, since we can be
55 * sure these values are not negative.
56 *
57 * In all other cases we are using clock_gettime(), which uses nanoseconds
58 * as ticks. Hence, we set the multiplier to zero, which causes pg_ticks_to_ns
59 * to return the original value.
60 */
63bool timing_initialized = false;
65
66bool timing_tsc_enabled = false;
68
69static void set_ticks_per_ns(void);
70static void set_ticks_per_ns_system(void);
71
72#if PG_INSTR_TSC_CLOCK
73static TscClockSourceInfo tsc_info = {.calibrated_frequency_khz = -1};
74
75static bool tsc_use_by_default(void);
76static void set_ticks_per_ns_for_tsc(void);
77#endif
78
79/*
80 * Initializes timing infrastructure. Must be called before making any use
81 * of INSTR* macros.
82 */
83void
85{
87 return;
88
90 timing_initialized = true;
91}
92
93bool
95{
97
98#if PG_INSTR_TSC_CLOCK
100
101 switch (source)
102 {
105 break;
107 timing_tsc_enabled = false;
108 break;
110 /* Tell caller TSC is not usable */
112 return false;
113 timing_tsc_enabled = true;
114 break;
115 }
116#endif
117
120 return true;
121}
122
123static void
125{
126#if PG_INSTR_TSC_CLOCK
128 {
130 return;
131 }
132#endif
134}
135
136#ifndef WIN32
137
138static void
144
145#else /* WIN32 */
146
147/* GetTimerFrequency returns counts per second */
148static inline double
150{
152
154 return (double) f.QuadPart;
155}
156
157static void
159{
162}
163
164#endif /* WIN32 */
165
166/* TSC specific logic */
167
168#if PG_INSTR_TSC_CLOCK
169
170static void tsc_detect_frequency(void);
172
173/*
174 * Initialize the TSC clock source by determining its usability and frequency.
175 *
176 * This can be called multiple times without causing repeated work, as
177 * timing_tsc_frequency_khz will be set to 0 if a prior call determined the
178 * TSC is not usable. On EXEC_BACKEND (Windows), the TSC frequency may also be
179 * set by restore_backend_variables.
180 */
181void
183{
186}
187
188static void
190{
193}
194
195/*
196 * Detect the TSC frequency and whether RDTSCP is available on x86-64.
197 *
198 * This can't be reliably determined at compile time, since the
199 * availability of an "invariant" TSC (that is not affected by CPU
200 * frequency changes) is dependent on the CPU architecture. Additionally,
201 * there are cases where TSC availability is impacted by virtualization,
202 * where a simple cpuid feature check would not be enough.
203 */
204static void
206{
208 tsc_info.frequency_khz = 0;
209 tsc_info.frequency_source[0] = '\0';
210
211 strlcat(tsc_info.frequency_source, "x86",
212 sizeof(tsc_info.frequency_source));
213
214 /* We require RDTSCP support and an invariant TSC, bail if not available */
216 {
217 strlcat(tsc_info.frequency_source, ", no rdtscp",
218 sizeof(tsc_info.frequency_source));
219 return;
220 }
221
223 {
224 strlcat(tsc_info.frequency_source, ", not invariant",
225 sizeof(tsc_info.frequency_source));
226 return;
227 }
228
229 /* Determine speed at which the TSC advances */
231 sizeof(tsc_info.frequency_source));
233 {
234 tsc_info.frequency_khz = timing_tsc_frequency_khz;
235 return;
236 }
237
238 /*
239 * CPUID did not give us the TSC frequency. We can instead measure the
240 * frequency by comparing ticks against walltime in a calibration loop.
241 */
242 if (tsc_info.calibrated_frequency_khz < 0)
243 tsc_info.calibrated_frequency_khz = pg_tsc_calibrate_frequency();
244
245 timing_tsc_frequency_khz = tsc_info.calibrated_frequency_khz;
247 {
248 strlcat(tsc_info.frequency_source, ", calibration",
249 sizeof(tsc_info.frequency_source));
250 tsc_info.frequency_khz = timing_tsc_frequency_khz;
251 }
252}
253
254/*
255 * Decides whether to use the TSC clock source if the user did not specify it
256 * one way or the other, and it is available (checked separately).
257 *
258 * Inspired by the Linux kernel's clocksource watchdog disable logic as updated
259 * in 2021 to reflect the reliability of the TSC on Intel platforms, see
260 * check_system_tsc_reliable() in arch/x86/kernel/tsc.c, as well as discussion
261 * in https://lore.kernel.org/lkml/87eekfk8bd.fsf@nanos.tec.linutronix.de/
262 * and https://lore.kernel.org/lkml/87a6pimt1f.ffs@nanos.tec.linutronix.de/
263 * for reference.
264 *
265 * When tsc_detect_frequency determines the TSC is viable (invariant, etc.), and
266 * we're on an Intel platform (determined via TSC_ADJUST), we consider the TSC
267 * trustworthy by default, matching the Linux kernel.
268 *
269 * On other CPU platforms (e.g. AMD), or in some virtual machines, we don't have
270 * an easy way to determine the TSC's reliability. If on Linux, we can check if
271 * TSC is the active clocksource, based on it having run the watchdog logic to
272 * monitor TSC correctness. For other platforms the user must explicitly enable
273 * it via GUC instead.
274 */
275static bool
277{
279 return true;
280
281#if defined(__linux__)
282 {
283 FILE *fp;
284 char buf[128];
285
286 fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r");
287 if (fp)
288 {
289 bool is_tsc = (fgets(buf, sizeof(buf), fp) != NULL &&
290 strcmp(buf, "tsc\n") == 0);
291
292 fclose(fp);
293 if (is_tsc)
294 return true;
295 }
296 }
297#endif
298
299 return false;
300}
301
302/*
303 * Calibrate the TSC frequency by comparing TSC ticks against walltime.
304 *
305 * Takes initial TSC and system clock snapshots, then loops, recomputing the
306 * frequency every TSC_CALIBRATION_SKIPS iterations from cumulative TSC
307 * ticks divided by elapsed time.
308 *
309 * Once the frequency estimate stabilizes (consecutive estimates agree), we
310 * consider it converged and the frequency in kHz is returned. If either the
311 * iteration limit or the time limit is reached without convergence, 0 is returned.
312 */
313#define TSC_CALIBRATION_MAX_NS (50 * NS_PER_MS)
314#define TSC_CALIBRATION_ITERATIONS 1000000
315#define TSC_CALIBRATION_SKIPS 100
316#define TSC_CALIBRATION_STABLE_CYCLES 10
317static uint32
319{
322 double freq_khz = 0;
323 double prev_freq_khz = 0;
324 int stable_count = 0;
327
328 /*
329 * Frequency must be initialized to avoid recursion via
330 * pg_set_timing_clock_source.
331 */
333
334 /* Ensure INSTR_* calls below work on system time */
336
338
341
342 for (int i = 0; i < TSC_CALIBRATION_ITERATIONS; i++)
343 {
348
350
351 now_tsc = pg_rdtscp();
352
355
356 /* Safety: bail out if we've taken too long */
358 break;
359
361
362 /*
363 * Skip if TSC hasn't advanced, or we walked backwards for some
364 * reason.
365 */
366 if (now_tsc == prev_tsc || elapsed_ns <= 0 || elapsed_ticks <= 0)
367 continue;
368
369 /*
370 * We only measure frequency every TSC_CALIBRATION_SKIPS iterations to
371 * avoid stabilizing based on just a handful of RDTSC instructions.
372 */
373 if (i % TSC_CALIBRATION_SKIPS != 0)
374 continue;
375
376 freq_khz = ((double) elapsed_ticks / elapsed_ns) * 1000 * 1000;
377
378 /*
379 * Once the relative change between freq_khz and prev_freq_khz is small
380 * (below 0.01%), check if it stays that way. If it does for long enough, we've got a winner frequency.
381 */
382 if (prev_freq_khz != 0 && fabs(1 - freq_khz / prev_freq_khz) < 0.0001)
383 {
384 stable_count++;
386 break;
387 }
388 else
389 stable_count = 0;
390
393 }
394
395 /* Restore the previous clock source */
397
399 return 0; /* did not converge */
400
401 return (uint32) freq_khz;
402}
403
404/*
405 * Returns TSC clock source information for diagnostic purposes.
406 *
407 * On first call, may run the TSC calibration loop (if not already done during
408 * frequency detection) which can take up to TSC_CALIBRATION_MAX_NS.
409 * Subsequent calls return cached results.
410 *
411 * Note: This would not return the right info in EXEC_BACKEND builds if it were
412 * used in the backend (which it currently is not), as tsc_info is not copied
413 * using read_backend_variables - only the TSC frequency is.
414 */
415const TscClockSourceInfo *
417{
418 if (tsc_info.frequency_khz > 0 && tsc_info.calibrated_frequency_khz < 0)
419 tsc_info.calibrated_frequency_khz = pg_tsc_calibrate_frequency();
420
421 return &tsc_info;
422}
423
424#endif /* PG_INSTR_TSC_CLOCK */
#define Assert(condition)
Definition c.h:943
int64_t int64
Definition c.h:621
int32_t int32
Definition c.h:620
#define PG_INT64_MAX
Definition c.h:676
uint64_t uint64
Definition c.h:625
uint32_t uint32
Definition c.h:624
int timing_clock_source
Definition instr_time.c:64
static void set_ticks_per_ns_system(void)
Definition instr_time.c:139
uint64 max_ticks_no_overflow
Definition instr_time.c:62
static void set_ticks_per_ns(void)
Definition instr_time.c:124
void pg_initialize_timing(void)
Definition instr_time.c:84
int32 timing_tsc_frequency_khz
Definition instr_time.c:67
uint64 ticks_per_ns_scaled
Definition instr_time.c:61
bool pg_set_timing_clock_source(TimingClockSourceType source)
Definition instr_time.c:94
bool timing_initialized
Definition instr_time.c:63
bool timing_tsc_enabled
Definition instr_time.c:66
#define TICKS_TO_NS_SHIFT
Definition instr_time.h:89
#define INSTR_TIME_SET_CURRENT(t)
Definition instr_time.h:434
#define INSTR_TIME_GET_NANOSEC(t)
Definition instr_time.h:453
#define INSTR_TIME_SUBTRACT(x, y)
Definition instr_time.h:444
TimingClockSourceType
Definition instr_time.h:124
@ TIMING_CLOCK_SOURCE_SYSTEM
Definition instr_time.h:126
@ TIMING_CLOCK_SOURCE_AUTO
Definition instr_time.h:125
int i
Definition isn.c:77
static rewind_source * source
Definition pg_rewind.c:89
static char buf[DEFAULT_XLOG_SEG_SIZE]
size_t strlcat(char *dst, const char *src, size_t siz)
Definition strlcat.c:34
static int fb(int x)
#define NS_PER_S
Definition uuid.c:31