PostgreSQL Source Code  git master
pgstat.c
Go to the documentation of this file.
1 /* ----------
2  * pgstat.c
3  *
4  * All the statistics collector stuff hacked up in one big, ugly file.
5  *
6  * TODO: - Separate collector, postmaster and backend stuff
7  * into different files.
8  *
9  * - Add some automatic call for pgstat vacuuming.
10  *
11  * - Add a pgstat config column to pg_database, so this
12  * entire thing can be enabled/disabled on a per db basis.
13  *
14  * Copyright (c) 2001-2020, PostgreSQL Global Development Group
15  *
16  * src/backend/postmaster/pgstat.c
17  * ----------
18  */
19 #include "postgres.h"
20 
21 #include <unistd.h>
22 #include <fcntl.h>
23 #include <sys/param.h>
24 #include <sys/time.h>
25 #include <sys/socket.h>
26 #include <netdb.h>
27 #include <netinet/in.h>
28 #include <arpa/inet.h>
29 #include <signal.h>
30 #include <time.h>
31 #ifdef HAVE_SYS_SELECT_H
32 #include <sys/select.h>
33 #endif
34 
35 #include "access/heapam.h"
36 #include "access/htup_details.h"
37 #include "access/tableam.h"
38 #include "access/transam.h"
39 #include "access/twophase_rmgr.h"
40 #include "access/xact.h"
41 #include "catalog/pg_database.h"
42 #include "catalog/pg_proc.h"
43 #include "common/ip.h"
44 #include "libpq/libpq.h"
45 #include "libpq/pqsignal.h"
46 #include "mb/pg_wchar.h"
47 #include "miscadmin.h"
48 #include "pg_trace.h"
49 #include "pgstat.h"
50 #include "postmaster/autovacuum.h"
52 #include "postmaster/interrupt.h"
53 #include "postmaster/postmaster.h"
54 #include "replication/walsender.h"
55 #include "storage/backendid.h"
56 #include "storage/dsm.h"
57 #include "storage/fd.h"
58 #include "storage/ipc.h"
59 #include "storage/latch.h"
60 #include "storage/lmgr.h"
61 #include "storage/pg_shmem.h"
62 #include "storage/procsignal.h"
63 #include "storage/sinvaladt.h"
64 #include "utils/ascii.h"
65 #include "utils/guc.h"
66 #include "utils/memutils.h"
67 #include "utils/ps_status.h"
68 #include "utils/rel.h"
69 #include "utils/snapmgr.h"
70 #include "utils/timestamp.h"
71 
72 /* ----------
73  * Timer definitions.
74  * ----------
75  */
76 #define PGSTAT_STAT_INTERVAL 500 /* Minimum time between stats file
77  * updates; in milliseconds. */
78 
79 #define PGSTAT_RETRY_DELAY 10 /* How long to wait between checks for a
80  * new file; in milliseconds. */
81 
82 #define PGSTAT_MAX_WAIT_TIME 10000 /* Maximum time to wait for a stats
83  * file update; in milliseconds. */
84 
85 #define PGSTAT_INQ_INTERVAL 640 /* How often to ping the collector for a
86  * new file; in milliseconds. */
87 
88 #define PGSTAT_RESTART_INTERVAL 60 /* How often to attempt to restart a
89  * failed statistics collector; in
90  * seconds. */
91 
/*
 * Loop counts derived from the timers above: how many waits of
 * PGSTAT_RETRY_DELAY milliseconds add up to PGSTAT_MAX_WAIT_TIME
 * (10000 / 10 = 1000 iterations) and to PGSTAT_INQ_INTERVAL
 * (640 / 10 = 64 iterations), respectively.
 */
92 #define PGSTAT_POLL_LOOP_COUNT (PGSTAT_MAX_WAIT_TIME / PGSTAT_RETRY_DELAY)
93 #define PGSTAT_INQ_LOOP_COUNT (PGSTAT_INQ_INTERVAL / PGSTAT_RETRY_DELAY)
94 
95 /* Minimum receive buffer size for the collector's socket. */
96 #define PGSTAT_MIN_RCVBUF (100 * 1024)
97 
98 
99 /* ----------
100  * The initial size hints for the hash tables used in the collector.
101  * ----------
102  */
103 #define PGSTAT_DB_HASH_SIZE 16
104 #define PGSTAT_TAB_HASH_SIZE 512
105 #define PGSTAT_FUNCTION_HASH_SIZE 512
106 
107 
108 /* ----------
109  * Total number of backends including auxiliary
110  *
111  * We reserve a slot for each possible BackendId, plus one for each
112  * possible auxiliary process type. (This scheme assumes there is not
113  * more than one of any auxiliary process type at a time.) MaxBackends
114  * includes autovacuum workers and background workers as well.
115  * ----------
116  */
117 #define NumBackendStatSlots (MaxBackends + NUM_AUXPROCTYPES)
118 
119 
120 /* ----------
121  * GUC parameters
122  * ----------
123  */
125 bool pgstat_track_counts = false;
128 
129 /* ----------
130  * Built from GUC parameter
131  * ----------
132  */
134 char *pgstat_stat_filename = NULL;
135 char *pgstat_stat_tmpname = NULL;
136 
137 /*
138  * BgWriter global statistics counters (unused in other processes).
139  * Stored directly in a stats message structure so it can be sent
140  * without needing to copy things around. We assume this inits to zeroes.
141  */
143 
144 /*
145  * List of SLRU names that we keep stats for. There is no central registry of
146  * SLRUs, so we use this fixed list instead. The "other" entry is used for
147  * all SLRUs without an explicit entry (e.g. SLRUs in extensions).
 *
 * SLRU_NUM_ELEMENTS is computed from the length of this array, and the
 * pending per-SLRU counters are documented as being one-to-one with it, so
 * adding, removing or reordering names here changes both.
148  */
149 static const char *const slru_names[] = {
150  "CommitTs",
151  "MultiXactMember",
152  "MultiXactOffset",
153  "Notify",
154  "Serial",
155  "Subtrans",
156  "Xact",
157  "other" /* has to be last */
158 };
159 
160 #define SLRU_NUM_ELEMENTS lengthof(slru_names)
161 
162 /*
163  * SLRU statistics counts waiting to be sent to the collector. These are
164  * stored directly in stats message format so they can be sent without needing
165  * to copy things around. We assume this variable inits to zeroes. Entries
166  * are one-to-one with slru_names[].
167  */
169 
170 /* ----------
171  * Local data
172  * ----------
173  */
175 
177 
179 
180 static bool pgStatRunningInCollector = false;
181 
182 /*
183  * Structures in which backends store per-table info that's waiting to be
184  * sent to the collector.
185  *
186  * NOTE: once allocated, TabStatusArray structures are never moved or deleted
187  * for the life of the backend. Also, we zero out the t_id fields of the
188  * contained PgStat_TableStatus structs whenever they are not actively in use.
189  * This allows relcache pgstat_info pointers to be treated as long-lived data,
190  * avoiding repeated searches in pgstat_initstats() when a relation is
191  * repeatedly opened during a transaction.
192  */
193 #define TABSTAT_QUANTUM 100 /* we alloc this many at a time */
194 
195 typedef struct TabStatusArray
196 {
197  struct TabStatusArray *tsa_next; /* link to next array, if any */
198  int tsa_used; /* # entries currently used */
201 
203 
204 /*
205  * pgStatTabHash entry: map from relation OID to PgStat_TableStatus pointer
206  */
207 typedef struct TabStatHashEntry
208 {
212 
213 /*
214  * Hash table for O(1) t_id -> tsa_entry lookup
215  */
216 static HTAB *pgStatTabHash = NULL;
217 
218 /*
219  * Backends store per-function info that's waiting to be sent to the collector
220  * in this hash table (indexed by function OID).
221  */
222 static HTAB *pgStatFunctions = NULL;
223 
224 /*
225  * Indicates if backend has some function stats that it hasn't yet
226  * sent to the collector.
227  */
228 static bool have_function_stats = false;
229 
230 /*
231  * Tuple insertion/deletion counts for an open transaction can't be propagated
232  * into PgStat_TableStatus counters until we know if it is going to commit
233  * or abort. Hence, we keep these counts in per-subxact structs that live
234  * in TopTransactionContext. This data structure is designed on the assumption
235  * that subxacts won't usually modify very many tables.
236  */
237 typedef struct PgStat_SubXactStatus
238 {
239  int nest_level; /* subtransaction nest level */
240  struct PgStat_SubXactStatus *prev; /* higher-level subxact if any */
241  PgStat_TableXactStatus *first; /* head of list for this subxact */
243 
245 
246 static int pgStatXactCommit = 0;
247 static int pgStatXactRollback = 0;
250 
251 /* Record that's written to 2PC state file when pgstat state is persisted */
252 typedef struct TwoPhasePgStatRecord
253 {
254  PgStat_Counter tuples_inserted; /* tuples inserted in xact */
255  PgStat_Counter tuples_updated; /* tuples updated in xact */
256  PgStat_Counter tuples_deleted; /* tuples deleted in xact */
257  PgStat_Counter inserted_pre_trunc; /* tuples inserted prior to truncate */
258  PgStat_Counter updated_pre_trunc; /* tuples updated prior to truncate */
259  PgStat_Counter deleted_pre_trunc; /* tuples deleted prior to truncate */
260  Oid t_id; /* table's OID */
261  bool t_shared; /* is it a shared catalog? */
262  bool t_truncated; /* was the relation truncated? */
264 
265 /*
266  * Info about current "snapshot" of stats file
267  */
269 static HTAB *pgStatDBHash = NULL;
270 
271 /* Status for backends including auxiliary */
273 
274 /* Total number of backends including auxiliary */
275 static int localNumBackends = 0;
276 
277 /*
278  * Cluster wide statistics, kept in the stats collector.
279  * Contains statistics that are not collected per database
280  * or per table.
281  */
285 
286 /*
287  * List of OIDs of databases we need to write out. If an entry is InvalidOid,
288  * it means to write only the shared-catalog stats ("DB 0"); otherwise, we
289  * will write both that DB's data and the shared stats.
290  */
292 
293 /*
294  * Total time charged to functions so far in the current backend.
295  * We use this to help separate "self" and "other" time charges.
296  * (We assume this initializes to zero.)
297  */
299 
300 
301 /* ----------
302  * Local function forward declarations
303  * ----------
304  */
305 #ifdef EXEC_BACKEND
306 static pid_t pgstat_forkexec(void);
307 #endif
308 
309 NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn();
310 static void pgstat_beshutdown_hook(int code, Datum arg);
311 
312 static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create);
314  Oid tableoid, bool create);
315 static void pgstat_write_statsfiles(bool permanent, bool allDbs);
316 static void pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent);
317 static HTAB *pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep);
318 static void pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, bool permanent);
319 static void backend_read_statsfile(void);
320 static void pgstat_read_current_status(void);
321 
322 static bool pgstat_write_statsfile_needed(void);
323 static bool pgstat_db_requested(Oid databaseid);
324 
325 static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg);
326 static void pgstat_send_funcstats(void);
327 static void pgstat_send_slru(void);
328 static HTAB *pgstat_collect_oids(Oid catalogid, AttrNumber anum_oid);
329 
330 static PgStat_TableStatus *get_tabstat_entry(Oid rel_id, bool isshared);
331 
332 static void pgstat_setup_memcxt(void);
333 
334 static const char *pgstat_get_wait_activity(WaitEventActivity w);
335 static const char *pgstat_get_wait_client(WaitEventClient w);
336 static const char *pgstat_get_wait_ipc(WaitEventIPC w);
337 static const char *pgstat_get_wait_timeout(WaitEventTimeout w);
338 static const char *pgstat_get_wait_io(WaitEventIO w);
339 
340 static void pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype);
341 static void pgstat_send(void *msg, int len);
342 
343 static void pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len);
344 static void pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len);
345 static void pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len);
346 static void pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len);
347 static void pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len);
351 static void pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len);
352 static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len);
353 static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
354 static void pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len);
355 static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
356 static void pgstat_recv_slru(PgStat_MsgSLRU *msg, int len);
357 static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
358 static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
360 static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len);
362 static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
363 
364 /* ------------------------------------------------------------
365  * Public functions called from postmaster follow
366  * ------------------------------------------------------------
367  */
368 
369 /* ----------
370  * pgstat_init() -
371  *
372  * Called from postmaster at startup. Create the resources required
373  * by the statistics collector process. If unable to do so, do not
374  * fail --- better to let the postmaster start with stats collection
375  * disabled.
376  * ----------
377  */
378 void
380 {
381  ACCEPT_TYPE_ARG3 alen;
382  struct addrinfo *addrs = NULL,
383  *addr,
384  hints;
385  int ret;
386  fd_set rset;
387  struct timeval tv;
388  char test_byte;
389  int sel_res;
390  int tries = 0;
391 
392 #define TESTBYTEVAL ((char) 199)
393 
394  /*
395  * This static assertion verifies that we didn't mess up the calculations
396  * involved in selecting maximum payload sizes for our UDP messages.
397  * Because the only consequence of overrunning PGSTAT_MAX_MSG_SIZE would
398  * be silent performance loss from fragmentation, it seems worth having a
399  * compile-time cross-check that we didn't.
400  */
402  "maximum stats message size exceeds PGSTAT_MAX_MSG_SIZE");
403 
404  /*
405  * Create the UDP socket for sending and receiving statistics messages
406  */
407  hints.ai_flags = AI_PASSIVE;
408  hints.ai_family = AF_UNSPEC;
409  hints.ai_socktype = SOCK_DGRAM;
410  hints.ai_protocol = 0;
411  hints.ai_addrlen = 0;
412  hints.ai_addr = NULL;
413  hints.ai_canonname = NULL;
414  hints.ai_next = NULL;
415  ret = pg_getaddrinfo_all("localhost", NULL, &hints, &addrs);
416  if (ret || !addrs)
417  {
418  ereport(LOG,
419  (errmsg("could not resolve \"localhost\": %s",
420  gai_strerror(ret))));
421  goto startup_failed;
422  }
423 
424  /*
425  * On some platforms, pg_getaddrinfo_all() may return multiple addresses
426  * only one of which will actually work (eg, both IPv6 and IPv4 addresses
427  * when kernel will reject IPv6). Worse, the failure may occur at the
428  * bind() or perhaps even connect() stage. So we must loop through the
429  * results till we find a working combination. We will generate LOG
430  * messages, but no error, for bogus combinations.
431  */
432  for (addr = addrs; addr; addr = addr->ai_next)
433  {
434 #ifdef HAVE_UNIX_SOCKETS
435  /* Ignore AF_UNIX sockets, if any are returned. */
436  if (addr->ai_family == AF_UNIX)
437  continue;
438 #endif
439 
440  if (++tries > 1)
441  ereport(LOG,
442  (errmsg("trying another address for the statistics collector")));
443 
444  /*
445  * Create the socket.
446  */
447  if ((pgStatSock = socket(addr->ai_family, SOCK_DGRAM, 0)) == PGINVALID_SOCKET)
448  {
449  ereport(LOG,
451  errmsg("could not create socket for statistics collector: %m")));
452  continue;
453  }
454 
455  /*
456  * Bind it to a kernel assigned port on localhost and get the assigned
457  * port via getsockname().
458  */
459  if (bind(pgStatSock, addr->ai_addr, addr->ai_addrlen) < 0)
460  {
461  ereport(LOG,
463  errmsg("could not bind socket for statistics collector: %m")));
466  continue;
467  }
468 
469  alen = sizeof(pgStatAddr);
470  if (getsockname(pgStatSock, (struct sockaddr *) &pgStatAddr, &alen) < 0)
471  {
472  ereport(LOG,
474  errmsg("could not get address of socket for statistics collector: %m")));
477  continue;
478  }
479 
480  /*
481  * Connect the socket to its own address. This saves a few cycles by
482  * not having to respecify the target address on every send. This also
483  * provides a kernel-level check that only packets from this same
484  * address will be received.
485  */
486  if (connect(pgStatSock, (struct sockaddr *) &pgStatAddr, alen) < 0)
487  {
488  ereport(LOG,
490  errmsg("could not connect socket for statistics collector: %m")));
493  continue;
494  }
495 
496  /*
497  * Try to send and receive a one-byte test message on the socket. This
498  * is to catch situations where the socket can be created but will not
499  * actually pass data (for instance, because kernel packet filtering
500  * rules prevent it).
501  */
502  test_byte = TESTBYTEVAL;
503 
504 retry1:
505  if (send(pgStatSock, &test_byte, 1, 0) != 1)
506  {
507  if (errno == EINTR)
508  goto retry1; /* if interrupted, just retry */
509  ereport(LOG,
511  errmsg("could not send test message on socket for statistics collector: %m")));
514  continue;
515  }
516 
517  /*
518  * There could possibly be a little delay before the message can be
519  * received. We arbitrarily allow up to half a second before deciding
520  * it's broken.
521  */
522  for (;;) /* need a loop to handle EINTR */
523  {
524  FD_ZERO(&rset);
525  FD_SET(pgStatSock, &rset);
526 
527  tv.tv_sec = 0;
528  tv.tv_usec = 500000;
529  sel_res = select(pgStatSock + 1, &rset, NULL, NULL, &tv);
530  if (sel_res >= 0 || errno != EINTR)
531  break;
532  }
533  if (sel_res < 0)
534  {
535  ereport(LOG,
537  errmsg("select() failed in statistics collector: %m")));
540  continue;
541  }
542  if (sel_res == 0 || !FD_ISSET(pgStatSock, &rset))
543  {
544  /*
545  * This is the case we actually think is likely, so take pains to
546  * give a specific message for it.
547  *
548  * errno will not be set meaningfully here, so don't use it.
549  */
550  ereport(LOG,
551  (errcode(ERRCODE_CONNECTION_FAILURE),
552  errmsg("test message did not get through on socket for statistics collector")));
555  continue;
556  }
557 
558  test_byte++; /* just make sure variable is changed */
559 
560 retry2:
561  if (recv(pgStatSock, &test_byte, 1, 0) != 1)
562  {
563  if (errno == EINTR)
564  goto retry2; /* if interrupted, just retry */
565  ereport(LOG,
567  errmsg("could not receive test message on socket for statistics collector: %m")));
570  continue;
571  }
572 
573  if (test_byte != TESTBYTEVAL) /* strictly paranoia ... */
574  {
575  ereport(LOG,
576  (errcode(ERRCODE_INTERNAL_ERROR),
577  errmsg("incorrect test message transmission on socket for statistics collector")));
580  continue;
581  }
582 
583  /* If we get here, we have a working socket */
584  break;
585  }
586 
587  /* Did we find a working address? */
588  if (!addr || pgStatSock == PGINVALID_SOCKET)
589  goto startup_failed;
590 
591  /*
592  * Set the socket to non-blocking IO. This ensures that if the collector
593  * falls behind, statistics messages will be discarded; backends won't
594  * block waiting to send messages to the collector.
595  */
597  {
598  ereport(LOG,
600  errmsg("could not set statistics collector socket to nonblocking mode: %m")));
601  goto startup_failed;
602  }
603 
604  /*
605  * Try to ensure that the socket's receive buffer is at least
606  * PGSTAT_MIN_RCVBUF bytes, so that it won't easily overflow and lose
607  * data. Use of UDP protocol means that we are willing to lose data under
608  * heavy load, but we don't want it to happen just because of ridiculously
609  * small default buffer sizes (such as 8KB on older Windows versions).
610  */
611  {
612  int old_rcvbuf;
613  int new_rcvbuf;
614  ACCEPT_TYPE_ARG3 rcvbufsize = sizeof(old_rcvbuf);
615 
616  if (getsockopt(pgStatSock, SOL_SOCKET, SO_RCVBUF,
617  (char *) &old_rcvbuf, &rcvbufsize) < 0)
618  {
619  elog(LOG, "getsockopt(SO_RCVBUF) failed: %m");
620  /* if we can't get existing size, always try to set it */
621  old_rcvbuf = 0;
622  }
623 
624  new_rcvbuf = PGSTAT_MIN_RCVBUF;
625  if (old_rcvbuf < new_rcvbuf)
626  {
627  if (setsockopt(pgStatSock, SOL_SOCKET, SO_RCVBUF,
628  (char *) &new_rcvbuf, sizeof(new_rcvbuf)) < 0)
629  elog(LOG, "setsockopt(SO_RCVBUF) failed: %m");
630  }
631  }
632 
633  pg_freeaddrinfo_all(hints.ai_family, addrs);
634 
635  /* Now that we have a long-lived socket, tell fd.c about it. */
637 
638  return;
639 
640 startup_failed:
641  ereport(LOG,
642  (errmsg("disabling statistics collector for lack of working socket")));
643 
644  if (addrs)
645  pg_freeaddrinfo_all(hints.ai_family, addrs);
646 
650 
651  /*
652  * Adjust GUC variables to suppress useless activity, and for debugging
653  * purposes (seeing track_counts off is a clue that we failed here). We
654  * use PGC_S_OVERRIDE because there is no point in trying to turn it back
655  * on from postgresql.conf without a restart.
656  */
657  SetConfigOption("track_counts", "off", PGC_INTERNAL, PGC_S_OVERRIDE);
658 }
659 
660 /*
661  * subroutine for pgstat_reset_all
 *
 * Scans "directory" and unlinks every entry whose name is exactly
 * "global.tmp", "global.stat", "db_<number>.tmp" or "db_<number>.stat".
 * All other directory entries are left untouched. The result of unlink()
 * is not checked.
662  */
663 static void
665 {
666  DIR *dir;
667  struct dirent *entry;
668  char fname[MAXPGPATH * 2];
669 
670  dir = AllocateDir(directory);
671  while ((entry = ReadDir(dir, directory)) != NULL)
672  {
 /* length of the recognized name prefix, up to and including the '.' */
673  int nchars;
 /* receives the number parsed by %u; only the match itself matters */
674  Oid tmp_oid;
675 
676  /*
677  * Skip directory entries that don't match the file names we write.
678  * See get_dbstat_filename for the database-specific pattern.
679  */
680  if (strncmp(entry->d_name, "global.", 7) == 0)
681  nchars = 7;
682  else
683  {
 /*
  * %n stores the number of characters consumed so far; it is only
  * reached if "db_", the number and the '.' all matched, so nchars
  * stays 0 for any non-matching name.
  */
684  nchars = 0;
685  (void) sscanf(entry->d_name, "db_%u.%n",
686  &tmp_oid, &nchars);
687  if (nchars <= 0)
688  continue;
689  /* %u allows leading whitespace, so reject that */
690  if (strchr("0123456789", entry->d_name[3]) == NULL)
691  continue;
692  }
693 
 /* whatever follows the prefix must be exactly "tmp" or "stat" */
694  if (strcmp(entry->d_name + nchars, "tmp") != 0 &&
695  strcmp(entry->d_name + nchars, "stat") != 0)
696  continue;
697 
 /* build the full path of the matching entry and remove it */
698  snprintf(fname, sizeof(fname), "%s/%s", directory,
699  entry->d_name);
700  unlink(fname);
701  }
702  FreeDir(dir);
703 }
704 
705 /*
706  * pgstat_reset_all() -
707  *
708  * Remove the stats files. This is currently used only if WAL
709  * recovery is needed after a crash.
710  */
711 void
713 {
716 }
717 
718 #ifdef EXEC_BACKEND
719 
720 /*
721  * pgstat_forkexec() -
722  *
723  * Format up the arglist for, then fork and exec, statistics collector process
724  */
725 static pid_t
726 pgstat_forkexec(void)
727 {
728  char *av[10];
729  int ac = 0;
730 
731  av[ac++] = "postgres";
732  av[ac++] = "--forkcol";
733  av[ac++] = NULL; /* filled in by postmaster_forkexec */
734 
735  av[ac] = NULL;
736  Assert(ac < lengthof(av));
737 
738  return postmaster_forkexec(ac, av);
739 }
740 #endif /* EXEC_BACKEND */
741 
742 
743 /*
744  * pgstat_start() -
745  *
746  * Called from postmaster at startup or after an existing collector
747  * died. Attempt to fire up a fresh statistics collector.
748  *
749  * Returns PID of child process, or 0 on failure.
750  *
751  * Note: on failure, we will be called again from the postmaster main loop.
752  */
753 int
755 {
756  time_t curtime;
757  pid_t pgStatPid;
758 
759  /*
760  * Check that the socket is there, else pgstat_init failed and we can do
761  * nothing useful.
762  */
764  return 0;
765 
766  /*
767  * Do nothing if too soon since last collector start. This is a safety
768  * valve to protect against continuous respawn attempts if the collector
769  * is dying immediately at launch. Note that since we will be re-called
770  * from the postmaster main loop, we will get another chance later.
771  */
772  curtime = time(NULL);
773  if ((unsigned int) (curtime - last_pgstat_start_time) <
774  (unsigned int) PGSTAT_RESTART_INTERVAL)
775  return 0;
776  last_pgstat_start_time = curtime;
777 
778  /*
779  * Okay, fork off the collector.
780  */
781 #ifdef EXEC_BACKEND
782  switch ((pgStatPid = pgstat_forkexec()))
783 #else
784  switch ((pgStatPid = fork_process()))
785 #endif
786  {
787  case -1:
788  ereport(LOG,
789  (errmsg("could not fork statistics collector: %m")));
790  return 0;
791 
792 #ifndef EXEC_BACKEND
793  case 0:
794  /* in postmaster child ... */
796 
797  /* Close the postmaster's sockets */
798  ClosePostmasterPorts(false);
799 
800  /* Drop our connection to postmaster's shared memory, as well */
801  dsm_detach_all();
803 
804  PgstatCollectorMain(0, NULL);
805  break;
806 #endif
807 
808  default:
809  return (int) pgStatPid;
810  }
811 
812  /* shouldn't get here */
813  return 0;
814 }
815 
816 void
818 {
820 }
821 
822 /* ------------------------------------------------------------
823  * Public functions used by backends follow
824  *------------------------------------------------------------
825  */
826 
827 
828 /* ----------
829  * pgstat_report_stat() -
830  *
831  * Must be called by processes that perform DML: tcop/postgres.c, logical
832  * receiver processes, SPI worker, etc. to send the so far collected
833  * per-table and function usage statistics to the collector. Note that this
834  * is called only when not within a transaction, so it is fair to use
835  * transaction stop time as an approximation of current time.
836  * ----------
837  */
838 void
840 {
841  /* we assume this inits to all zeroes: */
842  static const PgStat_TableCounts all_zeroes;
843  static TimestampTz last_report = 0;
844 
846  PgStat_MsgTabstat regular_msg;
847  PgStat_MsgTabstat shared_msg;
848  TabStatusArray *tsa;
849  int i;
850 
851  /* Don't expend a clock check if nothing to do */
852  if ((pgStatTabList == NULL || pgStatTabList->tsa_used == 0) &&
853  pgStatXactCommit == 0 && pgStatXactRollback == 0 &&
855  return;
856 
857  /*
858  * Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL
859  * msec since we last sent one, or the caller wants to force stats out.
860  */
862  if (!force &&
864  return;
865  last_report = now;
866 
867  /*
868  * Destroy pgStatTabHash before we start invalidating PgStat_TableStatus
869  * entries it points to. (Should we fail partway through the loop below,
870  * it's okay to have removed the hashtable already --- the only
871  * consequence is we'd get multiple entries for the same table in the
872  * pgStatTabList, and that's safe.)
873  */
874  if (pgStatTabHash)
875  hash_destroy(pgStatTabHash);
876  pgStatTabHash = NULL;
877 
878  /*
879  * Scan through the TabStatusArray struct(s) to find tables that actually
880  * have counts, and build messages to send. We have to separate shared
881  * relations from regular ones because the databaseid field in the message
882  * header has to depend on that.
883  */
884  regular_msg.m_databaseid = MyDatabaseId;
885  shared_msg.m_databaseid = InvalidOid;
886  regular_msg.m_nentries = 0;
887  shared_msg.m_nentries = 0;
888 
889  for (tsa = pgStatTabList; tsa != NULL; tsa = tsa->tsa_next)
890  {
891  for (i = 0; i < tsa->tsa_used; i++)
892  {
893  PgStat_TableStatus *entry = &tsa->tsa_entries[i];
894  PgStat_MsgTabstat *this_msg;
895  PgStat_TableEntry *this_ent;
896 
897  /* Shouldn't have any pending transaction-dependent counts */
898  Assert(entry->trans == NULL);
899 
900  /*
901  * Ignore entries that didn't accumulate any actual counts, such
902  * as indexes that were opened by the planner but not used.
903  */
904  if (memcmp(&entry->t_counts, &all_zeroes,
905  sizeof(PgStat_TableCounts)) == 0)
906  continue;
907 
908  /*
909  * OK, insert data into the appropriate message, and send if full.
910  */
911  this_msg = entry->t_shared ? &shared_msg : &regular_msg;
912  this_ent = &this_msg->m_entry[this_msg->m_nentries];
913  this_ent->t_id = entry->t_id;
914  memcpy(&this_ent->t_counts, &entry->t_counts,
915  sizeof(PgStat_TableCounts));
916  if (++this_msg->m_nentries >= PGSTAT_NUM_TABENTRIES)
917  {
918  pgstat_send_tabstat(this_msg);
919  this_msg->m_nentries = 0;
920  }
921  }
922  /* zero out PgStat_TableStatus structs after use */
923  MemSet(tsa->tsa_entries, 0,
924  tsa->tsa_used * sizeof(PgStat_TableStatus));
925  tsa->tsa_used = 0;
926  }
927 
928  /*
929  * Send partial messages. Make sure that any pending xact commit/abort
930  * gets counted, even if there are no table stats to send.
931  */
932  if (regular_msg.m_nentries > 0 ||
934  pgstat_send_tabstat(&regular_msg);
935  if (shared_msg.m_nentries > 0)
936  pgstat_send_tabstat(&shared_msg);
937 
938  /* Now, send function statistics */
940 
941  /* Finally send SLRU statistics */
943 }
944 
945 /*
946  * Subroutine for pgstat_report_stat: finish and send a tabstat message
947  */
948 static void
950 {
951  int n;
952  int len;
953 
954  /* It's unlikely we'd get here with no socket, but maybe not impossible */
956  return;
957 
958  /*
959  * Report and reset accumulated xact commit/rollback and I/O timings
960  * whenever we send a normal tabstat message
961  */
962  if (OidIsValid(tsmsg->m_databaseid))
963  {
968  pgStatXactCommit = 0;
969  pgStatXactRollback = 0;
972  }
973  else
974  {
975  tsmsg->m_xact_commit = 0;
976  tsmsg->m_xact_rollback = 0;
977  tsmsg->m_block_read_time = 0;
978  tsmsg->m_block_write_time = 0;
979  }
980 
981  n = tsmsg->m_nentries;
982  len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
983  n * sizeof(PgStat_TableEntry);
984 
986  pgstat_send(tsmsg, len);
987 }
988 
989 /*
990  * Subroutine for pgstat_report_stat: populate and send a function stat message
991  */
992 static void
994 {
995  /* we assume this inits to all zeroes: */
996  static const PgStat_FunctionCounts all_zeroes;
997 
998  PgStat_MsgFuncstat msg;
1000  HASH_SEQ_STATUS fstat;
1001 
1002  if (pgStatFunctions == NULL)
1003  return;
1004 
1006  msg.m_databaseid = MyDatabaseId;
1007  msg.m_nentries = 0;
1008 
1009  hash_seq_init(&fstat, pgStatFunctions);
1010  while ((entry = (PgStat_BackendFunctionEntry *) hash_seq_search(&fstat)) != NULL)
1011  {
1012  PgStat_FunctionEntry *m_ent;
1013 
1014  /* Skip it if no counts accumulated since last time */
1015  if (memcmp(&entry->f_counts, &all_zeroes,
1016  sizeof(PgStat_FunctionCounts)) == 0)
1017  continue;
1018 
1019  /* need to convert format of time accumulators */
1020  m_ent = &msg.m_entry[msg.m_nentries];
1021  m_ent->f_id = entry->f_id;
1022  m_ent->f_numcalls = entry->f_counts.f_numcalls;
1025 
1026  if (++msg.m_nentries >= PGSTAT_NUM_FUNCENTRIES)
1027  {
1028  pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) +
1029  msg.m_nentries * sizeof(PgStat_FunctionEntry));
1030  msg.m_nentries = 0;
1031  }
1032 
1033  /* reset the entry's counts */
1034  MemSet(&entry->f_counts, 0, sizeof(PgStat_FunctionCounts));
1035  }
1036 
1037  if (msg.m_nentries > 0)
1038  pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) +
1039  msg.m_nentries * sizeof(PgStat_FunctionEntry));
1040 
1041  have_function_stats = false;
1042 }
1043 
1044 
1045 /* ----------
1046  * pgstat_vacuum_stat() -
1047  *
1048  * Will tell the collector about objects it can get rid of.
1049  * ----------
1050  */
1051 void
1053 {
1054  HTAB *htab;
1055  PgStat_MsgTabpurge msg;
1056  PgStat_MsgFuncpurge f_msg;
1057  HASH_SEQ_STATUS hstat;
1058  PgStat_StatDBEntry *dbentry;
1059  PgStat_StatTabEntry *tabentry;
1060  PgStat_StatFuncEntry *funcentry;
1061  int len;
1062 
1064  return;
1065 
1066  /*
1067  * If not done for this transaction, read the statistics collector stats
1068  * file into some hash tables.
1069  */
1071 
1072  /*
1073  * Read pg_database and make a list of OIDs of all existing databases
1074  */
1075  htab = pgstat_collect_oids(DatabaseRelationId, Anum_pg_database_oid);
1076 
1077  /*
1078  * Search the database hash table for dead databases and tell the
1079  * collector to drop them.
1080  */
1081  hash_seq_init(&hstat, pgStatDBHash);
1082  while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
1083  {
1084  Oid dbid = dbentry->databaseid;
1085 
1087 
1088  /* the DB entry for shared tables (with InvalidOid) is never dropped */
1089  if (OidIsValid(dbid) &&
1090  hash_search(htab, (void *) &dbid, HASH_FIND, NULL) == NULL)
1091  pgstat_drop_database(dbid);
1092  }
1093 
1094  /* Clean up */
1095  hash_destroy(htab);
1096 
1097  /*
1098  * Lookup our own database entry; if not found, nothing more to do.
1099  */
1100  dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
1101  (void *) &MyDatabaseId,
1102  HASH_FIND, NULL);
1103  if (dbentry == NULL || dbentry->tables == NULL)
1104  return;
1105 
1106  /*
1107  * Similarly to above, make a list of all known relations in this DB.
1108  */
1109  htab = pgstat_collect_oids(RelationRelationId, Anum_pg_class_oid);
1110 
1111  /*
1112  * Initialize our messages table counter to zero
1113  */
1114  msg.m_nentries = 0;
1115 
1116  /*
1117  * Check for all tables listed in stats hashtable if they still exist.
1118  */
1119  hash_seq_init(&hstat, dbentry->tables);
1120  while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&hstat)) != NULL)
1121  {
1122  Oid tabid = tabentry->tableid;
1123 
1125 
1126  if (hash_search(htab, (void *) &tabid, HASH_FIND, NULL) != NULL)
1127  continue;
1128 
1129  /*
1130  * Not there, so add this table's Oid to the message
1131  */
1132  msg.m_tableid[msg.m_nentries++] = tabid;
1133 
1134  /*
1135  * If the message is full, send it out and reinitialize to empty
1136  */
1137  if (msg.m_nentries >= PGSTAT_NUM_TABPURGE)
1138  {
1139  len = offsetof(PgStat_MsgTabpurge, m_tableid[0])
1140  + msg.m_nentries * sizeof(Oid);
1141 
1143  msg.m_databaseid = MyDatabaseId;
1144  pgstat_send(&msg, len);
1145 
1146  msg.m_nentries = 0;
1147  }
1148  }
1149 
1150  /*
1151  * Send the rest
1152  */
1153  if (msg.m_nentries > 0)
1154  {
1155  len = offsetof(PgStat_MsgTabpurge, m_tableid[0])
1156  + msg.m_nentries * sizeof(Oid);
1157 
1159  msg.m_databaseid = MyDatabaseId;
1160  pgstat_send(&msg, len);
1161  }
1162 
1163  /* Clean up */
1164  hash_destroy(htab);
1165 
1166  /*
1167  * Now repeat the above steps for functions. However, we needn't bother
1168  * in the common case where no function stats are being collected.
1169  */
1170  if (dbentry->functions != NULL &&
1171  hash_get_num_entries(dbentry->functions) > 0)
1172  {
1173  htab = pgstat_collect_oids(ProcedureRelationId, Anum_pg_proc_oid);
1174 
1176  f_msg.m_databaseid = MyDatabaseId;
1177  f_msg.m_nentries = 0;
1178 
1179  hash_seq_init(&hstat, dbentry->functions);
1180  while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&hstat)) != NULL)
1181  {
1182  Oid funcid = funcentry->functionid;
1183 
1185 
1186  if (hash_search(htab, (void *) &funcid, HASH_FIND, NULL) != NULL)
1187  continue;
1188 
1189  /*
1190  * Not there, so add this function's Oid to the message
1191  */
1192  f_msg.m_functionid[f_msg.m_nentries++] = funcid;
1193 
1194  /*
1195  * If the message is full, send it out and reinitialize to empty
1196  */
1197  if (f_msg.m_nentries >= PGSTAT_NUM_FUNCPURGE)
1198  {
1199  len = offsetof(PgStat_MsgFuncpurge, m_functionid[0])
1200  + f_msg.m_nentries * sizeof(Oid);
1201 
1202  pgstat_send(&f_msg, len);
1203 
1204  f_msg.m_nentries = 0;
1205  }
1206  }
1207 
1208  /*
1209  * Send the rest
1210  */
1211  if (f_msg.m_nentries > 0)
1212  {
1213  len = offsetof(PgStat_MsgFuncpurge, m_functionid[0])
1214  + f_msg.m_nentries * sizeof(Oid);
1215 
1216  pgstat_send(&f_msg, len);
1217  }
1218 
1219  hash_destroy(htab);
1220  }
1221 }
1222 
1223 
1224 /* ----------
1225  * pgstat_collect_oids() -
1226  *
1227  * Collect the OIDs of all objects listed in the specified system catalog
1228  * into a temporary hash table. Caller should hash_destroy the result
1229  * when done with it. (However, we make the table in CurrentMemoryContext
1230  * so that it will be freed properly in event of an error.)
1231  * ----------
1232  */
1233 static HTAB *
1234 pgstat_collect_oids(Oid catalogid, AttrNumber anum_oid)
1235 {
1236  HTAB *htab;
1237  HASHCTL hash_ctl;
1238  Relation rel;
1239  TableScanDesc scan;
1240  HeapTuple tup;
1241  Snapshot snapshot;
1242 
1243  memset(&hash_ctl, 0, sizeof(hash_ctl));
1244  hash_ctl.keysize = sizeof(Oid);
1245  hash_ctl.entrysize = sizeof(Oid);
1246  hash_ctl.hcxt = CurrentMemoryContext;
1247  htab = hash_create("Temporary table of OIDs",
1249  &hash_ctl,
1251 
1252  rel = table_open(catalogid, AccessShareLock);
1253  snapshot = RegisterSnapshot(GetLatestSnapshot());
1254  scan = table_beginscan(rel, snapshot, 0, NULL);
1255  while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
1256  {
1257  Oid thisoid;
1258  bool isnull;
1259 
1260  thisoid = heap_getattr(tup, anum_oid, RelationGetDescr(rel), &isnull);
1261  Assert(!isnull);
1262 
1264 
1265  (void) hash_search(htab, (void *) &thisoid, HASH_ENTER, NULL);
1266  }
1267  table_endscan(scan);
1268  UnregisterSnapshot(snapshot);
1270 
1271  return htab;
1272 }
1273 
1274 
1275 /* ----------
1276  * pgstat_drop_database() -
1277  *
1278  * Tell the collector that we just dropped a database.
1279  * (If the message gets lost, we will still clean the dead DB eventually
1280  * via future invocations of pgstat_vacuum_stat().)
1281  * ----------
1282  */
1283 void
1285 {
1286  PgStat_MsgDropdb msg;
1287 
1289  return;
1290 
1292  msg.m_databaseid = databaseid;
1293  pgstat_send(&msg, sizeof(msg));
1294 }
1295 
1296 
/* ----------
 * pgstat_drop_relation() -
 *
 *  Tell the collector that we just dropped a relation.
 *  (If the message gets lost, we will still clean the dead entry eventually
 *  via future invocations of pgstat_vacuum_stat().)
 *
 *  Currently not used for lack of any good place to call it; we rely
 *  entirely on pgstat_vacuum_stat() to clean out stats for dead rels.
 * ----------
 */
#ifdef NOT_USED
void
pgstat_drop_relation(Oid relid)
{
    PgStat_MsgTabpurge msg;
    int         len;

    /* No collector socket, nothing to send */
    if (pgStatSock == PGINVALID_SOCKET)
        return;

    msg.m_tableid[0] = relid;
    msg.m_nentries = 1;

    /* message length covers the header plus exactly one OID slot */
    len = offsetof(PgStat_MsgTabpurge, m_tableid[0]) + sizeof(Oid);

    pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
    msg.m_databaseid = MyDatabaseId;
    pgstat_send(&msg, len);
}
#endif                          /* NOT_USED */
1328 
1329 
1330 /* ----------
1331  * pgstat_reset_counters() -
1332  *
1333  * Tell the statistics collector to reset counters for our database.
1334  *
1335  * Permission checking for this function is managed through the normal
1336  * GRANT system.
1337  * ----------
1338  */
1339 void
1341 {
1343 
1345  return;
1346 
1348  msg.m_databaseid = MyDatabaseId;
1349  pgstat_send(&msg, sizeof(msg));
1350 }
1351 
1352 /* ----------
1353  * pgstat_reset_shared_counters() -
1354  *
1355  * Tell the statistics collector to reset cluster-wide shared counters.
1356  *
1357  * Permission checking for this function is managed through the normal
1358  * GRANT system.
1359  * ----------
1360  */
1361 void
1362 pgstat_reset_shared_counters(const char *target)
1363 {
1365 
1367  return;
1368 
1369  if (strcmp(target, "archiver") == 0)
1371  else if (strcmp(target, "bgwriter") == 0)
1373  else
1374  ereport(ERROR,
1375  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1376  errmsg("unrecognized reset target: \"%s\"", target),
1377  errhint("Target must be \"archiver\" or \"bgwriter\".")));
1378 
1380  pgstat_send(&msg, sizeof(msg));
1381 }
1382 
1383 /* ----------
1384  * pgstat_reset_single_counter() -
1385  *
1386  * Tell the statistics collector to reset a single counter.
1387  *
1388  * Permission checking for this function is managed through the normal
1389  * GRANT system.
1390  * ----------
1391  */
1392 void
1394 {
1396 
1398  return;
1399 
1401  msg.m_databaseid = MyDatabaseId;
1402  msg.m_resettype = type;
1403  msg.m_objectid = objoid;
1404 
1405  pgstat_send(&msg, sizeof(msg));
1406 }
1407 
1408 /* ----------
1409  * pgstat_reset_slru_counter() -
1410  *
1411  * Tell the statistics collector to reset a single SLRU counter, or all
1412  * SLRU counters (when name is null).
1413  *
1414  * Permission checking for this function is managed through the normal
1415  * GRANT system.
1416  * ----------
1417  */
1418 void
1420 {
1422 
1424  return;
1425 
1427  msg.m_index = (name) ? pgstat_slru_index(name) : -1;
1428 
1429  pgstat_send(&msg, sizeof(msg));
1430 }
1431 
1432 /* ----------
1433  * pgstat_report_autovac() -
1434  *
1435  * Called from autovacuum.c to report startup of an autovacuum process.
1436  * We are called before InitPostgres is done, so can't rely on MyDatabaseId;
1437  * the db OID must be passed in, instead.
1438  * ----------
1439  */
1440 void
1442 {
1444 
1446  return;
1447 
1449  msg.m_databaseid = dboid;
1451 
1452  pgstat_send(&msg, sizeof(msg));
1453 }
1454 
1455 
1456 /* ---------
1457  * pgstat_report_vacuum() -
1458  *
1459  * Tell the collector about the table we just vacuumed.
1460  * ---------
1461  */
1462 void
1463 pgstat_report_vacuum(Oid tableoid, bool shared,
1464  PgStat_Counter livetuples, PgStat_Counter deadtuples)
1465 {
1466  PgStat_MsgVacuum msg;
1467 
1469  return;
1470 
1472  msg.m_databaseid = shared ? InvalidOid : MyDatabaseId;
1473  msg.m_tableoid = tableoid;
1476  msg.m_live_tuples = livetuples;
1477  msg.m_dead_tuples = deadtuples;
1478  pgstat_send(&msg, sizeof(msg));
1479 }
1480 
1481 /* --------
1482  * pgstat_report_analyze() -
1483  *
1484  * Tell the collector about the table we just analyzed.
1485  *
1486  * Caller must provide new live- and dead-tuples estimates, as well as a
1487  * flag indicating whether to reset the changes_since_analyze counter.
1488  * --------
1489  */
1490 void
1492  PgStat_Counter livetuples, PgStat_Counter deadtuples,
1493  bool resetcounter)
1494 {
1495  PgStat_MsgAnalyze msg;
1496 
1498  return;
1499 
1500  /*
1501  * Unlike VACUUM, ANALYZE might be running inside a transaction that has
1502  * already inserted and/or deleted rows in the target table. ANALYZE will
1503  * have counted such rows as live or dead respectively. Because we will
1504  * report our counts of such rows at transaction end, we should subtract
1505  * off these counts from what we send to the collector now, else they'll
1506  * be double-counted after commit. (This approach also ensures that the
1507  * collector ends up with the right numbers if we abort instead of
1508  * committing.)
1509  */
1510  if (rel->pgstat_info != NULL)
1511  {
1513 
1514  for (trans = rel->pgstat_info->trans; trans; trans = trans->upper)
1515  {
1516  livetuples -= trans->tuples_inserted - trans->tuples_deleted;
1517  deadtuples -= trans->tuples_updated + trans->tuples_deleted;
1518  }
1519  /* count stuff inserted by already-aborted subxacts, too */
1520  deadtuples -= rel->pgstat_info->t_counts.t_delta_dead_tuples;
1521  /* Since ANALYZE's counts are estimates, we could have underflowed */
1522  livetuples = Max(livetuples, 0);
1523  deadtuples = Max(deadtuples, 0);
1524  }
1525 
1527  msg.m_databaseid = rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId;
1528  msg.m_tableoid = RelationGetRelid(rel);
1530  msg.m_resetcounter = resetcounter;
1532  msg.m_live_tuples = livetuples;
1533  msg.m_dead_tuples = deadtuples;
1534  pgstat_send(&msg, sizeof(msg));
1535 }
1536 
1537 /* --------
1538  * pgstat_report_recovery_conflict() -
1539  *
1540  * Tell the collector about a Hot Standby recovery conflict.
1541  * --------
1542  */
1543 void
1545 {
1547 
1549  return;
1550 
1552  msg.m_databaseid = MyDatabaseId;
1553  msg.m_reason = reason;
1554  pgstat_send(&msg, sizeof(msg));
1555 }
1556 
1557 /* --------
1558  * pgstat_report_deadlock() -
1559  *
1560  * Tell the collector about a deadlock detected.
1561  * --------
1562  */
1563 void
1565 {
1566  PgStat_MsgDeadlock msg;
1567 
1569  return;
1570 
1572  msg.m_databaseid = MyDatabaseId;
1573  pgstat_send(&msg, sizeof(msg));
1574 }
1575 
1576 
1577 
1578 /* --------
1579  * pgstat_report_checksum_failures_in_db() -
1580  *
1581  * Tell the collector about one or more checksum failures.
1582  * --------
1583  */
1584 void
1586 {
1588 
1590  return;
1591 
1593  msg.m_databaseid = dboid;
1594  msg.m_failurecount = failurecount;
1596 
1597  pgstat_send(&msg, sizeof(msg));
1598 }
1599 
1600 /* --------
1601  * pgstat_report_checksum_failure() -
1602  *
1603  * Tell the collector about a checksum failure.
1604  * --------
1605  */
1606 void
1608 {
1610 }
1611 
1612 /* --------
1613  * pgstat_report_tempfile() -
1614  *
1615  * Tell the collector about a temporary file.
1616  * --------
1617  */
1618 void
1619 pgstat_report_tempfile(size_t filesize)
1620 {
1621  PgStat_MsgTempFile msg;
1622 
1624  return;
1625 
1627  msg.m_databaseid = MyDatabaseId;
1628  msg.m_filesize = filesize;
1629  pgstat_send(&msg, sizeof(msg));
1630 }
1631 
1632 
1633 /* ----------
1634  * pgstat_ping() -
1635  *
1636  * Send some junk data to the collector to increase traffic.
1637  * ----------
1638  */
1639 void
1641 {
1642  PgStat_MsgDummy msg;
1643 
1645  return;
1646 
1648  pgstat_send(&msg, sizeof(msg));
1649 }
1650 
1651 /* ----------
1652  * pgstat_send_inquiry() -
1653  *
1654  * Notify collector that we need fresh data.
1655  * ----------
1656  */
1657 static void
1658 pgstat_send_inquiry(TimestampTz clock_time, TimestampTz cutoff_time, Oid databaseid)
1659 {
1660  PgStat_MsgInquiry msg;
1661 
1663  msg.clock_time = clock_time;
1664  msg.cutoff_time = cutoff_time;
1665  msg.databaseid = databaseid;
1666  pgstat_send(&msg, sizeof(msg));
1667 }
1668 
1669 
1670 /*
1671  * Initialize function call usage data.
1672  * Called by the executor before invoking a function.
1673  */
1674 void
1677 {
1678  PgStat_BackendFunctionEntry *htabent;
1679  bool found;
1680 
1681  if (pgstat_track_functions <= fcinfo->flinfo->fn_stats)
1682  {
1683  /* stats not wanted */
1684  fcu->fs = NULL;
1685  return;
1686  }
1687 
1688  if (!pgStatFunctions)
1689  {
1690  /* First time through - initialize function stat table */
1691  HASHCTL hash_ctl;
1692 
1693  memset(&hash_ctl, 0, sizeof(hash_ctl));
1694  hash_ctl.keysize = sizeof(Oid);
1695  hash_ctl.entrysize = sizeof(PgStat_BackendFunctionEntry);
1696  pgStatFunctions = hash_create("Function stat entries",
1698  &hash_ctl,
1699  HASH_ELEM | HASH_BLOBS);
1700  }
1701 
1702  /* Get the stats entry for this function, create if necessary */
1703  htabent = hash_search(pgStatFunctions, &fcinfo->flinfo->fn_oid,
1704  HASH_ENTER, &found);
1705  if (!found)
1706  MemSet(&htabent->f_counts, 0, sizeof(PgStat_FunctionCounts));
1707 
1708  fcu->fs = &htabent->f_counts;
1709 
1710  /* save stats for this function, later used to compensate for recursion */
1711  fcu->save_f_total_time = htabent->f_counts.f_total_time;
1712 
1713  /* save current backend-wide total time */
1714  fcu->save_total = total_func_time;
1715 
1716  /* get clock time as of function start */
1718 }
1719 
1720 /*
1721  * find_funcstat_entry - find any existing PgStat_BackendFunctionEntry entry
1722  * for specified function
1723  *
1724  * If no entry, return NULL, don't create a new one
1725  */
1728 {
1729  if (pgStatFunctions == NULL)
1730  return NULL;
1731 
1732  return (PgStat_BackendFunctionEntry *) hash_search(pgStatFunctions,
1733  (void *) &func_id,
1734  HASH_FIND, NULL);
1735 }
1736 
1737 /*
1738  * Calculate function call usage and update stat counters.
1739  * Called by the executor after invoking a function.
1740  *
1741  * In the case of a set-returning function that runs in value-per-call mode,
1742  * we will see multiple pgstat_init_function_usage/pgstat_end_function_usage
1743  * calls for what the user considers a single call of the function. The
1744  * finalize flag should be TRUE on the last call.
1745  */
1746 void
1748 {
1749  PgStat_FunctionCounts *fs = fcu->fs;
1750  instr_time f_total;
1751  instr_time f_others;
1752  instr_time f_self;
1753 
1754  /* stats not wanted? */
1755  if (fs == NULL)
1756  return;
1757 
1758  /* total elapsed time in this function call */
1759  INSTR_TIME_SET_CURRENT(f_total);
1760  INSTR_TIME_SUBTRACT(f_total, fcu->f_start);
1761 
1762  /* self usage: elapsed minus anything already charged to other calls */
1763  f_others = total_func_time;
1764  INSTR_TIME_SUBTRACT(f_others, fcu->save_total);
1765  f_self = f_total;
1766  INSTR_TIME_SUBTRACT(f_self, f_others);
1767 
1768  /* update backend-wide total time */
1770 
1771  /*
1772  * Compute the new f_total_time as the total elapsed time added to the
1773  * pre-call value of f_total_time. This is necessary to avoid
1774  * double-counting any time taken by recursive calls of myself. (We do
1775  * not need any similar kluge for self time, since that already excludes
1776  * any recursive calls.)
1777  */
1778  INSTR_TIME_ADD(f_total, fcu->save_f_total_time);
1779 
1780  /* update counters in function stats table */
1781  if (finalize)
1782  fs->f_numcalls++;
1783  fs->f_total_time = f_total;
1784  INSTR_TIME_ADD(fs->f_self_time, f_self);
1785 
1786  /* indicate that we have something to send */
1787  have_function_stats = true;
1788 }
1789 
1790 
1791 /* ----------
1792  * pgstat_initstats() -
1793  *
1794  * Initialize a relcache entry to count access statistics.
1795  * Called whenever a relation is opened.
1796  *
1797  * We assume that a relcache entry's pgstat_info field is zeroed by
1798  * relcache.c when the relcache entry is made; thereafter it is long-lived
1799  * data. We can avoid repeated searches of the TabStatus arrays when the
1800  * same relation is touched repeatedly within a transaction.
1801  * ----------
1802  */
1803 void
1805 {
1806  Oid rel_id = rel->rd_id;
1807  char relkind = rel->rd_rel->relkind;
1808 
1809  /* We only count stats for things that have storage */
1810  if (!RELKIND_HAS_STORAGE(relkind))
1811  {
1812  rel->pgstat_info = NULL;
1813  return;
1814  }
1815 
1817  {
1818  /* We're not counting at all */
1819  rel->pgstat_info = NULL;
1820  return;
1821  }
1822 
1823  /*
1824  * If we already set up this relation in the current transaction, nothing
1825  * to do.
1826  */
1827  if (rel->pgstat_info != NULL &&
1828  rel->pgstat_info->t_id == rel_id)
1829  return;
1830 
1831  /* Else find or make the PgStat_TableStatus entry, and update link */
1832  rel->pgstat_info = get_tabstat_entry(rel_id, rel->rd_rel->relisshared);
1833 }
1834 
1835 /*
1836  * get_tabstat_entry - find or create a PgStat_TableStatus entry for rel
1837  */
1838 static PgStat_TableStatus *
1839 get_tabstat_entry(Oid rel_id, bool isshared)
1840 {
1841  TabStatHashEntry *hash_entry;
1842  PgStat_TableStatus *entry;
1843  TabStatusArray *tsa;
1844  bool found;
1845 
1846  /*
1847  * Create hash table if we don't have it already.
1848  */
1849  if (pgStatTabHash == NULL)
1850  {
1851  HASHCTL ctl;
1852 
1853  memset(&ctl, 0, sizeof(ctl));
1854  ctl.keysize = sizeof(Oid);
1855  ctl.entrysize = sizeof(TabStatHashEntry);
1856 
1857  pgStatTabHash = hash_create("pgstat TabStatusArray lookup hash table",
1859  &ctl,
1860  HASH_ELEM | HASH_BLOBS);
1861  }
1862 
1863  /*
1864  * Find an entry or create a new one.
1865  */
1866  hash_entry = hash_search(pgStatTabHash, &rel_id, HASH_ENTER, &found);
1867  if (!found)
1868  {
1869  /* initialize new entry with null pointer */
1870  hash_entry->tsa_entry = NULL;
1871  }
1872 
1873  /*
1874  * If entry is already valid, we're done.
1875  */
1876  if (hash_entry->tsa_entry)
1877  return hash_entry->tsa_entry;
1878 
1879  /*
1880  * Locate the first pgStatTabList entry with free space, making a new list
1881  * entry if needed. Note that we could get an OOM failure here, but if so
1882  * we have left the hashtable and the list in a consistent state.
1883  */
1884  if (pgStatTabList == NULL)
1885  {
1886  /* Set up first pgStatTabList entry */
1887  pgStatTabList = (TabStatusArray *)
1889  sizeof(TabStatusArray));
1890  }
1891 
1892  tsa = pgStatTabList;
1893  while (tsa->tsa_used >= TABSTAT_QUANTUM)
1894  {
1895  if (tsa->tsa_next == NULL)
1896  tsa->tsa_next = (TabStatusArray *)
1898  sizeof(TabStatusArray));
1899  tsa = tsa->tsa_next;
1900  }
1901 
1902  /*
1903  * Allocate a PgStat_TableStatus entry within this list entry. We assume
1904  * the entry was already zeroed, either at creation or after last use.
1905  */
1906  entry = &tsa->tsa_entries[tsa->tsa_used++];
1907  entry->t_id = rel_id;
1908  entry->t_shared = isshared;
1909 
1910  /*
1911  * Now we can fill the entry in pgStatTabHash.
1912  */
1913  hash_entry->tsa_entry = entry;
1914 
1915  return entry;
1916 }
1917 
1918 /*
1919  * find_tabstat_entry - find any existing PgStat_TableStatus entry for rel
1920  *
1921  * If no entry, return NULL, don't create a new one
1922  *
1923  * Note: if we got an error in the most recent execution of pgstat_report_stat,
1924  * it's possible that an entry exists but there's no hashtable entry for it.
1925  * That's okay, we'll treat this case as "doesn't exist".
1926  */
1929 {
1930  TabStatHashEntry *hash_entry;
1931 
1932  /* If hashtable doesn't exist, there are no entries at all */
1933  if (!pgStatTabHash)
1934  return NULL;
1935 
1936  hash_entry = hash_search(pgStatTabHash, &rel_id, HASH_FIND, NULL);
1937  if (!hash_entry)
1938  return NULL;
1939 
1940  /* Note that this step could also return NULL, but that's correct */
1941  return hash_entry->tsa_entry;
1942 }
1943 
1944 /*
1945  * get_tabstat_stack_level - add a new (sub)transaction stack entry if needed
1946  */
1947 static PgStat_SubXactStatus *
1949 {
1950  PgStat_SubXactStatus *xact_state;
1951 
1952  xact_state = pgStatXactStack;
1953  if (xact_state == NULL || xact_state->nest_level != nest_level)
1954  {
1955  xact_state = (PgStat_SubXactStatus *)
1957  sizeof(PgStat_SubXactStatus));
1958  xact_state->nest_level = nest_level;
1959  xact_state->prev = pgStatXactStack;
1960  xact_state->first = NULL;
1961  pgStatXactStack = xact_state;
1962  }
1963  return xact_state;
1964 }
1965 
1966 /*
1967  * add_tabstat_xact_level - add a new (sub)transaction state record
1968  */
1969 static void
1970 add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level)
1971 {
1972  PgStat_SubXactStatus *xact_state;
1974 
1975  /*
1976  * If this is the first rel to be modified at the current nest level, we
1977  * first have to push a transaction stack entry.
1978  */
1979  xact_state = get_tabstat_stack_level(nest_level);
1980 
1981  /* Now make a per-table stack entry */
1982  trans = (PgStat_TableXactStatus *)
1984  sizeof(PgStat_TableXactStatus));
1985  trans->nest_level = nest_level;
1986  trans->upper = pgstat_info->trans;
1987  trans->parent = pgstat_info;
1988  trans->next = xact_state->first;
1989  xact_state->first = trans;
1990  pgstat_info->trans = trans;
1991 }
1992 
1993 /*
1994  * pgstat_count_heap_insert - count a tuple insertion of n tuples
1995  */
1996 void
1998 {
1999  PgStat_TableStatus *pgstat_info = rel->pgstat_info;
2000 
2001  if (pgstat_info != NULL)
2002  {
2003  /* We have to log the effect at the proper transactional level */
2004  int nest_level = GetCurrentTransactionNestLevel();
2005 
2006  if (pgstat_info->trans == NULL ||
2007  pgstat_info->trans->nest_level != nest_level)
2008  add_tabstat_xact_level(pgstat_info, nest_level);
2009 
2010  pgstat_info->trans->tuples_inserted += n;
2011  }
2012 }
2013 
2014 /*
2015  * pgstat_count_heap_update - count a tuple update
2016  */
2017 void
2019 {
2020  PgStat_TableStatus *pgstat_info = rel->pgstat_info;
2021 
2022  if (pgstat_info != NULL)
2023  {
2024  /* We have to log the effect at the proper transactional level */
2025  int nest_level = GetCurrentTransactionNestLevel();
2026 
2027  if (pgstat_info->trans == NULL ||
2028  pgstat_info->trans->nest_level != nest_level)
2029  add_tabstat_xact_level(pgstat_info, nest_level);
2030 
2031  pgstat_info->trans->tuples_updated++;
2032 
2033  /* t_tuples_hot_updated is nontransactional, so just advance it */
2034  if (hot)
2035  pgstat_info->t_counts.t_tuples_hot_updated++;
2036  }
2037 }
2038 
2039 /*
2040  * pgstat_count_heap_delete - count a tuple deletion
2041  */
2042 void
2044 {
2045  PgStat_TableStatus *pgstat_info = rel->pgstat_info;
2046 
2047  if (pgstat_info != NULL)
2048  {
2049  /* We have to log the effect at the proper transactional level */
2050  int nest_level = GetCurrentTransactionNestLevel();
2051 
2052  if (pgstat_info->trans == NULL ||
2053  pgstat_info->trans->nest_level != nest_level)
2054  add_tabstat_xact_level(pgstat_info, nest_level);
2055 
2056  pgstat_info->trans->tuples_deleted++;
2057  }
2058 }
2059 
2060 /*
2061  * pgstat_truncate_save_counters
2062  *
2063  * Whenever a table is truncated, we save its i/u/d counters so that they can
2064  * be cleared, and if the (sub)xact that executed the truncate later aborts,
2065  * the counters can be restored to the saved (pre-truncate) values. Note we do
2066  * this on the first truncate in any particular subxact level only.
2067  */
2068 static void
2070 {
2071  if (!trans->truncated)
2072  {
2073  trans->inserted_pre_trunc = trans->tuples_inserted;
2074  trans->updated_pre_trunc = trans->tuples_updated;
2075  trans->deleted_pre_trunc = trans->tuples_deleted;
2076  trans->truncated = true;
2077  }
2078 }
2079 
2080 /*
2081  * pgstat_truncate_restore_counters - restore counters when a truncate aborts
2082  */
2083 static void
2085 {
2086  if (trans->truncated)
2087  {
2088  trans->tuples_inserted = trans->inserted_pre_trunc;
2089  trans->tuples_updated = trans->updated_pre_trunc;
2090  trans->tuples_deleted = trans->deleted_pre_trunc;
2091  }
2092 }
2093 
2094 /*
2095  * pgstat_count_truncate - update tuple counters due to truncate
2096  */
2097 void
2099 {
2100  PgStat_TableStatus *pgstat_info = rel->pgstat_info;
2101 
2102  if (pgstat_info != NULL)
2103  {
2104  /* We have to log the effect at the proper transactional level */
2105  int nest_level = GetCurrentTransactionNestLevel();
2106 
2107  if (pgstat_info->trans == NULL ||
2108  pgstat_info->trans->nest_level != nest_level)
2109  add_tabstat_xact_level(pgstat_info, nest_level);
2110 
2111  pgstat_truncate_save_counters(pgstat_info->trans);
2112  pgstat_info->trans->tuples_inserted = 0;
2113  pgstat_info->trans->tuples_updated = 0;
2114  pgstat_info->trans->tuples_deleted = 0;
2115  }
2116 }
2117 
2118 /*
2119  * pgstat_update_heap_dead_tuples - update dead-tuples count
2120  *
2121  * The semantics of this are that we are reporting the nontransactional
2122  * recovery of "delta" dead tuples; so t_delta_dead_tuples decreases
2123  * rather than increasing, and the change goes straight into the per-table
2124  * counter, not into transactional state.
2125  */
2126 void
2128 {
2129  PgStat_TableStatus *pgstat_info = rel->pgstat_info;
2130 
2131  if (pgstat_info != NULL)
2132  pgstat_info->t_counts.t_delta_dead_tuples -= delta;
2133 }
2134 
2135 
2136 /* ----------
2137  * AtEOXact_PgStat
2138  *
2139  * Called from access/transam/xact.c at top-level transaction commit/abort.
2140  * ----------
2141  */
2142 void
2143 AtEOXact_PgStat(bool isCommit, bool parallel)
2144 {
2145  PgStat_SubXactStatus *xact_state;
2146 
2147  /* Don't count parallel worker transaction stats */
2148  if (!parallel)
2149  {
2150  /*
2151  * Count transaction commit or abort. (We use counters, not just
2152  * bools, in case the reporting message isn't sent right away.)
2153  */
2154  if (isCommit)
2155  pgStatXactCommit++;
2156  else
2158  }
2159 
2160  /*
2161  * Transfer transactional insert/update counts into the base tabstat
2162  * entries. We don't bother to free any of the transactional state, since
2163  * it's all in TopTransactionContext and will go away anyway.
2164  */
2165  xact_state = pgStatXactStack;
2166  if (xact_state != NULL)
2167  {
2169 
2170  Assert(xact_state->nest_level == 1);
2171  Assert(xact_state->prev == NULL);
2172  for (trans = xact_state->first; trans != NULL; trans = trans->next)
2173  {
2174  PgStat_TableStatus *tabstat;
2175 
2176  Assert(trans->nest_level == 1);
2177  Assert(trans->upper == NULL);
2178  tabstat = trans->parent;
2179  Assert(tabstat->trans == trans);
2180  /* restore pre-truncate stats (if any) in case of aborted xact */
2181  if (!isCommit)
2183  /* count attempted actions regardless of commit/abort */
2184  tabstat->t_counts.t_tuples_inserted += trans->tuples_inserted;
2185  tabstat->t_counts.t_tuples_updated += trans->tuples_updated;
2186  tabstat->t_counts.t_tuples_deleted += trans->tuples_deleted;
2187  if (isCommit)
2188  {
2189  tabstat->t_counts.t_truncated = trans->truncated;
2190  if (trans->truncated)
2191  {
2192  /* forget live/dead stats seen by backend thus far */
2193  tabstat->t_counts.t_delta_live_tuples = 0;
2194  tabstat->t_counts.t_delta_dead_tuples = 0;
2195  }
2196  /* insert adds a live tuple, delete removes one */
2197  tabstat->t_counts.t_delta_live_tuples +=
2198  trans->tuples_inserted - trans->tuples_deleted;
2199  /* update and delete each create a dead tuple */
2200  tabstat->t_counts.t_delta_dead_tuples +=
2201  trans->tuples_updated + trans->tuples_deleted;
2202  /* insert, update, delete each count as one change event */
2203  tabstat->t_counts.t_changed_tuples +=
2204  trans->tuples_inserted + trans->tuples_updated +
2205  trans->tuples_deleted;
2206  }
2207  else
2208  {
2209  /* inserted tuples are dead, deleted tuples are unaffected */
2210  tabstat->t_counts.t_delta_dead_tuples +=
2211  trans->tuples_inserted + trans->tuples_updated;
2212  /* an aborted xact generates no changed_tuple events */
2213  }
2214  tabstat->trans = NULL;
2215  }
2216  }
2217  pgStatXactStack = NULL;
2218 
2219  /* Make sure any stats snapshot is thrown away */
2221 }
2222 
2223 /* ----------
2224  * AtEOSubXact_PgStat
2225  *
2226  * Called from access/transam/xact.c at subtransaction commit/abort.
2227  * ----------
2228  */
2229 void
2230 AtEOSubXact_PgStat(bool isCommit, int nestDepth)
2231 {
2232  PgStat_SubXactStatus *xact_state;
2233 
2234  /*
2235  * Transfer transactional insert/update counts into the next higher
2236  * subtransaction state.
2237  */
2238  xact_state = pgStatXactStack;
2239  if (xact_state != NULL &&
2240  xact_state->nest_level >= nestDepth)
2241  {
2243  PgStat_TableXactStatus *next_trans;
2244 
2245  /* delink xact_state from stack immediately to simplify reuse case */
2246  pgStatXactStack = xact_state->prev;
2247 
2248  for (trans = xact_state->first; trans != NULL; trans = next_trans)
2249  {
2250  PgStat_TableStatus *tabstat;
2251 
2252  next_trans = trans->next;
2253  Assert(trans->nest_level == nestDepth);
2254  tabstat = trans->parent;
2255  Assert(tabstat->trans == trans);
2256  if (isCommit)
2257  {
2258  if (trans->upper && trans->upper->nest_level == nestDepth - 1)
2259  {
2260  if (trans->truncated)
2261  {
2262  /* propagate the truncate status one level up */
2264  /* replace upper xact stats with ours */
2265  trans->upper->tuples_inserted = trans->tuples_inserted;
2266  trans->upper->tuples_updated = trans->tuples_updated;
2267  trans->upper->tuples_deleted = trans->tuples_deleted;
2268  }
2269  else
2270  {
2271  trans->upper->tuples_inserted += trans->tuples_inserted;
2272  trans->upper->tuples_updated += trans->tuples_updated;
2273  trans->upper->tuples_deleted += trans->tuples_deleted;
2274  }
2275  tabstat->trans = trans->upper;
2276  pfree(trans);
2277  }
2278  else
2279  {
2280  /*
2281  * When there isn't an immediate parent state, we can just
2282  * reuse the record instead of going through a
2283  * palloc/pfree pushup (this works since it's all in
2284  * TopTransactionContext anyway). We have to re-link it
2285  * into the parent level, though, and that might mean
2286  * pushing a new entry into the pgStatXactStack.
2287  */
2288  PgStat_SubXactStatus *upper_xact_state;
2289 
2290  upper_xact_state = get_tabstat_stack_level(nestDepth - 1);
2291  trans->next = upper_xact_state->first;
2292  upper_xact_state->first = trans;
2293  trans->nest_level = nestDepth - 1;
2294  }
2295  }
2296  else
2297  {
2298  /*
2299  * On abort, update top-level tabstat counts, then forget the
2300  * subtransaction
2301  */
2302 
2303  /* first restore values obliterated by truncate */
2305  /* count attempted actions regardless of commit/abort */
2306  tabstat->t_counts.t_tuples_inserted += trans->tuples_inserted;
2307  tabstat->t_counts.t_tuples_updated += trans->tuples_updated;
2308  tabstat->t_counts.t_tuples_deleted += trans->tuples_deleted;
2309  /* inserted tuples are dead, deleted tuples are unaffected */
2310  tabstat->t_counts.t_delta_dead_tuples +=
2311  trans->tuples_inserted + trans->tuples_updated;
2312  tabstat->trans = trans->upper;
2313  pfree(trans);
2314  }
2315  }
2316  pfree(xact_state);
2317  }
2318 }
2319 
2320 
2321 /*
2322  * AtPrepare_PgStat
2323  * Save the transactional stats state at 2PC transaction prepare.
2324  *
2325  * In this phase we just generate 2PC records for all the pending
2326  * transaction-dependent stats work.
2327  */
2328 void
2330 {
2331  PgStat_SubXactStatus *xact_state;
2332 
2333  xact_state = pgStatXactStack;
2334  if (xact_state != NULL)
2335  {
2337 
2338  Assert(xact_state->nest_level == 1);
2339  Assert(xact_state->prev == NULL);
2340  for (trans = xact_state->first; trans != NULL; trans = trans->next)
2341  {
2342  PgStat_TableStatus *tabstat;
2343  TwoPhasePgStatRecord record;
2344 
2345  Assert(trans->nest_level == 1);
2346  Assert(trans->upper == NULL);
2347  tabstat = trans->parent;
2348  Assert(tabstat->trans == trans);
2349 
2350  record.tuples_inserted = trans->tuples_inserted;
2351  record.tuples_updated = trans->tuples_updated;
2352  record.tuples_deleted = trans->tuples_deleted;
2353  record.inserted_pre_trunc = trans->inserted_pre_trunc;
2354  record.updated_pre_trunc = trans->updated_pre_trunc;
2355  record.deleted_pre_trunc = trans->deleted_pre_trunc;
2356  record.t_id = tabstat->t_id;
2357  record.t_shared = tabstat->t_shared;
2358  record.t_truncated = trans->truncated;
2359 
2361  &record, sizeof(TwoPhasePgStatRecord));
2362  }
2363  }
2364 }
2365 
2366 /*
2367  * PostPrepare_PgStat
2368  * Clean up after successful PREPARE.
2369  *
2370  * All we need do here is unlink the transaction stats state from the
2371  * nontransactional state. The nontransactional action counts will be
2372  * reported to the stats collector immediately, while the effects on live
2373  * and dead tuple counts are preserved in the 2PC state file.
2374  *
2375  * Note: AtEOXact_PgStat is not called during PREPARE.
2376  */
2377 void
2379 {
2380  PgStat_SubXactStatus *xact_state;
2381 
2382  /*
2383  * We don't bother to free any of the transactional state, since it's all
2384  * in TopTransactionContext and will go away anyway.
2385  */
2386  xact_state = pgStatXactStack;
2387  if (xact_state != NULL)
2388  {
2390 
2391  for (trans = xact_state->first; trans != NULL; trans = trans->next)
2392  {
2393  PgStat_TableStatus *tabstat;
2394 
2395  tabstat = trans->parent;
2396  tabstat->trans = NULL;
2397  }
2398  }
2399  pgStatXactStack = NULL;
2400 
2401  /* Make sure any stats snapshot is thrown away */
2403 }
2404 
2405 /*
2406  * 2PC processing routine for COMMIT PREPARED case.
2407  *
2408  * Load the saved counts into our local pgstats state.
2409  */
2410 void
2412  void *recdata, uint32 len)
2413 {
2414  TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
2415  PgStat_TableStatus *pgstat_info;
2416 
2417  /* Find or create a tabstat entry for the rel */
2418  pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared);
2419 
2420  /* Same math as in AtEOXact_PgStat, commit case */
2421  pgstat_info->t_counts.t_tuples_inserted += rec->tuples_inserted;
2422  pgstat_info->t_counts.t_tuples_updated += rec->tuples_updated;
2423  pgstat_info->t_counts.t_tuples_deleted += rec->tuples_deleted;
2424  pgstat_info->t_counts.t_truncated = rec->t_truncated;
2425  if (rec->t_truncated)
2426  {
2427  /* forget live/dead stats seen by backend thus far */
2428  pgstat_info->t_counts.t_delta_live_tuples = 0;
2429  pgstat_info->t_counts.t_delta_dead_tuples = 0;
2430  }
2431  pgstat_info->t_counts.t_delta_live_tuples +=
2432  rec->tuples_inserted - rec->tuples_deleted;
2433  pgstat_info->t_counts.t_delta_dead_tuples +=
2434  rec->tuples_updated + rec->tuples_deleted;
2435  pgstat_info->t_counts.t_changed_tuples +=
2436  rec->tuples_inserted + rec->tuples_updated +
2437  rec->tuples_deleted;
2438 }
2439 
2440 /*
2441  * 2PC processing routine for ROLLBACK PREPARED case.
2442  *
2443  * Load the saved counts into our local pgstats state, but treat them
2444  * as aborted.
2445  */
2446 void
2448  void *recdata, uint32 len)
2449 {
2450  TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
2451  PgStat_TableStatus *pgstat_info;
2452 
2453  /* Find or create a tabstat entry for the rel */
2454  pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared);
2455 
2456  /* Same math as in AtEOXact_PgStat, abort case */
2457  if (rec->t_truncated)
2458  {
2459  rec->tuples_inserted = rec->inserted_pre_trunc;
2460  rec->tuples_updated = rec->updated_pre_trunc;
2461  rec->tuples_deleted = rec->deleted_pre_trunc;
2462  }
2463  pgstat_info->t_counts.t_tuples_inserted += rec->tuples_inserted;
2464  pgstat_info->t_counts.t_tuples_updated += rec->tuples_updated;
2465  pgstat_info->t_counts.t_tuples_deleted += rec->tuples_deleted;
2466  pgstat_info->t_counts.t_delta_dead_tuples +=
2467  rec->tuples_inserted + rec->tuples_updated;
2468 }
2469 
2470 
2471 /* ----------
2472  * pgstat_fetch_stat_dbentry() -
2473  *
2474  * Support function for the SQL-callable pgstat* functions. Returns
2475  * the collected statistics for one database or NULL. NULL doesn't mean
2476  * that the database doesn't exist, it is just not yet known by the
2477  * collector, so the caller is better off to report ZERO instead.
2478  * ----------
2479  */
2482 {
2483  /*
2484  * If not done for this transaction, read the statistics collector stats
2485  * file into some hash tables.
2486  */
2488 
2489  /*
2490  * Lookup the requested database; return NULL if not found
2491  */
2492  return (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
2493  (void *) &dbid,
2494  HASH_FIND, NULL);
2495 }
2496 
2497 
2498 /* ----------
2499  * pgstat_fetch_stat_tabentry() -
2500  *
2501  * Support function for the SQL-callable pgstat* functions. Returns
2502  * the collected statistics for one table or NULL. NULL doesn't mean
2503  * that the table doesn't exist, it is just not yet known by the
2504  * collector, so the caller is better off to report ZERO instead.
2505  * ----------
2506  */
2509 {
2510  Oid dbid;
2511  PgStat_StatDBEntry *dbentry;
2512  PgStat_StatTabEntry *tabentry;
2513 
2514  /*
2515  * If not done for this transaction, read the statistics collector stats
2516  * file into some hash tables.
2517  */
2519 
2520  /*
2521  * Lookup our database, then look in its table hash table.
2522  */
2523  dbid = MyDatabaseId;
2524  dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
2525  (void *) &dbid,
2526  HASH_FIND, NULL);
2527  if (dbentry != NULL && dbentry->tables != NULL)
2528  {
2529  tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
2530  (void *) &relid,
2531  HASH_FIND, NULL);
2532  if (tabentry)
2533  return tabentry;
2534  }
2535 
2536  /*
2537  * If we didn't find it, maybe it's a shared table.
2538  */
2539  dbid = InvalidOid;
2540  dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
2541  (void *) &dbid,
2542  HASH_FIND, NULL);
2543  if (dbentry != NULL && dbentry->tables != NULL)
2544  {
2545  tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
2546  (void *) &relid,
2547  HASH_FIND, NULL);
2548  if (tabentry)
2549  return tabentry;
2550  }
2551 
2552  return NULL;
2553 }
2554 
2555 
2556 /* ----------
2557  * pgstat_fetch_stat_funcentry() -
2558  *
2559  * Support function for the SQL-callable pgstat* functions. Returns
2560  * the collected statistics for one function or NULL.
2561  * ----------
2562  */
2565 {
2566  PgStat_StatDBEntry *dbentry;
2567  PgStat_StatFuncEntry *funcentry = NULL;
2568 
2569  /* load the stats file if needed */
2571 
2572  /* Lookup our database, then find the requested function. */
2574  if (dbentry != NULL && dbentry->functions != NULL)
2575  {
2576  funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions,
2577  (void *) &func_id,
2578  HASH_FIND, NULL);
2579  }
2580 
2581  return funcentry;
2582 }
2583 
2584 
2585 /* ----------
2586  * pgstat_fetch_stat_beentry() -
2587  *
2588  * Support function for the SQL-callable pgstat* functions. Returns
2589  * our local copy of the current-activity entry for one backend.
2590  *
2591  * NB: caller is responsible for a check if the user is permitted to see
2592  * this info (especially the querystring).
2593  * ----------
2594  */
2597 {
2599 
2600  if (beid < 1 || beid > localNumBackends)
2601  return NULL;
2602 
2603  return &localBackendStatusTable[beid - 1].backendStatus;
2604 }
2605 
2606 
2607 /* ----------
2608  * pgstat_fetch_stat_local_beentry() -
2609  *
2610  * Like pgstat_fetch_stat_beentry() but with locally computed additions (like
2611  * xid and xmin values of the backend)
2612  *
2613  * NB: caller is responsible for a check if the user is permitted to see
2614  * this info (especially the querystring).
2615  * ----------
2616  */
2619 {
2621 
2622  if (beid < 1 || beid > localNumBackends)
2623  return NULL;
2624 
2625  return &localBackendStatusTable[beid - 1];
2626 }
2627 
2628 
2629 /* ----------
2630  * pgstat_fetch_stat_numbackends() -
2631  *
2632  * Support function for the SQL-callable pgstat* functions. Returns
2633  * the maximum current backend id.
2634  * ----------
2635  */
2636 int
2638 {
2640 
2641  return localNumBackends;
2642 }
2643 
2644 /*
2645  * ---------
2646  * pgstat_fetch_stat_archiver() -
2647  *
2648  * Support function for the SQL-callable pgstat* functions. Returns
2649  * a pointer to the archiver statistics struct.
2650  * ---------
2651  */
2654 {
2656 
2657  return &archiverStats;
2658 }
2659 
2660 
2661 /*
2662  * ---------
2663  * pgstat_fetch_global() -
2664  *
2665  * Support function for the SQL-callable pgstat* functions. Returns
2666  * a pointer to the global statistics struct.
2667  * ---------
2668  */
2671 {
2673 
2674  return &globalStats;
2675 }
2676 
2677 
2678 /*
2679  * ---------
2680  * pgstat_fetch_slru() -
2681  *
2682  * Support function for the SQL-callable pgstat* functions. Returns
2683  * a pointer to the slru statistics struct.
2684  * ---------
2685  */
2688 {
2690 
2691  return slruStats;
2692 }
2693 
2694 
2695 /* ------------------------------------------------------------
2696  * Functions for management of the shared-memory PgBackendStatus array
2697  * ------------------------------------------------------------
2698  */
2699 
2702 static char *BackendAppnameBuffer = NULL;
2703 static char *BackendClientHostnameBuffer = NULL;
2704 static char *BackendActivityBuffer = NULL;
2706 #ifdef USE_SSL
2707 static PgBackendSSLStatus *BackendSslStatusBuffer = NULL;
2708 #endif
2709 #ifdef ENABLE_GSS
2710 static PgBackendGSSStatus *BackendGssStatusBuffer = NULL;
2711 #endif
2712 
2713 
2714 /*
2715  * Report shared-memory space needed by CreateSharedBackendStatus.
2716  */
2717 Size
2719 {
2720  Size size;
2721 
2722  /* BackendStatusArray: */
2723  size = mul_size(sizeof(PgBackendStatus), NumBackendStatSlots);
2724  /* BackendAppnameBuffer: */
2725  size = add_size(size,
2727  /* BackendClientHostnameBuffer: */
2728  size = add_size(size,
2730  /* BackendActivityBuffer: */
2731  size = add_size(size,
2733 #ifdef USE_SSL
2734  /* BackendSslStatusBuffer: */
2735  size = add_size(size,
2737 #endif
2738  return size;
2739 }
2740 
2741 /*
2742  * Initialize the shared status array and several string buffers
2743  * during postmaster startup.
2744  */
2745 void
2747 {
2748  Size size;
2749  bool found;
2750  int i;
2751  char *buffer;
2752 
2753  /* Create or attach to the shared array */
2754  size = mul_size(sizeof(PgBackendStatus), NumBackendStatSlots);
2755  BackendStatusArray = (PgBackendStatus *)
2756  ShmemInitStruct("Backend Status Array", size, &found);
2757 
2758  if (!found)
2759  {
2760  /*
2761  * We're the first - initialize.
2762  */
2763  MemSet(BackendStatusArray, 0, size);
2764  }
2765 
2766  /* Create or attach to the shared appname buffer */
2768  BackendAppnameBuffer = (char *)
2769  ShmemInitStruct("Backend Application Name Buffer", size, &found);
2770 
2771  if (!found)
2772  {
2773  MemSet(BackendAppnameBuffer, 0, size);
2774 
2775  /* Initialize st_appname pointers. */
2776  buffer = BackendAppnameBuffer;
2777  for (i = 0; i < NumBackendStatSlots; i++)
2778  {
2779  BackendStatusArray[i].st_appname = buffer;
2780  buffer += NAMEDATALEN;
2781  }
2782  }
2783 
2784  /* Create or attach to the shared client hostname buffer */
2786  BackendClientHostnameBuffer = (char *)
2787  ShmemInitStruct("Backend Client Host Name Buffer", size, &found);
2788 
2789  if (!found)
2790  {
2792 
2793  /* Initialize st_clienthostname pointers. */
2794  buffer = BackendClientHostnameBuffer;
2795  for (i = 0; i < NumBackendStatSlots; i++)
2796  {
2797  BackendStatusArray[i].st_clienthostname = buffer;
2798  buffer += NAMEDATALEN;
2799  }
2800  }
2801 
2802  /* Create or attach to the shared activity buffer */
2805  BackendActivityBuffer = (char *)
2806  ShmemInitStruct("Backend Activity Buffer",
2808  &found);
2809 
2810  if (!found)
2811  {
2813 
2814  /* Initialize st_activity pointers. */
2815  buffer = BackendActivityBuffer;
2816  for (i = 0; i < NumBackendStatSlots; i++)
2817  {
2818  BackendStatusArray[i].st_activity_raw = buffer;
2820  }
2821  }
2822 
2823 #ifdef USE_SSL
2824  /* Create or attach to the shared SSL status buffer */
2826  BackendSslStatusBuffer = (PgBackendSSLStatus *)
2827  ShmemInitStruct("Backend SSL Status Buffer", size, &found);
2828 
2829  if (!found)
2830  {
2831  PgBackendSSLStatus *ptr;
2832 
2833  MemSet(BackendSslStatusBuffer, 0, size);
2834 
2835  /* Initialize st_sslstatus pointers. */
2836  ptr = BackendSslStatusBuffer;
2837  for (i = 0; i < NumBackendStatSlots; i++)
2838  {
2839  BackendStatusArray[i].st_sslstatus = ptr;
2840  ptr++;
2841  }
2842  }
2843 #endif
2844 
2845 #ifdef ENABLE_GSS
2846  /* Create or attach to the shared GSSAPI status buffer */
2848  BackendGssStatusBuffer = (PgBackendGSSStatus *)
2849  ShmemInitStruct("Backend GSS Status Buffer", size, &found);
2850 
2851  if (!found)
2852  {
2853  PgBackendGSSStatus *ptr;
2854 
2855  MemSet(BackendGssStatusBuffer, 0, size);
2856 
2857  /* Initialize st_gssstatus pointers. */
2858  ptr = BackendGssStatusBuffer;
2859  for (i = 0; i < NumBackendStatSlots; i++)
2860  {
2861  BackendStatusArray[i].st_gssstatus = ptr;
2862  ptr++;
2863  }
2864  }
2865 #endif
2866 }
2867 
2868 
2869 /* ----------
2870  * pgstat_initialize() -
2871  *
2872  * Initialize pgstats state, and set up our on-proc-exit hook.
2873  * Called from InitPostgres and AuxiliaryProcessMain. For auxiliary process,
2874  * MyBackendId is invalid. Otherwise, MyBackendId must be set,
2875  * but we must not have started any transaction yet (since the
2876  * exit hook must run after the last transaction exit).
2877  * NOTE: MyDatabaseId isn't set yet; so the shutdown hook has to be careful.
2878  * ----------
2879  */
2880 void
2882 {
2883  /* Initialize MyBEEntry */
2885  {
2887  MyBEEntry = &BackendStatusArray[MyBackendId - 1];
2888  }
2889  else
2890  {
2891  /* Must be an auxiliary process */
2893 
2894  /*
2895  * Assign the MyBEEntry for an auxiliary process. Since it doesn't
2896  * have a BackendId, the slot is statically allocated based on the
2897  * auxiliary process type (MyAuxProcType). Backends use slots indexed
2898  * in the range from 1 to MaxBackends (inclusive), so we use
2899  * MaxBackends + AuxBackendType + 1 as the index of the slot for an
2900  * auxiliary process.
2901  */
2902  MyBEEntry = &BackendStatusArray[MaxBackends + MyAuxProcType];
2903  }
2904 
2905  /* Set up a process-exit hook to clean up */
2907 }
2908 
2909 /* ----------
2910  * pgstat_bestart() -
2911  *
2912  * Initialize this backend's entry in the PgBackendStatus array.
2913  * Called from InitPostgres.
2914  *
2915  * Apart from auxiliary processes, MyBackendId, MyDatabaseId,
2916  * session userid, and application_name must be set for a
2917  * backend (hence, this cannot be combined with pgstat_initialize).
2918  * Note also that we must be inside a transaction if this isn't an aux
2919  * process, as we may need to do encoding conversion on some strings.
2920  * ----------
2921  */
2922 void
2924 {
2925  volatile PgBackendStatus *vbeentry = MyBEEntry;
2926  PgBackendStatus lbeentry;
2927 #ifdef USE_SSL
2928  PgBackendSSLStatus lsslstatus;
2929 #endif
2930 #ifdef ENABLE_GSS
2931  PgBackendGSSStatus lgssstatus;
2932 #endif
2933 
2934  /* pgstats state must be initialized from pgstat_initialize() */
2935  Assert(vbeentry != NULL);
2936 
2937  /*
2938  * To minimize the time spent modifying the PgBackendStatus entry, and
2939  * avoid risk of errors inside the critical section, we first copy the
2940  * shared-memory struct to a local variable, then modify the data in the
2941  * local variable, then copy the local variable back to shared memory.
2942  * Only the last step has to be inside the critical section.
2943  *
2944  * Most of the data we copy from shared memory is just going to be
2945  * overwritten, but the struct's not so large that it's worth the
2946  * maintenance hassle to copy only the needful fields.
2947  */
2948  memcpy(&lbeentry,
2949  unvolatize(PgBackendStatus *, vbeentry),
2950  sizeof(PgBackendStatus));
2951 
2952  /* These structs can just start from zeroes each time, though */
2953 #ifdef USE_SSL
2954  memset(&lsslstatus, 0, sizeof(lsslstatus));
2955 #endif
2956 #ifdef ENABLE_GSS
2957  memset(&lgssstatus, 0, sizeof(lgssstatus));
2958 #endif
2959 
2960  /*
2961  * Now fill in all the fields of lbeentry, except for strings that are
2962  * out-of-line data. Those have to be handled separately, below.
2963  */
2964  lbeentry.st_procpid = MyProcPid;
2965  lbeentry.st_backendType = MyBackendType;
2967  lbeentry.st_activity_start_timestamp = 0;
2968  lbeentry.st_state_start_timestamp = 0;
2969  lbeentry.st_xact_start_timestamp = 0;
2970  lbeentry.st_databaseid = MyDatabaseId;
2971 
2972  /* We have userid for client-backends, wal-sender and bgworker processes */
2973  if (lbeentry.st_backendType == B_BACKEND
2974  || lbeentry.st_backendType == B_WAL_SENDER
2975  || lbeentry.st_backendType == B_BG_WORKER)
2976  lbeentry.st_userid = GetSessionUserId();
2977  else
2978  lbeentry.st_userid = InvalidOid;
2979 
2980  /*
2981  * We may not have a MyProcPort (eg, if this is the autovacuum process).
2982  * If so, use all-zeroes client address, which is dealt with specially in
2983  * pg_stat_get_backend_client_addr and pg_stat_get_backend_client_port.
2984  */
2985  if (MyProcPort)
2986  memcpy(&lbeentry.st_clientaddr, &MyProcPort->raddr,
2987  sizeof(lbeentry.st_clientaddr));
2988  else
2989  MemSet(&lbeentry.st_clientaddr, 0, sizeof(lbeentry.st_clientaddr));
2990 
2991 #ifdef USE_SSL
2993  {
2994  lbeentry.st_ssl = true;
3002  }
3003  else
3004  {
3005  lbeentry.st_ssl = false;
3006  }
3007 #else
3008  lbeentry.st_ssl = false;
3009 #endif
3010 
3011 #ifdef ENABLE_GSS
3012  if (MyProcPort && MyProcPort->gss != NULL)
3013  {
3014  lbeentry.st_gss = true;
3015  lgssstatus.gss_auth = be_gssapi_get_auth(MyProcPort);
3016  lgssstatus.gss_enc = be_gssapi_get_enc(MyProcPort);
3017 
3018  if (lgssstatus.gss_auth)
3020  }
3021  else
3022  {
3023  lbeentry.st_gss = false;
3024  }
3025 #else
3026  lbeentry.st_gss = false;
3027 #endif
3028 
3029  lbeentry.st_state = STATE_UNDEFINED;
3032 
3033  /*
3034  * we don't zero st_progress_param here to save cycles; nobody should
3035  * examine it until st_progress_command has been set to something other
3036  * than PROGRESS_COMMAND_INVALID
3037  */
3038 
3039  /*
3040  * We're ready to enter the critical section that fills the shared-memory
3041  * status entry. We follow the protocol of bumping st_changecount before
3042  * and after; and make sure it's even afterwards. We use a volatile
3043  * pointer here to ensure the compiler doesn't try to get cute.
3044  */
3045  PGSTAT_BEGIN_WRITE_ACTIVITY(vbeentry);
3046 
3047  /* make sure we'll memcpy the same st_changecount back */
3048  lbeentry.st_changecount = vbeentry->st_changecount;
3049 
3050  memcpy(unvolatize(PgBackendStatus *, vbeentry),
3051  &lbeentry,
3052  sizeof(PgBackendStatus));
3053 
3054  /*
3055  * We can write the out-of-line strings and structs using the pointers
3056  * that are in lbeentry; this saves some de-volatilizing messiness.
3057  */
3058  lbeentry.st_appname[0] = '\0';
3061  NAMEDATALEN);
3062  else
3063  lbeentry.st_clienthostname[0] = '\0';
3064  lbeentry.st_activity_raw[0] = '\0';
3065  /* Also make sure the last byte in each string area is always 0 */
3066  lbeentry.st_appname[NAMEDATALEN - 1] = '\0';
3067  lbeentry.st_clienthostname[NAMEDATALEN - 1] = '\0';
3069 
3070 #ifdef USE_SSL
3071  memcpy(lbeentry.st_sslstatus, &lsslstatus, sizeof(PgBackendSSLStatus));
3072 #endif
3073 #ifdef ENABLE_GSS
3074  memcpy(lbeentry.st_gssstatus, &lgssstatus, sizeof(PgBackendGSSStatus));
3075 #endif
3076 
3077  PGSTAT_END_WRITE_ACTIVITY(vbeentry);
3078 
3079  /* Update app name to current GUC setting */
3080  if (application_name)
3082 }
3083 
3084 /*
3085  * Shut down a single backend's statistics reporting at process exit.
3086  *
3087  * Flush any remaining statistics counts out to the collector.
3088  * Without this, operations triggered during backend exit (such as
3089  * temp table deletions) won't be counted.
3090  *
3091  * Lastly, clear out our entry in the PgBackendStatus array.
3092  */
3093 static void
3095 {
3096  volatile PgBackendStatus *beentry = MyBEEntry;
3097 
3098  /*
3099  * If we got as far as discovering our own database ID, we can report what
3100  * we did to the collector. Otherwise, we'd be sending an invalid
3101  * database ID, so forget it. (This means that accesses to pg_database
3102  * during failed backend starts might never get counted.)
3103  */
3104  if (OidIsValid(MyDatabaseId))
3105  pgstat_report_stat(true);
3106 
3107  /*
3108  * Clear my status entry, following the protocol of bumping st_changecount
3109  * before and after. We use a volatile pointer here to ensure the
3110  * compiler doesn't try to get cute.
3111  */
3112  PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
3113 
3114  beentry->st_procpid = 0; /* mark invalid */
3115 
3116  PGSTAT_END_WRITE_ACTIVITY(beentry);
3117 }
3118 
3119 
3120 /* ----------
3121  * pgstat_report_activity() -
3122  *
3123  * Called from tcop/postgres.c to report what the backend is actually doing
3124  * (but note cmd_str can be NULL for certain cases).
3125  *
3126  * All updates of the status entry follow the protocol of bumping
3127  * st_changecount before and after. We use a volatile pointer here to
3128  * ensure the compiler doesn't try to get cute.
3129  * ----------
3130  */
3131 void
3133 {
3134  volatile PgBackendStatus *beentry = MyBEEntry;
3135  TimestampTz start_timestamp;
3136  TimestampTz current_timestamp;
3137  int len = 0;
3138 
3139  TRACE_POSTGRESQL_STATEMENT_STATUS(cmd_str);
3140 
3141  if (!beentry)
3142  return;
3143 
3145  {
3146  if (beentry->st_state != STATE_DISABLED)
3147  {
3148  volatile PGPROC *proc = MyProc;
3149 
3150  /*
3151  * track_activities is disabled, but we last reported a
3152  * non-disabled state. As our final update, change the state and
3153  * clear fields we will not be updating anymore.
3154  */
3155  PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
3156  beentry->st_state = STATE_DISABLED;
3157  beentry->st_state_start_timestamp = 0;
3158  beentry->st_activity_raw[0] = '\0';
3159  beentry->st_activity_start_timestamp = 0;
3160  /* st_xact_start_timestamp and wait_event_info are also disabled */
3161  beentry->st_xact_start_timestamp = 0;
3162  proc->wait_event_info = 0;
3163  PGSTAT_END_WRITE_ACTIVITY(beentry);
3164  }
3165  return;
3166  }
3167 
3168  /*
3169  * To minimize the time spent modifying the entry, and avoid risk of
3170  * errors inside the critical section, fetch all the needed data first.
3171  */
3172  start_timestamp = GetCurrentStatementStartTimestamp();
3173  if (cmd_str != NULL)
3174  {
3175  /*
3176  * Compute length of to-be-stored string unaware of multi-byte
3177  * characters. For speed reasons that'll get corrected on read, rather
3178  * than computed every write.
3179  */
3180  len = Min(strlen(cmd_str), pgstat_track_activity_query_size - 1);
3181  }
3182  current_timestamp = GetCurrentTimestamp();
3183 
3184  /*
3185  * Now update the status entry
3186  */
3187  PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
3188 
3189  beentry->st_state = state;
3190  beentry->st_state_start_timestamp = current_timestamp;
3191 
3192  if (cmd_str != NULL)
3193  {
3194  memcpy((char *) beentry->st_activity_raw, cmd_str, len);
3195  beentry->st_activity_raw[len] = '\0';
3196  beentry->st_activity_start_timestamp = start_timestamp;
3197  }
3198 
3199  PGSTAT_END_WRITE_ACTIVITY(beentry);
3200 }
3201 
3202 /*-----------
3203  * pgstat_progress_start_command() -
3204  *
3205  * Set st_progress_command (and st_progress_command_target) in own backend
3206  * entry. Also, zero-initialize st_progress_param array.
3207  *-----------
3208  */
3209 void
3211 {
3212  volatile PgBackendStatus *beentry = MyBEEntry;
3213 
3214  if (!beentry || !pgstat_track_activities)
3215  return;
3216 
3217  PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
3218  beentry->st_progress_command = cmdtype;
3219  beentry->st_progress_command_target = relid;
3220  MemSet(&beentry->st_progress_param, 0, sizeof(beentry->st_progress_param));
3221  PGSTAT_END_WRITE_ACTIVITY(beentry);
3222 }
3223 
3224 /*-----------
3225  * pgstat_progress_update_param() -
3226  *
3227  * Update index'th member in st_progress_param[] of own backend entry.
3228  *-----------
3229  */
3230 void
3232 {
3233  volatile PgBackendStatus *beentry = MyBEEntry;
3234 
3235  Assert(index >= 0 && index < PGSTAT_NUM_PROGRESS_PARAM);
3236 
3237  if (!beentry || !pgstat_track_activities)
3238  return;
3239 
3240  PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
3241  beentry->st_progress_param[index] = val;
3242  PGSTAT_END_WRITE_ACTIVITY(beentry);
3243 }
3244 
3245 /*-----------
3246  * pgstat_progress_update_multi_param() -
3247  *
3248  * Update multiple members in st_progress_param[] of own backend entry.
3249  * This is atomic; readers won't see intermediate states.
3250  *-----------
3251  */
3252 void
3254  const int64 *val)
3255 {
3256  volatile PgBackendStatus *beentry = MyBEEntry;
3257  int i;
3258 
3259  if (!beentry || !pgstat_track_activities || nparam == 0)
3260  return;
3261 
3262  PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
3263 
3264  for (i = 0; i < nparam; ++i)
3265  {
3266  Assert(index[i] >= 0 && index[i] < PGSTAT_NUM_PROGRESS_PARAM);
3267 
3268  beentry->st_progress_param[index[i]] = val[i];
3269  }
3270 
3271  PGSTAT_END_WRITE_ACTIVITY(beentry);
3272 }
3273 
3274 /*-----------
3275  * pgstat_progress_end_command() -
3276  *
3277  * Reset st_progress_command (and st_progress_command_target) in own backend
3278  * entry. This signals the end of the command.
3279  *-----------
3280  */
3281 void
3283 {
3284  volatile PgBackendStatus *beentry = MyBEEntry;
3285 
3286  if (!beentry || !pgstat_track_activities)
3287  return;
3288 
3290  return;
3291 
3292  PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
3295  PGSTAT_END_WRITE_ACTIVITY(beentry);
3296 }
3297 
3298 /* ----------
3299  * pgstat_report_appname() -
3300  *
3301  * Called to update our application name.
3302  * ----------
3303  */
3304 void
3305 pgstat_report_appname(const char *appname)
3306 {
3307  volatile PgBackendStatus *beentry = MyBEEntry;
3308  int len;
3309 
3310  if (!beentry)
3311  return;
3312 
3313  /* This should be unnecessary if GUC did its job, but be safe */
3314  len = pg_mbcliplen(appname, strlen(appname), NAMEDATALEN - 1);
3315 
3316  /*
3317  * Update my status entry, following the protocol of bumping
3318  * st_changecount before and after. We use a volatile pointer here to
3319  * ensure the compiler doesn't try to get cute.
3320  */
3321  PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
3322 
3323  memcpy((char *) beentry->st_appname, appname, len);
3324  beentry->st_appname[len] = '\0';
3325 
3326  PGSTAT_END_WRITE_ACTIVITY(beentry);
3327 }
3328 
3329 /*
3330  * Report current transaction start timestamp as the specified value.
3331  * Zero means there is no active transaction.
3332  */
3333 void
3335 {
3336  volatile PgBackendStatus *beentry = MyBEEntry;
3337 
3338  if (!pgstat_track_activities || !beentry)
3339  return;
3340 
3341  /*
3342  * Update my status entry, following the protocol of bumping
3343  * st_changecount before and after. We use a volatile pointer here to
3344  * ensure the compiler doesn't try to get cute.
3345  */
3346  PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
3347 
3348  beentry->st_xact_start_timestamp = tstamp;
3349 
3350  PGSTAT_END_WRITE_ACTIVITY(beentry);
3351 }
3352 
3353 /* ----------
3354  * pgstat_read_current_status() -
3355  *
3356  * Copy the current contents of the PgBackendStatus array to local memory,
3357  * if not already done in this transaction.
3358  * ----------
3359  */
3360 static void
3362 {
3363  volatile PgBackendStatus *beentry;
3364  LocalPgBackendStatus *localtable;
3365  LocalPgBackendStatus *localentry;
3366  char *localappname,
3367  *localclienthostname,
3368  *localactivity;
3369 #ifdef USE_SSL
3370  PgBackendSSLStatus *localsslstatus;
3371 #endif
3372 #ifdef ENABLE_GSS
3373  PgBackendGSSStatus *localgssstatus;
3374 #endif
3375  int i;
3376 
3378  if (localBackendStatusTable)
3379  return; /* already done */
3380 
3382 
3383  /*
3384  * Allocate storage for local copy of state data. We can presume that
3385  * none of these requests overflow size_t, because we already calculated
3386  * the same values using mul_size during shmem setup. However, with
3387  * probably-silly values of pgstat_track_activity_query_size and
3388  * max_connections, the localactivity buffer could exceed 1GB, so use
3389  * "huge" allocation for that one.
3390  */
3391  localtable = (LocalPgBackendStatus *)
3392  MemoryContextAlloc(pgStatLocalContext,
3394  localappname = (char *)
3395  MemoryContextAlloc(pgStatLocalContext,
3397  localclienthostname = (char *)
3398  MemoryContextAlloc(pgStatLocalContext,
3399  NAMEDATALEN * NumBackendStatSlots);
3400  localactivity = (char *)
3401  MemoryContextAllocHuge(pgStatLocalContext,
3402  pgstat_track_activity_query_size * NumBackendStatSlots);
3403 #ifdef USE_SSL
3404  localsslstatus = (PgBackendSSLStatus *)
3405  MemoryContextAlloc(pgStatLocalContext,
3407 #endif
3408 #ifdef ENABLE_GSS
3409  localgssstatus = (PgBackendGSSStatus *)
3410  MemoryContextAlloc(pgStatLocalContext,
3412 #endif
3413 
3414  localNumBackends = 0;
3415 
3416  beentry = BackendStatusArray;
3417  localentry = localtable;
3418  for (i = 1; i <= NumBackendStatSlots; i++)
3419  {
3420  /*
3421  * Follow the protocol of retrying if st_changecount changes while we
3422  * copy the entry, or if it's odd. (The check for odd is needed to
3423  * cover the case where we are able to completely copy the entry while
3424  * the source backend is between increment steps.) We use a volatile
3425  * pointer here to ensure the compiler doesn't try to get cute.
3426  */
3427  for (;;)
3428  {
3429  int before_changecount;
3430  int after_changecount;
3431 
3432  pgstat_begin_read_activity(beentry, before_changecount);
3433 
3434  localentry->backendStatus.st_procpid = beentry->st_procpid;
3435  /* Skip all the data-copying work if entry is not in use */
3436  if (localentry->backendStatus.st_procpid > 0)
3437  {
3438  memcpy(&localentry->backendStatus, unvolatize(PgBackendStatus *, beentry), sizeof(PgBackendStatus));
3439 
3440  /*
3441  * For each PgBackendStatus field that is a pointer, copy the
3442  * pointed-to data, then adjust the local copy of the pointer
3443  * field to point at the local copy of the data.
3444  *
3445  * strcpy is safe even if the string is modified concurrently,
3446  * because there's always a \0 at the end of the buffer.
3447  */
3448  strcpy(localappname, (char *) beentry->st_appname);
3449  localentry->backendStatus.st_appname = localappname;
3450  strcpy(localclienthostname, (char *) beentry->st_clienthostname);
3451  localentry->backendStatus.st_clienthostname = localclienthostname;
3452  strcpy(localactivity, (char *) beentry->st_activity_raw);
3453  localentry->backendStatus.st_activity_raw = localactivity;
3454 #ifdef USE_SSL
3455  if (beentry->st_ssl)
3456  {
3457  memcpy(localsslstatus, beentry->st_sslstatus, sizeof(PgBackendSSLStatus));
3458  localentry->backendStatus.st_sslstatus = localsslstatus;
3459  }
3460 #endif
3461 #ifdef ENABLE_GSS
3462  if (beentry->st_gss)
3463  {
3464  memcpy(localgssstatus, beentry->st_gssstatus, sizeof(PgBackendGSSStatus));
3465  localentry->backendStatus.st_gssstatus = localgssstatus;
3466  }
3467 #endif
3468  }
3469 
3470  pgstat_end_read_activity(beentry, after_changecount);
3471 
3472  if (pgstat_read_activity_complete(before_changecount,
3473  after_changecount))
3474  break;
3475 
3476  /* Make sure we can break out of loop if stuck... */
3478  }
3479 
3480  beentry++;
3481  /* Only valid entries get included into the local array */
3482  if (localentry->backendStatus.st_procpid > 0)
3483  {
3485  &localentry->backend_xid,
3486  &localentry->backend_xmin);
3487 
3488  localentry++;
3489  localappname += NAMEDATALEN;
3490  localclienthostname += NAMEDATALEN;
3491  localactivity += pgstat_track_activity_query_size;
3492 #ifdef USE_SSL
3493  localsslstatus++;
3494 #endif
3495 #ifdef ENABLE_GSS
3496  localgssstatus++;
3497 #endif
3498  localNumBackends++;
3499  }
3500  }
3501 
3502  /* Set the pointer only after completion of a valid table */
3503  localBackendStatusTable = localtable;
3504 }
3505 
3506 /* ----------
3507  * pgstat_get_wait_event_type() -
3508  *
3509  * Return a string representing the current wait event type, backend is
3510  * waiting on.
3511  */
3512 const char *
3514 {
3515  uint32 classId;
3516  const char *event_type;
3517 
3518  /* report process as not waiting. */
3519  if (wait_event_info == 0)
3520  return NULL;
3521 
3522  classId = wait_event_info & 0xFF000000;
3523 
3524  switch (classId)
3525  {
3526  case PG_WAIT_LWLOCK:
3527  event_type = "LWLock";
3528  break;
3529  case PG_WAIT_LOCK:
3530  event_type = "Lock";
3531  break;
3532  case PG_WAIT_BUFFER_PIN:
3533  event_type = "BufferPin";
3534  break;
3535  case PG_WAIT_ACTIVITY:
3536  event_type = "Activity";
3537  break;
3538  case PG_WAIT_CLIENT:
3539  event_type = "Client";
3540  break;
3541  case PG_WAIT_EXTENSION:
3542  event_type = "Extension";
3543  break;
3544  case PG_WAIT_IPC:
3545  event_type = "IPC";
3546  break;
3547  case PG_WAIT_TIMEOUT:
3548  event_type = "Timeout";
3549  break;
3550  case PG_WAIT_IO:
3551  event_type = "IO";
3552  break;
3553  default:
3554  event_type = "???";
3555  break;
3556  }
3557 
3558  return event_type;
3559 }
3560 
3561 /* ----------
3562  * pgstat_get_wait_event() -
3563  *
3564  * Return a string representing the current wait event, backend is
3565  * waiting on.
3566  */
3567 const char *
3569 {
3570  uint32 classId;
3571  uint16 eventId;
3572  const char *event_name;
3573 
3574  /* report process as not waiting. */
3575  if (wait_event_info == 0)
3576  return NULL;
3577 
3578  classId = wait_event_info & 0xFF000000;
3579  eventId = wait_event_info & 0x0000FFFF;
3580 
3581  switch (classId)
3582  {
3583  case PG_WAIT_LWLOCK:
3584  event_name = GetLWLockIdentifier(classId, eventId);
3585  break;
3586  case PG_WAIT_LOCK:
3587  event_name = GetLockNameFromTagType(eventId);
3588  break;
3589  case PG_WAIT_BUFFER_PIN:
3590  event_name = "BufferPin";
3591  break;
3592  case PG_WAIT_ACTIVITY:
3593  {
3594  WaitEventActivity w = (WaitEventActivity) wait_event_info;
3595 
3596  event_name = pgstat_get_wait_activity(w);
3597  break;
3598  }
3599  case PG_WAIT_CLIENT:
3600  {
3601  WaitEventClient w = (WaitEventClient) wait_event_info;
3602 
3603  event_name = pgstat_get_wait_client(w);
3604  break;
3605  }
3606  case PG_WAIT_EXTENSION:
3607  event_name = "Extension";
3608  break;
3609  case PG_WAIT_IPC:
3610  {
3611  WaitEventIPC w = (WaitEventIPC) wait_event_info;
3612 
3613  event_name = pgstat_get_wait_ipc(w);
3614  break;
3615  }
3616  case PG_WAIT_TIMEOUT:
3617  {
3618  WaitEventTimeout w = (WaitEventTimeout) wait_event_info;
3619 
3620  event_name = pgstat_get_wait_timeout(w);
3621  break;
3622  }
3623  case PG_WAIT_IO:
3624  {
3625  WaitEventIO w = (WaitEventIO) wait_event_info;
3626 
3627  event_name = pgstat_get_wait_io(w);
3628  break;
3629  }
3630  default:
3631  event_name = "unknown wait event";
3632  break;
3633  }
3634 
3635  return event_name;
3636 }
3637 
3638 /* ----------
3639  * pgstat_get_wait_activity() -
3640  *
3641  * Convert WaitEventActivity to string.
3642  * ----------
3643  */
3644 static const char *
3646 {
3647  const char *event_name = "unknown wait event";
3648 
3649  switch (w)
3650  {
3652  event_name = "ArchiverMain";
3653  break;
3655  event_name = "AutoVacuumMain";
3656  break;
3658  event_name = "BgWriterHibernate";
3659  break;
3661  event_name = "BgWriterMain";
3662  break;
3664  event_name = "CheckpointerMain";
3665  break;
3667  event_name = "LogicalApplyMain";
3668  break;
3670  event_name = "LogicalLauncherMain";
3671  break;
3673  event_name = "PgStatMain";
3674  break;
3676  event_name = "RecoveryWalStream";
3677  break;
3679  event_name = "SysLoggerMain";
3680  break;
3682  event_name = "WalReceiverMain";
3683  break;
3685  event_name = "WalSenderMain";
3686  break;
3688  event_name = "WalWriterMain";
3689  break;
3690  /* no default case, so that compiler will warn */
3691  }
3692 
3693  return event_name;
3694 }
3695 
3696 /* ----------
3697  * pgstat_get_wait_client() -
3698  *
3699  * Convert WaitEventClient to string.
3700  * ----------
3701  */
3702 static const char *
3704 {
3705  const char *event_name = "unknown wait event";
3706 
3707  switch (w)
3708  {
3710  event_name = "ClientRead";
3711  break;
3713  event_name = "ClientWrite";
3714  break;
3716  event_name = "GSSOpenServer";
3717  break;
3719  event_name = "LibPQWalReceiverConnect";
3720  break;
3722  event_name = "LibPQWalReceiverReceive";
3723  break;
3725  event_name = "SSLOpenServer";
3726  break;
3728  event_name = "WalReceiverWaitStart";
3729  break;
3731  event_name = "WalSenderWaitForWAL";
3732  break;
3734  event_name = "WalSenderWriteData";
3735  break;
3736  /* no default case, so that compiler will warn */
3737  }
3738 
3739  return event_name;
3740 }
3741 
3742 /* ----------
3743  * pgstat_get_wait_ipc() -
3744  *
3745  * Convert WaitEventIPC to string.
3746  * ----------
3747  */
3748 static const char *
3750 {
3751  const char *event_name = "unknown wait event";
3752 
3753  switch (w)
3754  {
3756  event_name = "BackupWaitWalArchive";
3757  break;
3759  event_name = "BgWorkerShutdown";
3760  break;
3762  event_name = "BgWorkerStartup";
3763  break;
3764  case WAIT_EVENT_BTREE_PAGE:
3765  event_name = "BtreePage";
3766  break;
3768  event_name = "CheckpointDone";
3769  break;
3771  event_name = "CheckpointStart";
3772  break;
3774  event_name = "ExecuteGather";
3775  break;
3777  event_name = "HashBatchAllocate";
3778  break;
3780  event_name = "HashBatchElect";
3781  break;
3783  event_name = "HashBatchLoad";
3784  break;
3786  event_name = "HashBuildAllocate";
3787  break;
3789  event_name = "HashBuildElect";
3790  break;
3792  event_name = "HashBuildHashInner";
3793  break;
3795  event_name = "HashBuildHashOuter";
3796  break;
3798  event_name = "HashGrowBatchesAllocate";
3799  break;
3801  event_name = "HashGrowBatchesDecide";
3802  break;
3804  event_name = "HashGrowBatchesElect";
3805  break;
3807  event_name = "HashGrowBatchesFinish";
3808  break;
3810  event_name = "HashGrowBatchesRepartition";
3811  break;
3813  event_name = "HashGrowBucketsAllocate";
3814  break;
3816  event_name = "HashGrowBucketsElect";
3817  break;
3819  event_name = "HashGrowBucketsReinsert";
3820  break;
3822  event_name = "LogicalSyncData";
3823  break;
3825  event_name = "LogicalSyncStateChange";
3826  break;
3828  event_name = "MessageQueueInternal";
3829  break;
3831  event_name = "MessageQueuePutMessage";
3832  break;
3833  case WAIT_EVENT_MQ_RECEIVE:
3834  event_name = "MessageQueueReceive";
3835  break;
3836  case WAIT_EVENT_MQ_SEND:
3837  event_name = "MessageQueueSend";
3838  break;
3840  event_name = "ParallelBitmapScan";
3841  break;
3843  event_name = "ParallelCreateIndexScan";
3844  break;
3846  event_name = "ParallelFinish";
3847  break;
3849  event_name = "ProcArrayGroupUpdate";
3850  break;
3852  event_name = "ProcSignalBarrier";
3853  break;
3854  case WAIT_EVENT_PROMOTE:
3855  event_name = "Promote";
3856  break;
3858  event_name = "RecoveryConflictSnapshot";
3859  break;
3861  event_name = "RecoveryConflictTablespace";
3862  break;
3864  event_name = "RecoveryPause";
3865  break;
3867  event_name = "ReplicationOriginDrop";
3868  break;
3870  event_name = "ReplicationSlotDrop";
3871  break;
3873  event_name = "SafeSnapshot";
3874  break;
3875  case WAIT_EVENT_SYNC_REP:
3876  event_name = "SyncRep";
3877  break;
3879  event_name = "XactGroupUpdate";
3880  break;
3881  /* no default case, so that compiler will warn */
3882  }
3883 
3884  return event_name;
3885 }
3886 
3887 /* ----------
3888  * pgstat_get_wait_timeout() -
3889  *
3890  * Convert WaitEventTimeout to string.
3891  * ----------
3892  */
3893 static const char *
3895 {
3896  const char *event_name = "unknown wait event";
3897 
3898  switch (w)
3899  {
3901  event_name = "BaseBackupThrottle";
3902  break;
3903  case WAIT_EVENT_PG_SLEEP:
3904  event_name = "PgSleep";
3905  break;
3907  event_name = "RecoveryApplyDelay";
3908  break;
3910  event_name = "RecoveryRetrieveRetryInterval";
3911  break;
3913  event_name = "VacuumDelay";
3914  break;
3915  /* no default case, so that compiler will warn */
3916  }
3917 
3918  return event_name;
3919 }
3920 
3921 /* ----------
3922  * pgstat_get_wait_io() -
3923  *
3924  * Convert WaitEventIO to string.
3925  * ----------
3926  */
3927 static const char *
3929 {
3930  const char *event_name = "unknown wait event";
3931 
3932  switch (w)
3933  {
3935  event_name = "BaseBackupRead";
3936  break;
3938  event_name = "BufFileRead";
3939  break;
3941  event_name = "BufFileWrite";
3942  break;
3944  event_name = "ControlFileRead";
3945  break;
3947  event_name = "ControlFileSync";
3948  break;
3950  event_name = "ControlFileSyncUpdate";
3951  break;
3953  event_name = "ControlFileWrite";
3954  break;
3956  event_name = "ControlFileWriteUpdate";
3957  break;
3959  event_name = "CopyFileRead";
3960  break;
3962  event_name = "CopyFileWrite";
3963  break;
3965  event_name = "DataFileExtend";
3966  break;
3968  event_name = "DataFileFlush";
3969  break;
3971  event_name = "DataFileImmediateSync";
3972  break;
3974  event_name = "DataFilePrefetch";
3975  break;
3977  event_name = "DataFileRead";
3978  break;
3980  event_name = "DataFileSync";
3981  break;
3983  event_name = "DataFileTruncate";
3984  break;
3986  event_name = "DataFileWrite";
3987  break;
3989  event_name = "DSMFillZeroWrite";
3990  break;
3992  event_name = "LockFileAddToDataDirRead";
3993  break;
3995  event_name = "LockFileAddToDataDirSync";
3996  break;
3998  event_name = "LockFileAddToDataDirWrite";
3999  break;
4001  event_name = "LockFileCreateRead";
4002  break;
4004  event_name = "LockFileCreateSync";
4005  break;
4007  event_name = "LockFileCreateWrite";
4008  break;
4010  event_name = "LockFileReCheckDataDirRead";
4011  break;
4013  event_name = "LogicalRewriteCheckpointSync";
4014  break;
4016  event_name = "LogicalRewriteMappingSync";
4017  break;
4019  event_name = "LogicalRewriteMappingWrite";
4020  break;
4022  event_name = "LogicalRewriteSync";
4023  break;
4025  event_name = "LogicalRewriteTruncate";
4026  break;
4028  event_name = "LogicalRewriteWrite";
4029  break;
4031  event_name = "RelationMapRead";
4032  break;
4034  event_name = "RelationMapSync";
4035  break;
4037  event_name = "RelationMapWrite";
4038  break;
4040  event_name = "ReorderBufferRead";
4041  break;
4043  event_name = "ReorderBufferWrite";
4044  break;
4046  event_name = "ReorderLogicalMappingRead";
4047  break;
4049  event_name = "ReplicationSlotRead";
4050  break;
4052  event_name = "ReplicationSlotRestoreSync";
4053  break;
4055  event_name = "ReplicationSlotSync";
4056  break;
4058  event_name = "ReplicationSlotWrite";
4059  break;
4061  event_name = "SLRUFlushSync";
4062  break;
4063  case WAIT_EVENT_SLRU_READ:
4064  event_name = "SLRURead";
4065  break;
4066  case WAIT_EVENT_SLRU_SYNC:
4067  event_name = "SLRUSync";
4068  break;
4069  case WAIT_EVENT_SLRU_WRITE:
4070  event_name = "SLRUWrite";
4071  break;
4073  event_name = "SnapbuildRead";
4074  break;
4076  event_name = "SnapbuildSync";
4077  break;
4079  event_name = "SnapbuildWrite";
4080  break;
4082  event_name = "TimelineHistoryFileSync";
4083  break;
4085  event_name = "TimelineHistoryFileWrite";
4086  break;
4088  event_name = "TimelineHistoryRead";
4089  break;
4091  event_name = "TimelineHistorySync";
4092  break;
4094  event_name = "TimelineHistoryWrite";
4095  break;
4097  event_name = "TwophaseFileRead";
4098  break;
4100  event_name = "TwophaseFileSync";
4101  break;
4103  event_name = "TwophaseFileWrite";
4104  break;
4106  event_name = "WALSenderTimelineHistoryRead";
4107  break;
4109  event_name = "WALBootstrapSync";
4110  break;
4112  event_name = "WALBootstrapWrite";
4113  break;
4115  event_name = "WALCopyRead";
4116  break;
4118  event_name = "WALCopySync";
4119  break;
4121  event_name = "WALCopyWrite";
4122  break;
4124  event_name = "WALInitSync";
4125  break;
4127  event_name = "WALInitWrite";
4128  break;
4129  case WAIT_EVENT_WAL_READ:
4130  event_name = "WALRead";
4131  break;
4132  case WAIT_EVENT_WAL_SYNC:
4133  event_name = "WALSync";
4134  break;
4136  event_name = "WALSyncMethodAssign";
4137  break;
4138  case WAIT_EVENT_WAL_WRITE:
4139  event_name = "WALWrite";
4140  break;
4141 
4142  /* no default case, so that compiler will warn */
4143  }
4144 
4145  return event_name;
4146 }
4147 
4148 
4149 /* ----------
4150  * pgstat_get_backend_current_activity() -
4151  *
4152  * Return a string representing the current activity of the backend with
4153  * the specified PID. This looks directly at the BackendStatusArray,
4154  * and so will provide current information regardless of the age of our
4155  * transaction's snapshot of the status array.
4156  *
4157  * It is the caller's responsibility to invoke this only for backends whose
4158  * state is expected to remain stable while the result is in use. The
4159  * only current use is in deadlock reporting, where we can expect that
4160  * the target backend is blocked on a lock. (There are corner cases
4161  * where the target's wait could get aborted while we are looking at it,
4162  * but the very worst consequence is to return a pointer to a string
4163  * that's been changed, so we won't worry too much.)
4164  *
4165  * Note: return strings for special cases match pg_stat_get_backend_activity.
4166  * ----------
4167  */
4168 const char *
4169 pgstat_get_backend_current_activity(int pid, bool checkUser)
4170 {
4171  PgBackendStatus *beentry;
4172  int i;
4173 
4174  beentry = BackendStatusArray;
4175  for (i = 1; i <= MaxBackends; i++)
4176  {
4177  /*
4178  * Although we expect the target backend's entry to be stable, that
4179  * doesn't imply that anyone else's is. To avoid identifying the
4180  * wrong backend, while we check for a match to the desired PID we
4181  * must follow the protocol of retrying if st_changecount changes
4182  * while we examine the entry, or if it's odd. (This might be
4183  * unnecessary, since fetching or storing an int is almost certainly
4184  * atomic, but let's play it safe.) We use a volatile pointer here to
4185  * ensure the compiler doesn't try to get cute.
4186  */
4187  volatile PgBackendStatus *vbeentry = beentry;
4188  bool found;
4189 
4190  for (;;)
4191  {
4192  int before_changecount;
4193  int after_changecount;
4194 
4195  pgstat_begin_read_activity(vbeentry, before_changecount);
4196 
4197  found = (vbeentry->st_procpid == pid);
4198 
4199  pgstat_end_read_activity(vbeentry, after_changecount);
4200 
4201  if (pgstat_read_activity_complete(before_changecount,
4202  after_changecount))
4203  break;
4204 
4205  /* Make sure we can break out of loop if stuck... */
4207  }
4208 
4209  if (found)
4210  {
4211  /* Now it is safe to use the non-volatile pointer */
4212  if (checkUser && !superuser() && beentry->st_userid != GetUserId())
4213  return "<insufficient privilege>";
4214  else if (*(beentry->st_activity_raw) == '\0')
4215  return "<command string not enabled>";
4216  else
4217  {
4218  /* this'll leak a bit of memory, but that seems acceptable */
4219  return pgstat_clip_activity(beentry->st_activity_raw);
4220  }
4221  }
4222 
4223  beentry++;
4224  }
4225 
4226  /* If we get here, caller is in error ... */
4227  return "<backend information not available>";
4228 }
4229 
4230 /* ----------
4231  * pgstat_get_crashed_backend_activity() -
4232  *
4233  * Return a string representing the current activity of the backend with
4234  * the specified PID. Like the function above, but reads shared memory with
4235  * the expectation that it may be corrupt. On success, copy the string
4236  * into the "buffer" argument and return that pointer. On failure,
4237  * return NULL.
4238  *
4239  * This function is only intended to be used by the postmaster to report the
4240  * query that crashed a backend. In particular, no attempt is made to
4241  * follow the correct concurrency protocol when accessing the
4242  * BackendStatusArray. But that's OK, in the worst case we'll return a
4243  * corrupted message. We also must take care not to trip on ereport(ERROR).
4244  * ----------
4245  */
4246 const char *
4247 pgstat_get_crashed_backend_activity(int pid, char *buffer, int buflen)
4248 {
4249  volatile PgBackendStatus *beentry;
4250  int i;
4251 
4252  beentry = BackendStatusArray;
4253 
4254  /*
4255  * We probably shouldn't get here before shared memory has been set up,
4256  * but be safe.
4257  */
4258  if (beentry == NULL || BackendActivityBuffer == NULL)
4259  return NULL;
4260 
4261  for (i = 1; i <= MaxBackends; i++)
4262  {
4263  if (beentry->st_procpid == pid)
4264  {
4265  /* Read pointer just once, so it can't change after validation */
4266  const char *activity = beentry->st_activity_raw;
4267  const char *activity_last;
4268 
4269  /*
4270  * We mustn't access activity string before we verify that it
4271  * falls within the BackendActivityBuffer. To make sure that the
4272  * entire string including its ending is contained within the
4273  * buffer, subtract one activity length from the buffer size.
4274  */
4277 
4278  if (activity < BackendActivityBuffer ||
4279  activity > activity_last)
4280  return NULL;
4281 
4282  /* If no string available, no point in a report */
4283  if (activity[0] == '\0')
4284  return NULL;
4285 
4286  /*
4287  * Copy only ASCII-safe characters so we don't run into encoding
4288  * problems when reporting the message; and be sure not to run off
4289  * the end of memory. As only ASCII characters are reported, it
4290  * doesn't seem necessary to perform multibyte aware clipping.
4291  */
4292  ascii_safe_strlcpy(buffer, activity,
4293  Min(buflen, pgstat_track_activity_query_size));
4294 
4295  return buffer;
4296  }
4297 
4298  beentry++;
4299  }
4300 
4301  /* PID not found */
4302  return NULL;
4303 }
4304 
4305 /* ------------------------------------------------------------
4306  * Local support functions follow
4307  * ------------------------------------------------------------
4308  */
4309 
4310 
4311 /* ----------
4312  * pgstat_setheader() -
4313  *
4314  * Set common header fields in a statistics message
4315  * ----------
4316  */
4317 static void
4319 {
4320  hdr->m_type = mtype;
4321 }
4322 
4323 
4324 /* ----------
4325  * pgstat_send() -
4326  *
4327  * Send out one statistics message to the collector
4328  * ----------
4329  */
4330 static void
4331 pgstat_send(void *msg, int len)
4332 {
4333  int rc;
4334 
4336  return;
4337 
4338  ((PgStat_MsgHdr *) msg)->m_size = len;
4339 
4340  /* We'll retry after EINTR, but ignore all other failures */
4341  do
4342  {
4343  rc = send(pgStatSock, msg, len, 0);
4344  } while (rc < 0 && errno == EINTR);
4345 
4346 #ifdef USE_ASSERT_CHECKING
4347  /* In debug builds, log send failures ... */
4348  if (rc < 0)
4349  elog(LOG, "could not send to statistics collector: %m");
4350 #endif
4351 }
4352 
4353 /* ----------
4354  * pgstat_send_archiver() -
4355  *
4356  * Tell the collector about the WAL file that we successfully
4357  * archived or failed to archive.
4358  * ----------
4359  */
4360 void
4361 pgstat_send_archiver(const char *xlog, bool failed)
4362 {
4363  PgStat_MsgArchiver msg;
4364 
4365  /*
4366  * Prepare and send the message
4367  */
4369  msg.m_failed = failed;
4370  StrNCpy(msg.m_xlog, xlog, sizeof(msg.m_xlog));
4372  pgstat_send(&msg, sizeof(msg));
4373 }
4374 
4375 /* ----------
4376  * pgstat_send_bgwriter() -
4377  *
4378  * Send bgwriter statistics to the collector
4379  * ----------
4380  */
4381 void
4383 {
4384  /* We assume this initializes to zeroes */
4385  static const PgStat_MsgBgWriter all_zeroes;
4386 
4387  /*
4388  * This function can be called even if nothing at all has happened. In
4389  * this case, avoid sending a completely empty message to the stats
4390  * collector.
4391  */
4392  if (memcmp(&BgWriterStats, &all_zeroes, sizeof(PgStat_MsgBgWriter)) == 0)
4393  return;
4394 
4395  /*
4396  * Prepare and send the message
4397  */
4398  pgstat_setheader(&BgWriterStats.m_hdr, PGSTAT_MTYPE_BGWRITER);
4399  pgstat_send(&BgWriterStats, sizeof(BgWriterStats));
4400 
4401  /*
4402  * Clear out the statistics buffer, so it can be re-used.
4403  */
4404  MemSet(&BgWriterStats, 0, sizeof(BgWriterStats));
4405 }
4406 
4407 /* ----------
4408  * pgstat_send_slru() -
4409  *
4410  * Send SLRU statistics to the collector
4411  * ----------
4412  */
4413 static void
4415 {
4416  /* We assume this initializes to zeroes */
4417  static const PgStat_MsgSLRU all_zeroes;
4418 
4419  for (int i = 0; i < SLRU_NUM_ELEMENTS; i++)
4420  {
4421  /*
4422  * This function can be called even if nothing at all has happened. In
4423  * this case, avoid sending a completely empty message to the stats
4424  * collector.
4425  */
4426  if (memcmp(&SLRUStats[i], &all_zeroes, sizeof(PgStat_MsgSLRU)) == 0)
4427  continue;
4428 
4429  /* set the SLRU type before each send */
4430  SLRUStats[i].m_index = i;
4431 
4432  /*
4433  * Prepare and send the message
4434  */
4435  pgstat_setheader(&SLRUStats[i].m_hdr, PGSTAT_MTYPE_SLRU);
4436  pgstat_send(&SLRUStats[i], sizeof(PgStat_MsgSLRU));
4437 
4438  /*
4439  * Clear out the statistics buffer, so it can be re-used.
4440  */
4441  MemSet(&SLRUStats[i], 0, sizeof(PgStat_MsgSLRU));
4442  }
4443 }
4444 
4445 
4446 /* ----------
4447  * PgstatCollectorMain() -
4448  *
4449  * Start up the statistics collector process. This is the body of the
4450  * postmaster child process.
4451  *
4452  * The argc/argv parameters are valid only in EXEC_BACKEND case.
4453  * ----------
4454  */
4455 NON_EXEC_STATIC void
4456 PgstatCollectorMain(int argc, char *argv[])
4457 {
4458  int len;
4459  PgStat_Msg msg;
4460  int wr;
4461 
4462  /*
4463  * Ignore all signals usually bound to some action in the postmaster,
4464  * except SIGHUP and SIGQUIT. Note we don't need a SIGUSR1 handler to
4465  * support latch operations, because we only use a local latch.
4466  */
4468  pqsignal(SIGINT, SIG_IGN);
4469  pqsignal(SIGTERM, SIG_IGN);
4475  /* Reset some signals that are accepted by postmaster but not here */
4478 
4480  init_ps_display(NULL);
4481 
4482  /*
4483  * Read in existing stats files or initialize the stats to zero.
4484  */
4485  pgStatRunningInCollector = true;
4486  pgStatDBHash = pgstat_read_statsfiles(InvalidOid, true, true);
4487 
4488  /*
4489  * Loop to process messages until we get SIGQUIT or detect ungraceful
4490  * death of our parent postmaster.
4491  *
4492  * For performance reasons, we don't want to do ResetLatch/WaitLatch after
4493  * every message; instead, do that only after a recv() fails to obtain a
4494  * message. (This effectively means that if backends are sending us stuff
4495  * like mad, we won't notice postmaster death until things slack off a
4496  * bit; which seems fine.) To do that, we have an inner loop that
4497  * iterates as long as recv() succeeds. We do check ConfigReloadPending
4498  * inside the inner loop, which means that such interrupts will get
4499  * serviced but the latch won't get cleared until next time there is a
4500  * break in the action.
4501  */
4502  for (;;)
4503  {
4504  /* Clear any already-pending wakeups */
4506 
4507  /*
4508  * Quit if we get SIGQUIT from the postmaster.
4509  */
4511  break;
4512 
4513  /*
4514  * Inner loop iterates as long as we keep getting messages, or until
4515  * ShutdownRequestPending becomes set.
4516  */
4517  while (!ShutdownRequestPending)
4518  {
4519  /*
4520  * Reload configuration if we got SIGHUP from the postmaster.
4521  */
4522  if (ConfigReloadPending)
4523  {
4524  ConfigReloadPending = false;
4526  }
4527 
4528  /*
4529  * Write the stats file(s) if a new request has arrived that is
4530  * not satisfied by existing file(s).
4531  */
4533  pgstat_write_statsfiles(false, false);
4534 
4535  /*
4536  * Try to receive and process a message. This will not block,
4537  * since the socket is set to non-blocking mode.
4538  *
4539  * XXX On Windows, we have to force pgwin32_recv to cooperate,
4540  * despite the previous use of pg_set_noblock() on the socket.
4541  * This is extremely broken and should be fixed someday.
4542  */
4543 #ifdef WIN32
4544  pgwin32_noblock = 1;
4545 #endif
4546 
4547  len = recv(pgStatSock, (char *) &msg,
4548  sizeof(PgStat_Msg), 0);
4549 
4550 #ifdef WIN32
4551  pgwin32_noblock = 0;
4552 #endif
4553 
4554  if (len < 0)
4555  {
4556  if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)
4557  break; /* out of inner loop */
4558  ereport(ERROR,
4560  errmsg("could not read statistics message: %m")));
4561  }
4562 
4563  /*
4564  * We ignore messages that are smaller than our common header
4565  */
4566  if (len < sizeof(PgStat_MsgHdr))
4567  continue;
4568 
4569  /*
4570  * The received length must match the length in the header
4571  */
4572  if (msg.msg_hdr.m_size != len)
4573  continue;
4574 
4575  /*
4576  * O.K. - we accept this message. Process it.
4577  */
4578  switch (msg.msg_hdr.m_type)
4579  {
4580  case PGSTAT_MTYPE_DUMMY:
4581  break;
4582 
4583  case PGSTAT_MTYPE_INQUIRY:
4584  pgstat_recv_inquiry(&msg.msg_inquiry, len);
4585  break;
4586 
4587  case PGSTAT_MTYPE_TABSTAT:
4588  pgstat_recv_tabstat(&msg.msg_tabstat, len);
4589  break;
4590 
4591  case PGSTAT_MTYPE_TABPURGE:
4592  pgstat_recv_tabpurge(&msg.msg_tabpurge, len);
4593  break;
4594 
4595  case PGSTAT_MTYPE_DROPDB:
4596  pgstat_recv_dropdb(&msg.msg_dropdb, len);
4597  break;
4598 
4600  pgstat_recv_resetcounter(&msg.msg_resetcounter, len);
4601  break;
4602 
4604  pgstat_recv_resetsharedcounter(&msg.msg_resetsharedcounter,
4605  len);
4606  break;
4607 
4609  pgstat_recv_resetsinglecounter(&msg.msg_resetsinglecounter,
4610  len);
4611  break;
4612 
4614  pgstat_recv_resetslrucounter(&msg.msg_resetslrucounter,
4615  len);
4616  break;
4617 
4619  pgstat_recv_autovac(&msg.msg_autovacuum_start, len);
4620  break;
4621 
4622  case PGSTAT_MTYPE_VACUUM:
4623  pgstat_recv_vacuum(&msg.msg_vacuum, len);
4624  break;
4625 
4626  case PGSTAT_MTYPE_ANALYZE:
4627  pgstat_recv_analyze(&msg.msg_analyze, len);
4628  break;
4629 
4630  case PGSTAT_MTYPE_ARCHIVER:
4631  pgstat_recv_archiver(&msg.msg_archiver, len);
4632  break;
4633 
4634  case PGSTAT_MTYPE_BGWRITER:
4635  pgstat_recv_bgwriter(&msg.msg_bgwriter, len);
4636  break;
4637 
4638  case PGSTAT_MTYPE_SLRU:
4639  pgstat_recv_slru(&msg.msg_slru, len);
4640  break;
4641 
4642  case PGSTAT_MTYPE_FUNCSTAT:
4643  pgstat_recv_funcstat(&msg.msg_funcstat, len);
4644  break;
4645 
4647  pgstat_recv_funcpurge(&msg.msg_funcpurge, len);
4648  break;
4649 
4651  pgstat_recv_recoveryconflict(&msg.msg_recoveryconflict,
4652  len);
4653  break;
4654 
4655  case PGSTAT_MTYPE_DEADLOCK:
4656  pgstat_recv_deadlock(&msg.msg_deadlock, len);
4657  break;
4658 
4659  case PGSTAT_MTYPE_TEMPFILE:
4660  pgstat_recv_tempfile(&msg.msg_tempfile, len);
4661  break;
4662 
4664  pgstat_recv_checksum_failure(&msg.msg_checksumfailure,
4665  len);
4666  break;
4667 
4668  default:
4669  break;
4670  }
4671  } /* end of inner message-processing loop */
4672 
4673  /* Sleep until there's something to do */
4674 #ifndef WIN32
4677  pgStatSock, -1L,
4679 #else
4680 
4681  /*
4682  * Windows, at least in its Windows Server 2003 R2 incarnation,
4683  * sometimes loses FD_READ events. Waking up and retrying the recv()
4684  * fixes that, so don't sleep indefinitely. This is a crock of the
4685  * first water, but until somebody wants to debug exactly what's
4686  * happening there, this is the best we can do. The two-second
4687  * timeout matches our pre-9.2 behavior, and needs to be short enough
4688  * to not provoke "using stale statistics" complaints from
4689  * backend_read_statsfile.
4690  */
4693  pgStatSock,
4694  2 * 1000L /* msec */ ,
4696 #endif
4697 
4698  /*
4699  * Emergency bailout if postmaster has died. This is to avoid the
4700  * necessity for manual cleanup of all postmaster children.
4701  */
4702  if (wr & WL_POSTMASTER_DEATH)
4703  break;
4704  } /* end of outer loop */
4705 
4706  /*
4707  * Save the final stats to reuse at next startup.
4708  */
4709  pgstat_write_statsfiles(true, true);
4710 
4711  exit(0);
4712 }
4713 
4714 /*
4715  * Subroutine to clear stats in a database entry
4716  *
4717  * Tables and functions hashes are initialized to empty.
4718  */
4719 static void
4721 {
4722  HASHCTL hash_ctl;
4723 
4724  dbentry->n_xact_commit = 0;
4725  dbentry->n_xact_rollback = 0;
4726  dbentry->n_blocks_fetched = 0;
4727  dbentry->n_blocks_hit = 0;
4728  dbentry->n_tuples_returned = 0;
4729  dbentry->n_tuples_fetched = 0;
4730  dbentry->n_tuples_inserted = 0;
4731  dbentry->n_tuples_updated = 0;
4732  dbentry->n_tuples_deleted = 0;
4733  dbentry->last_autovac_time = 0;
4734  dbentry->n_conflict_tablespace = 0;
4735  dbentry->n_conflict_lock = 0;
4736  dbentry->n_conflict_snapshot = 0;
4737  dbentry->n_conflict_bufferpin = 0;
4738  dbentry->n_conflict_startup_deadlock = 0;
4739  dbentry->n_temp_files = 0;
4740  dbentry->n_temp_bytes = 0;
4741  dbentry->n_deadlocks = 0;
4742  dbentry->n_checksum_failures = 0;
4743  dbentry->last_checksum_failure = 0;
4744  dbentry->n_block_read_time = 0;
4745  dbentry->n_block_write_time = 0;
4746 
4748  dbentry->stats_timestamp = 0;
4749 
4750  memset(&hash_ctl, 0, sizeof(hash_ctl));
4751  hash_ctl.keysize = sizeof(Oid);
4752  hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
4753  dbentry->tables = hash_create("Per-database table",
4755  &hash_ctl,
4756  HASH_ELEM | HASH_BLOBS);
4757 
4758  hash_ctl.keysize = sizeof(Oid);
4759  hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry);
4760  dbentry->functions = hash_create("Per-database function",
4762  &hash_ctl,
4763  HASH_ELEM | HASH_BLOBS);
4764 }
4765 
4766 /*
4767  * Lookup the hash table entry for the specified database. If no hash
4768  * table entry exists, initialize it, if the create parameter is true.
4769  * Else, return NULL.
4770  */
4771 static PgStat_StatDBEntry *
4772 pgstat_get_db_entry(Oid databaseid, bool create)
4773 {
4774  PgStat_StatDBEntry *result;
4775  bool found;
4776  HASHACTION action = (create ? HASH_ENTER : HASH_FIND);
4777 
4778  /* Lookup or create the hash table entry for this database */
4779  result = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
4780  &databaseid,
4781  action, &found);
4782 
4783  if (!create && !found)
4784  return NULL;
4785 
4786  /*
4787  * If not found, initialize the new one. This creates empty hash tables
4788  * for tables and functions, too.
4789  */
4790  if (!found)
4791  reset_dbentry_counters(result);
4792 
4793  return result;
4794 }
4795 
4796 
4797 /*
4798  * Lookup the hash table entry for the specified table. If no hash
4799  * table entry exists, initialize it, if the create parameter is true.
4800  * Else, return NULL.
4801  */
4802 static PgStat_StatTabEntry *
4803 pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create)
4804 {
4805  PgStat_StatTabEntry *result;
4806  bool found;
4807  HASHACTION action = (create ? HASH_ENTER : HASH_FIND);
4808 
4809  /* Lookup or create the hash table entry for this table */
4810  result = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
4811  &tableoid,
4812  action, &found);
4813 
4814  if (!create && !found)
4815  return NULL;
4816 
4817  /* If not found, initialize the new one. */
4818  if (!found)
4819  {
4820  result->numscans = 0;
4821  result->tuples_returned = 0;
4822  result->tuples_fetched = 0;
4823  result->tuples_inserted = 0;
4824  result->tuples_updated = 0;
4825  result->tuples_deleted = 0;
4826  result->tuples_hot_updated = 0;
4827  result->n_live_tuples = 0;
4828  result->n_dead_tuples = 0;
4829  result->changes_since_analyze = 0;
4830  result->inserts_since_vacuum = 0;
4831  result->blocks_fetched = 0;
4832  result->blocks_hit = 0;
4833  result->vacuum_timestamp = 0;
4834  result->vacuum_count = 0;
4835  result->autovac_vacuum_timestamp = 0;
4836  result->autovac_vacuum_count = 0;
4837  result->analyze_timestamp = 0;
4838  result->analyze_count = 0;
4839  result->autovac_analyze_timestamp = 0;
4840  result->autovac_analyze_count = 0;
4841  }
4842 
4843  return result;
4844 }
4845 
4846 
4847 /* ----------
4848  * pgstat_write_statsfiles() -
4849  * Write the global statistics file, as well as requested DB files.
4850  *
4851  * 'permanent' specifies writing to the permanent files not temporary ones.
4852  * When true (happens only when the collector is shutting down), also remove
4853  * the temporary files so that backends starting up under a new postmaster
4854  * can't read old data before the new collector is ready.
4855  *
4856  * When 'allDbs' is false, only the requested databases (listed in
4857  * pending_write_requests) will be written; otherwise, all databases
4858  * will be written.
4859  * ----------
4860  */
4861 static void
4862 pgstat_write_statsfiles(bool permanent, bool allDbs)
4863 {
4864  HASH_SEQ_STATUS hstat;
4865  PgStat_StatDBEntry *dbentry;
4866  FILE *fpout;
4867  int32 format_id;
4868  const char *tmpfile = permanent ? PGSTAT_STAT_PERMANENT_TMPFILE : pgstat_stat_tmpname;
4869  const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
4870  int rc;
4871 
4872  elog(DEBUG2, "writing stats file \"%s\"", statfile);
4873 
4874  /*
4875  * Open the statistics temp file to write out the current values.
4876  */
4877  fpout = AllocateFile(tmpfile, PG_BINARY_W);
4878  if (fpout == NULL)
4879  {
4880  ereport(LOG,
4882  errmsg("could not open temporary statistics file \"%s\": %m",
4883  tmpfile)));
4884  return;
4885  }
4886 
4887  /*
4888  * Set the timestamp of the stats file.
4889  */
4890  globalStats.stats_timestamp = GetCurrentTimestamp();
4891 
4892  /*
4893  * Write the file header --- currently just a format ID.
4894  */
4895  format_id = PGSTAT_FILE_FORMAT_ID;
4896  rc = fwrite(&format_id, sizeof(format_id), 1, fpout);
4897  (void) rc; /* we'll check for error with ferror */
4898 
4899  /*
4900  * Write global stats struct
4901  */
4902  rc = fwrite(&globalStats, sizeof(globalStats), 1, fpout);
4903  (void) rc; /* we'll check for error with ferror */
4904 
4905  /*
4906  * Write archiver stats struct
4907  */
4908  rc = fwrite(&archiverStats, sizeof(archiverStats), 1, fpout);
4909  (void) rc; /* we'll check for error with ferror */
4910 
4911  /*
4912  * Write SLRU stats struct
4913  */
4914  rc = fwrite(slruStats, sizeof(slruStats), 1, fpout);
4915  (void) rc; /* we'll check for error with ferror */
4916 
4917  /*
4918  * Walk through the database table.
4919  */
4920  hash_seq_init(&hstat, pgStatDBHash);
4921  while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
4922  {
4923  /*
4924  * Write out the table and function stats for this DB into the
4925  * appropriate per-DB stat file, if required.
4926  */
4927  if (allDbs || pgstat_db_requested(dbentry->databaseid))
4928  {
4929  /* Make DB's timestamp consistent with the global stats */
4930  dbentry->stats_timestamp = globalStats.stats_timestamp;
4931 
4932  pgstat_write_db_statsfile(dbentry, permanent);
4933  }
4934 
4935  /*
4936  * Write out the DB entry. We don't write the tables or functions
4937  * pointers, since they're of no use to any other process.
4938  */
4939  fputc('D', fpout);
4940  rc = fwrite(dbentry, offsetof(PgStat_StatDBEntry, tables), 1, fpout);
4941  (void) rc; /* we'll check for error with ferror */
4942  }
4943 
4944  /*
4945  * No more output to be done. Close the temp file and replace the old
4946  * pgstat.stat with it. The ferror() check replaces testing for error
4947  * after each individual fputc or fwrite above.
4948  */
4949  fputc('E', fpout);
4950 
4951  if (ferror(fpout))
4952  {
4953  ereport(LOG,
4955  errmsg("could not write temporary statistics file \"%s\": %m",
4956  tmpfile)));
4957  FreeFile(fpout);
4958  unlink(tmpfile);
4959  }
4960  else if (FreeFile(fpout) < 0)
4961  {
4962  ereport(LOG,
4964  errmsg("could not close temporary statistics file \"%s\": %m",
4965  tmpfile)));
4966  unlink(tmpfile);
4967  }
4968  else if (rename(tmpfile, statfile) < 0)
4969  {
4970  ereport(LOG,
4972  errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m",
4973  tmpfile, statfile)));
4974  unlink(tmpfile);
4975  }
4976 
4977  if (permanent)
4978  unlink(pgstat_stat_filename);
4979 
4980  /*
4981  * Now throw away the list of requests. Note that requests sent after we
4982  * started the write are still waiting on the network socket.
4983  */
4984  list_free(pending_write_requests);
4985  pending_write_requests = NIL;
4986 }
4987 
4988 /*
4989  * return the filename for a DB stat file; filename is the output buffer,
4990  * of length len.
4991  */
4992 static void
4993 get_dbstat_filename(bool permanent, bool tempname, Oid databaseid,
4994  char *filename, int len)
4995 {
4996  int printed;
4997 
4998  /* NB -- pgstat_reset_remove_files knows about the pattern this uses */
4999  printed = snprintf(filename, len, "%s/db_%u.%s",
5000  permanent ? PGSTAT_STAT_PERMANENT_DIRECTORY :
5002  databaseid,
5003  tempname ? "tmp" : "stat");
5004  if (printed >= len)
5005  elog(ERROR, "overlength pgstat path");
5006 }
5007 
5008 /* ----------
5009  * pgstat_write_db_statsfile() -
5010  * Write the stat file for a single database.
5011  *
5012  * If writing to the permanent file (happens when the collector is
5013  * shutting down only), remove the temporary file so that backends
5014  * starting up under a new postmaster can't read the old data before
5015  * the new collector is ready.
5016  * ----------
5017  */
5018 static void
5020 {
5021  HASH_SEQ_STATUS tstat;
5022  HASH_SEQ_STATUS fstat;
5023  PgStat_StatTabEntry *tabentry;
5024  PgStat_StatFuncEntry *funcentry;
5025  FILE *fpout;
5026  int32 format_id;
5027  Oid dbid = dbentry->databaseid;
5028  int rc;
5029  char tmpfile[MAXPGPATH];
5030  char statfile[MAXPGPATH];
5031 
5032  get_dbstat_filename(permanent, true, dbid, tmpfile, MAXPGPATH);
5033  get_dbstat_filename(permanent, false, dbid, statfile, MAXPGPATH);
5034 
5035  elog(DEBUG2, "writing stats file \"%s\"", statfile);
5036 
5037  /*
5038  * Open the statistics temp file to write out the current values.
5039  */
5040  fpout = AllocateFile(tmpfile, PG_BINARY_W);
5041  if (fpout == NULL)
5042  {
5043  ereport(LOG,
5045  errmsg("could not open temporary statistics file \"%s\": %m",
5046  tmpfile)));
5047  return;
5048  }
5049 
5050  /*
5051  * Write the file header --- currently just a format ID.
5052  */
5053  format_id = PGSTAT_FILE_FORMAT_ID;
5054  rc = fwrite(&format_id, sizeof(format_id), 1, fpout);
5055  (void) rc; /* we'll check for error with ferror */
5056 
5057  /*
5058  * Walk through the database's access stats per table.
5059  */
5060  hash_seq_init(&tstat, dbentry->tables);
5061  while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&tstat)) != NULL)
5062  {
5063  fputc('T', fpout);
5064  rc = fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout);
5065  (void) rc; /* we'll check for error with ferror */
5066  }
5067 
5068  /*
5069  * Walk through the database's function stats table.
5070  */
5071  hash_seq_init(&fstat, dbentry->functions);
5072  while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&fstat)) != NULL)
5073  {
5074  fputc('F', fpout);
5075  rc = fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout);
5076  (void) rc; /* we'll check for error with ferror */
5077  }
5078 
5079  /*
5080  * No more output to be done. Close the temp file and replace the old
5081  * pgstat.stat with it. The ferror() check replaces testing for error
5082  * after each individual fputc or fwrite above.
5083  */
5084  fputc('E', fpout);
5085 
5086  if (ferror(fpout))
5087  {
5088  ereport(LOG,
5090  errmsg("could not write temporary statistics file \"%s\": %m",
5091  tmpfile)));
5092  FreeFile(fpout);
5093  unlink(tmpfile);
5094  }
5095  else if (FreeFile(fpout) < 0)
5096  {
5097  ereport(LOG,
5099  errmsg("could not close temporary statistics file \"%s\": %m",
5100  tmpfile)));
5101  unlink(tmpfile);
5102  }
5103  else if (rename(tmpfile, statfile) < 0)
5104  {
5105  ereport(LOG,
5107  errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m",
5108  tmpfile, statfile)));
5109  unlink(tmpfile);
5110  }
5111 
5112  if (permanent)
5113  {
5114  get_dbstat_filename(false, false, dbid, statfile, MAXPGPATH);
5115 
5116  elog(DEBUG2, "removing temporary stats file \"%s\"", statfile);
5117  unlink(statfile);
5118  }
5119 }
5120 
5121 /* ----------
5122  * pgstat_read_statsfiles() -
5123  *
5124  * Reads in some existing statistics collector files and returns the
5125  * databases hash table that is the top level of the data.
5126  *
5127  * If 'onlydb' is not InvalidOid, it means we only want data for that DB
5128  * plus the shared catalogs ("DB 0"). We'll still populate the DB hash
5129  * table for all databases, but we don't bother even creating table/function
5130  * hash tables for other databases.
5131  *
5132  * 'permanent' specifies reading from the permanent files not temporary ones.
5133  * When true (happens only when the collector is starting up), remove the
5134  * files after reading; the in-memory status is now authoritative, and the
5135  * files would be out of date in case somebody else reads them.
5136  *
5137  * If a 'deep' read is requested, table/function stats are read, otherwise
5138  * the table/function hash tables remain empty.
5139  * ----------
5140  */
5141 static HTAB *
5142 pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
5143 {
5144  PgStat_StatDBEntry *dbentry;
5145  PgStat_StatDBEntry dbbuf;
5146  HASHCTL hash_ctl;
5147  HTAB *dbhash;
5148  FILE *fpin;
5149  int32 format_id;
5150  bool found;
5151  const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
5152  int i;
5153 
5154  /*
5155  * The tables will live in pgStatLocalContext.
5156  */
5158 
5159  /*
5160  * Create the DB hashtable
5161  */
5162  memset(&hash_ctl, 0, sizeof(hash_ctl));
5163  hash_ctl.keysize = sizeof(Oid);
5164  hash_ctl.entrysize = sizeof(PgStat_StatDBEntry);
5165  hash_ctl.hcxt = pgStatLocalContext;
5166  dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl,
5168 
5169  /*
5170  * Clear out global and archiver statistics so they start from zero in
5171  * case we can't load an existing statsfile.
5172  */
5173  memset(&globalStats, 0, sizeof(globalStats));
5174  memset(&archiverStats, 0, sizeof(archiverStats));
5175  memset(&slruStats, 0, sizeof(slruStats));
5176 
5177  /*
5178  * Set the current timestamp (will be kept only in case we can't load an
5179  * existing statsfile).
5180  */
5181  globalStats.stat_reset_timestamp = GetCurrentTimestamp();
5182  archiverStats.stat_reset_timestamp = globalStats.stat_reset_timestamp;
5183 
5184  /*
5185  * Set the same reset timestamp for all SLRU items too.
5186  */
5187  for (i = 0; i < SLRU_NUM_ELEMENTS; i++)
5188  slruStats[i].stat_reset_timestamp = globalStats.stat_reset_timestamp;
5189 
5190  /*
5191  * Try to open the stats file. If it doesn't exist, the backends simply
5192  * return zero for anything and the collector simply starts from scratch
5193  * with empty counters.
5194  *
5195  * ENOENT is a possibility if the stats collector is not running or has
5196  * not yet written the stats file the first time. Any other failure
5197  * condition is suspicious.
5198  */
5199  if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
5200  {
5201  if (errno != ENOENT)
5204  errmsg("could not open statistics file \"%s\": %m",
5205  statfile)));
5206  return dbhash;
5207  }
5208 
5209  /*
5210  * Verify it's of the expected format.
5211  */
5212  if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
5213  format_id != PGSTAT_FILE_FORMAT_ID)
5214  {
5216  (errmsg("corrupted statistics file \"%s\"", statfile)));
5217  goto done;
5218  }
5219 
5220  /*
5221  * Read global stats struct
5222  */
5223  if (fread(&globalStats, 1, sizeof(globalStats), fpin) != sizeof(globalStats))
5224  {
5226  (errmsg("corrupted statistics file \"%s\"", statfile)));
5227  memset(&globalStats, 0, sizeof(globalStats));
5228  goto done;
5229  }
5230 
5231  /*
5232  * In the collector, disregard the timestamp we read from the permanent
5233  * stats file; we should be willing to write a temp stats file immediately
5234  * upon the first request from any backend. This only matters if the old
5235  * file's timestamp is less than PGSTAT_STAT_INTERVAL ago, but that's not
5236  * an unusual scenario.
5237  */
5239  globalStats.stats_timestamp = 0;
5240 
5241  /*
5242  * Read archiver stats struct
5243  */
5244  if (fread(&archiverStats, 1, sizeof(archiverStats), fpin) != sizeof(archiverStats))
5245  {
5247  (errmsg("corrupted statistics file \"%s\"", statfile)));
5248  memset(&archiverStats, 0, sizeof(archiverStats));
5249  goto done;
5250  }
5251 
5252  /*
5253  * Read SLRU stats struct
5254  */
5255  if (fread(slruStats, 1, sizeof(slruStats), fpin) != sizeof(slruStats))
5256  {
5258  (errmsg("corrupted statistics file \"%s\"", statfile)));
5259  memset(&slruStats, 0, sizeof(slruStats));
5260  goto done;
5261  }
5262 
5263  /*
5264  * We found an existing collector stats file. Read it and put all the
5265  * hashtable entries into place.
5266  */
5267  for (;;)
5268  {
5269  switch (fgetc(fpin))
5270  {
5271  /*
5272  * 'D' A PgStat_StatDBEntry struct describing a database
5273  * follows.
5274  */
5275  case 'D':
5276  if (fread(&dbbuf, 1, offsetof(PgStat_StatDBEntry, tables),
5277  fpin) != offsetof(PgStat_StatDBEntry, tables))
5278  {
5280  (errmsg("corrupted statistics file \"%s\"",
5281  statfile)));
5282  goto done;
5283  }
5284 
5285  /*
5286  * Add to the DB hash
5287  */
5288  dbentry = (PgStat_StatDBEntry *) hash_search(dbhash,
5289  (void *) &dbbuf.databaseid,
5290  HASH_ENTER,
5291  &found);
5292  if (found)
5293  {
5295  (errmsg("corrupted statistics file \"%s\"",
5296  statfile)));
5297  goto done;
5298  }
5299 
5300  memcpy(dbentry, &dbbuf, sizeof(PgStat_StatDBEntry));
5301  dbentry->tables = NULL;
5302  dbentry->functions = NULL;
5303 
5304  /*
5305  * In the collector, disregard the timestamp we read from the
5306  * permanent stats file; we should be willing to write a temp
5307  * stats file immediately upon the first request from any
5308  * backend.
5309  */
5311  dbentry->stats_timestamp = 0;
5312 
5313  /*
5314  * Don't create tables/functions hashtables for uninteresting
5315  * databases.
5316  */
5317  if (onlydb != InvalidOid)
5318  {
5319  if (dbbuf.databaseid != onlydb &&
5320  dbbuf.databaseid != InvalidOid)
5321  break;
5322  }
5323 
5324  memset(&hash_ctl, 0, sizeof(hash_ctl));
5325  hash_ctl.keysize = sizeof(Oid);
5326  hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
5327  hash_ctl.hcxt = pgStatLocalContext;
5328  dbentry->tables = hash_create("Per-database table",
5330  &hash_ctl,
5332 
5333  hash_ctl.keysize = sizeof(Oid);
5334  hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry);
5335  hash_ctl.hcxt = pgStatLocalContext;
5336  dbentry->functions = hash_create("Per-database function",
5338  &hash_ctl,
5340 
5341  /*
5342  * If requested, read the data from the database-specific
5343  * file. Otherwise we just leave the hashtables empty.
5344  */
5345  if (deep)
5347  dbentry->tables,
5348  dbentry->functions,
5349  permanent);
5350 
5351  break;
5352 
5353  case 'E':
5354  goto done;
5355 
5356  default:
5358  (errmsg("corrupted statistics file \"%s\"",
5359  statfile)));
5360  goto done;
5361  }
5362  }
5363 
5364 done:
5365  FreeFile(fpin);
5366 
5367  /* If requested to read the permanent file, also get rid of it. */
5368  if (permanent)
5369  {
5370  elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
5371  unlink(statfile);
5372  }
5373 
5374  return dbhash;
5375 }
5376 
5377 
5378 /* ----------
5379  * pgstat_read_db_statsfile() -
5380  *
5381  * Reads in the existing statistics collector file for the given database,
5382  * filling the passed-in tables and functions hash tables.
5383  *
5384  * As in pgstat_read_statsfiles, if the permanent file is requested, it is
5385  * removed after reading.
5386  *
5387  * Note: this code has the ability to skip storing per-table or per-function
5388  * data, if NULL is passed for the corresponding hashtable. That's not used
5389  * at the moment though.
5390  * ----------
5391  */
5392 static void
5393 pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
5394  bool permanent)
5395 {
5396  PgStat_StatTabEntry *tabentry;
5397  PgStat_StatTabEntry tabbuf;
5398  PgStat_StatFuncEntry funcbuf;
5399  PgStat_StatFuncEntry *funcentry;
5400  FILE *fpin;
5401  int32 format_id;
5402  bool found;
5403  char statfile[MAXPGPATH];
5404 
5405  get_dbstat_filename(permanent, false, databaseid, statfile, MAXPGPATH);
5406 
5407  /*
5408  * Try to open the stats file. If it doesn't exist, the backends simply
5409  * return zero for anything and the collector simply starts from scratch
5410  * with empty counters.
5411  *
5412  * ENOENT is a possibility if the stats collector is not running or has
5413  * not yet written the stats file the first time. Any other failure
5414  * condition is suspicious.
5415  */
5416  if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
5417  {
5418  if (errno != ENOENT)
5421  errmsg("could not open statistics file \"%s\": %m",
5422  statfile)));
5423  return;
5424  }
5425 
5426  /*
5427  * Verify it's of the expected format.
5428  */
5429  if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
5430  format_id != PGSTAT_FILE_FORMAT_ID)
5431  {
5433  (errmsg("corrupted statistics file \"%s\"", statfile)));
5434  goto done;
5435  }
5436 
5437  /*
5438  * We found an existing collector stats file. Read it and put all the
5439  * hashtable entries into place.
5440  */
5441  for (;;)
5442  {
5443  switch (fgetc(fpin))
5444  {
5445  /*
5446  * 'T' A PgStat_StatTabEntry follows.
5447  */
5448  case 'T':
5449  if (fread(&tabbuf, 1, sizeof(PgStat_StatTabEntry),
5450  fpin) != sizeof(PgStat_StatTabEntry))
5451  {
5453  (errmsg("corrupted statistics file \"%s\"",
5454  statfile)));
5455  goto done;
5456  }
5457 
5458  /*
5459  * Skip if table data not wanted.
5460  */
5461  if (tabhash == NULL)
5462  break;
5463 
5464  tabentry = (PgStat_StatTabEntry *) hash_search(tabhash,
5465  (void *) &tabbuf.tableid,
5466  HASH_ENTER, &found);
5467 
5468  if (found)
5469  {
5471  (errmsg("corrupted statistics file \"%s\"",
5472  statfile)));
5473  goto done;
5474  }
5475 
5476  memcpy(tabentry, &tabbuf, sizeof(tabbuf));
5477  break;
5478 
5479  /*
5480  * 'F' A PgStat_StatFuncEntry follows.
5481  */
5482  case 'F':
5483  if (fread(&funcbuf, 1, sizeof(PgStat_StatFuncEntry),
5484  fpin) != sizeof(PgStat_StatFuncEntry))
5485  {
5487  (errmsg("corrupted statistics file \"%s\"",
5488  statfile)));
5489  goto done;
5490  }
5491 
5492  /*
5493  * Skip if function data not wanted.
5494  */
5495  if (funchash == NULL)
5496  break;
5497 
5498  funcentry = (PgStat_StatFuncEntry *) hash_search(funchash,
5499  (void *) &funcbuf.functionid,
5500  HASH_ENTER, &found);
5501 
5502  if (found)
5503  {
5505  (errmsg("corrupted statistics file \"%s\"",
5506  statfile)));
5507  goto done;
5508  }
5509 
5510  memcpy(funcentry, &funcbuf, sizeof(funcbuf));
5511  break;
5512 
5513  /*
5514  * 'E' The EOF marker of a complete stats file.
5515  */
5516  case 'E':
5517  goto done;
5518 
5519  default:
5521  (errmsg("corrupted statistics file \"%s\"",
5522  statfile)));
5523  goto done;
5524  }
5525  }
5526 
5527 done:
5528  FreeFile(fpin);
5529 
5530  if (permanent)
5531  {
5532  elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
5533  unlink(statfile);
5534  }
5535 }
5536 
5537 /* ----------
5538  * pgstat_read_db_statsfile_timestamp() -
5539  *
5540  * Attempt to determine the timestamp of the last db statfile write.
5541  * Returns true if successful; the timestamp is stored in *ts.
5542  *
5543  * This needs to be careful about handling databases for which no stats file
5544  * exists, such as databases without a stat entry or those not yet written:
5545  *
5546  * - if there's a database entry in the global file, return the corresponding
5547  * stats_timestamp value.
5548  *
5549  * - if there's no db stat entry (e.g. for a new or inactive database),
5550  * there's no stats_timestamp value, but also nothing to write so we return
5551  * the timestamp of the global statfile.
5552  * ----------
5553  */
5554 static bool
5555 pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
5556  TimestampTz *ts)
5557 {
5558  PgStat_StatDBEntry dbentry;
5559  PgStat_GlobalStats myGlobalStats;
5560  PgStat_ArchiverStats myArchiverStats;
5561  PgStat_SLRUStats mySLRUStats[SLRU_NUM_ELEMENTS];
5562  FILE *fpin;
5563  int32 format_id;
5564  const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
5565 
5566  /*
5567  * Try to open the stats file. As above, anything but ENOENT is worthy of
5568  * complaining about.
5569  */
5570  if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
5571  {
5572  if (errno != ENOENT)
5575  errmsg("could not open statistics file \"%s\": %m",
5576  statfile)));
5577  return false;
5578  }
5579 
5580  /*
5581  * Verify it's of the expected format.
5582  */
5583  if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
5584  format_id != PGSTAT_FILE_FORMAT_ID)
5585  {
5587  (errmsg("corrupted statistics file \"%s\"", statfile)));
5588  FreeFile(fpin);
5589  return false;
5590  }
5591 
5592  /*
5593  * Read global stats struct
5594  */
5595  if (fread(&myGlobalStats, 1, sizeof(myGlobalStats),
5596  fpin) != sizeof(myGlobalStats))
5597  {
5599  (errmsg("corrupted statistics file \"%s\"", statfile)));
5600  FreeFile(fpin);
5601  return false;
5602  }
5603 
5604  /*
5605  * Read archiver stats struct
5606  */
5607  if (fread(&myArchiverStats, 1, sizeof(myArchiverStats),
5608  fpin) != sizeof(myArchiverStats))
5609  {
5611  (errmsg("corrupted statistics file \"%s\"", statfile)));
5612  FreeFile(fpin);
5613  return false;
5614  }
5615 
5616  /*
5617  * Read SLRU stats struct
5618  */
5619  if (fread(mySLRUStats, 1, sizeof(mySLRUStats), fpin) != sizeof(mySLRUStats))
5620  {
5622  (errmsg("corrupted statistics file \"%s\"", statfile)));
5623  FreeFile(fpin);
5624  return false;
5625  }
5626 
5627  /* By default, we're going to return the timestamp of the global file. */
5628  *ts = myGlobalStats.stats_timestamp;
5629 
5630  /*
5631  * We found an existing collector stats file. Read it and look for a
5632  * record for the requested database. If found, use its timestamp.
5633  */
5634  for (;;)
5635  {
5636  switch (fgetc(fpin))
5637  {
5638  /*
5639  * 'D' A PgStat_StatDBEntry struct describing a database
5640  * follows.
5641  */
5642  case 'D':
5643  if (fread(&dbentry, 1, offsetof(PgStat_StatDBEntry, tables),
5644  fpin) != offsetof(PgStat_StatDBEntry, tables))
5645  {
5647  (errmsg("corrupted statistics file \"%s\"",
5648  statfile)));
5649  goto done;
5650  }
5651 
5652  /*
5653  * If this is the DB we're looking for, save its timestamp and
5654  * we're done.
5655  */
5656  if (dbentry.databaseid == databaseid)
5657  {
5658  *ts = dbentry.stats_timestamp;
5659  goto done;
5660  }
5661 
5662  break;
5663 
5664  case 'E':
5665  goto done;
5666 
5667  default:
5669  (errmsg("corrupted statistics file \"%s\"",
5670  statfile)));
5671  goto done;
5672  }
5673  }
5674 
5675 done:
5676  FreeFile(fpin);
5677  return true;
5678 }
5679 
5680 /*
5681  * If not already done, read the statistics collector stats file into
5682  * some hash tables. The results will be kept until pgstat_clear_snapshot()
5683  * is called (typically, at end of transaction).
5684  */
5685 static void
5687 {
5688  TimestampTz min_ts = 0;
5689  TimestampTz ref_ts = 0;
5690  Oid inquiry_db;
5691  int count;
5692 
5693  /* already read it? */
5694  if (pgStatDBHash)
5695  return;
5697 
5698  /*
5699  * In a normal backend, we check staleness of the data for our own DB, and
5700  * so we send MyDatabaseId in inquiry messages. In the autovac launcher,
5701  * check staleness of the shared-catalog data, and send InvalidOid in
5702  * inquiry messages so as not to force writing unnecessary data.
5703  */
5705  inquiry_db = InvalidOid;
5706  else
5707  inquiry_db = MyDatabaseId;
5708 
5709  /*
5710  * Loop until fresh enough stats file is available or we ran out of time.
5711  * The stats inquiry message is sent repeatedly in case collector drops
5712  * it; but not every single time, as that just swamps the collector.
5713  */
5714  for (count = 0; count < PGSTAT_POLL_LOOP_COUNT; count++)
5715  {
5716  bool ok;
5717  TimestampTz file_ts = 0;
5718  TimestampTz cur_ts;
5719 
5721 
5722  ok = pgstat_read_db_statsfile_timestamp(inquiry_db, false, &file_ts);
5723 
5724  cur_ts = GetCurrentTimestamp();
5725  /* Calculate min acceptable timestamp, if we didn't already */
5726  if (count == 0 || cur_ts < ref_ts)
5727  {
5728  /*
5729  * We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL
5730  * msec before now. This indirectly ensures that the collector
5731  * needn't write the file more often than PGSTAT_STAT_INTERVAL. In
5732  * an autovacuum worker, however, we want a lower delay to avoid
5733  * using stale data, so we use PGSTAT_RETRY_DELAY (since the
5734  * number of workers is low, this shouldn't be a problem).
5735  *
5736  * We don't recompute min_ts after sleeping, except in the
5737  * unlikely case that cur_ts went backwards. So we might end up
5738  * accepting a file a bit older than PGSTAT_STAT_INTERVAL. In
5739  * practice that shouldn't happen, though, as long as the sleep
5740  * time is less than PGSTAT_STAT_INTERVAL; and we don't want to
5741  * tell the collector that our cutoff time is less than what we'd
5742  * actually accept.
5743  */
5744  ref_ts = cur_ts;
5746  min_ts = TimestampTzPlusMilliseconds(ref_ts,
5748  else
5749  min_ts = TimestampTzPlusMilliseconds(ref_ts,
5751  }
5752 
5753  /*
5754  * If the file timestamp is actually newer than cur_ts, we must have
5755  * had a clock glitch (system time went backwards) or there is clock
5756  * skew between our processor and the stats collector's processor.
5757  * Accept the file, but send an inquiry message anyway to make
5758  * pgstat_recv_inquiry do a sanity check on the collector's time.
5759  */
5760  if (ok && file_ts > cur_ts)
5761  {
5762  /*
5763  * A small amount of clock skew between processors isn't terribly
5764  * surprising, but a large difference is worth logging. We
5765  * arbitrarily define "large" as 1000 msec.
5766  */
5767  if (file_ts >= TimestampTzPlusMilliseconds(cur_ts, 1000))
5768  {
5769  char *filetime;
5770  char *mytime;
5771 
5772  /* Copy because timestamptz_to_str returns a static buffer */
5773  filetime = pstrdup(timestamptz_to_str(file_ts));
5774  mytime = pstrdup(timestamptz_to_str(cur_ts));
5775  elog(LOG, "stats collector's time %s is later than backend local time %s",
5776  filetime, mytime);
5777  pfree(filetime);
5778  pfree(mytime);
5779  }
5780 
5781  pgstat_send_inquiry(cur_ts, min_ts, inquiry_db);
5782  break;
5783  }
5784 
5785  /* Normal acceptance case: file is not older than cutoff time */
5786  if (ok && file_ts >= min_ts)
5787  break;
5788 
5789  /* Not there or too old, so kick the collector and wait a bit */
5790  if ((count % PGSTAT_INQ_LOOP_COUNT) == 0)
5791  pgstat_send_inquiry(cur_ts, min_ts, inquiry_db);
5792 
5793  pg_usleep(PGSTAT_RETRY_DELAY * 1000L);
5794  }
5795 
5796  if (count >= PGSTAT_POLL_LOOP_COUNT)
5797  ereport(LOG,
5798  (errmsg("using stale statistics instead of current ones "
5799  "because stats collector is not responding")));
5800 
5801  /*
5802  * Autovacuum launcher wants stats about all databases, but a shallow read
5803  * is sufficient. Regular backends want a deep read for just the tables
5804  * they can see (MyDatabaseId + shared catalogs).
5805  */
5807  pgStatDBHash = pgstat_read_statsfiles(InvalidOid, false, false);
5808  else
5809  pgStatDBHash = pgstat_read_statsfiles(MyDatabaseId, false, true);
5810 }
5811 
5812 
5813 /* ----------
5814  * pgstat_setup_memcxt() -
5815  *
5816  * Create pgStatLocalContext, if not already done.
5817  * ----------
5818  */
5819 static void
5821 {
5822  if (!pgStatLocalContext)
5823  pgStatLocalContext = AllocSetContextCreate(TopMemoryContext,
5824  "Statistics snapshot",
5826 }
5827 
5828 
5829 /* ----------
5830  * pgstat_clear_snapshot() -
5831  *
5832  * Discard any data collected in the current transaction. Any subsequent
5833  * request will cause new snapshots to be read.
5834  *
5835  * This is also invoked during transaction commit or abort to discard
5836  * the no-longer-wanted snapshot.
5837  * ----------
5838  */
5839 void
5841 {
5842  /* Release memory, if any was allocated */
5843  if (pgStatLocalContext)
5844  MemoryContextDelete(pgStatLocalContext);
5845 
5846  /* Reset variables */
5847  pgStatLocalContext = NULL;
5848  pgStatDBHash = NULL;
5849  localBackendStatusTable = NULL;
5850  localNumBackends = 0;
5851 }
5852 
5853 
5854 /* ----------
5855  * pgstat_recv_inquiry() -
5856  *
5857  * Process stat inquiry requests.
5858  * ----------
5859  */
5860 static void
5862 {
5863  PgStat_StatDBEntry *dbentry;
5864 
5865  elog(DEBUG2, "received inquiry for database %u", msg->databaseid);
5866 
5867  /*
5868  * If there's already a write request for this DB, there's nothing to do.
5869  *
5870  * Note that if a request is found, we return early and skip the below
5871  * check for clock skew. This is okay, since the only way for a DB
5872  * request to be present in the list is that we have been here since the
5873  * last write round. It seems sufficient to check for clock skew once per
5874  * write round.
5875  */
5876  if (list_member_oid(pending_write_requests, msg->databaseid))
5877  return;
5878 
5879  /*
5880  * Check to see if we last wrote this database at a time >= the requested
5881  * cutoff time. If so, this is a stale request that was generated before
5882  * we updated the DB file, and we don't need to do so again.
5883  *
5884  * If the requestor's local clock time is older than stats_timestamp, we
5885  * should suspect a clock glitch, ie system time going backwards; though
5886  * the more likely explanation is just delayed message receipt. It is
5887  * worth expending a GetCurrentTimestamp call to be sure, since a large
5888  * retreat in the system clock reading could otherwise cause us to neglect
5889  * to update the stats file for a long time.
5890  */
5891  dbentry = pgstat_get_db_entry(msg->databaseid, false);
5892  if (dbentry == NULL)
5893  {
5894  /*
5895  * We have no data for this DB. Enter a write request anyway so that
5896  * the global stats will get updated. This is needed to prevent
5897  * backend_read_statsfile from waiting for data that we cannot supply,
5898  * in the case of a new DB that nobody has yet reported any stats for.
5899  * See the behavior of pgstat_read_db_statsfile_timestamp.
5900  */
5901  }
5902  else if (msg->clock_time < dbentry->stats_timestamp)
5903  {
5904  TimestampTz cur_ts = GetCurrentTimestamp();
5905 
5906  if (cur_ts < dbentry->stats_timestamp)
5907  {
5908  /*
5909  * Sure enough, time went backwards. Force a new stats file write
5910  * to get back in sync; but first, log a complaint.
5911  */
5912  char *writetime;
5913  char *mytime;
5914 
5915  /* Copy because timestamptz_to_str returns a static buffer */
5916  writetime = pstrdup(timestamptz_to_str(dbentry->stats_timestamp));
5917  mytime = pstrdup(timestamptz_to_str(cur_ts));
5918  elog(LOG,
5919  "stats_timestamp %s is later than collector's time %s for database %u",
5920  writetime, mytime, dbentry->databaseid);
5921  pfree(writetime);
5922  pfree(mytime);
5923  }
5924  else
5925  {
5926  /*
5927  * Nope, it's just an old request. Assuming msg's clock_time is
5928  * >= its cutoff_time, it must be stale, so we can ignore it.
5929  */
5930  return;
5931  }
5932  }
5933  else if (msg->cutoff_time <= dbentry->stats_timestamp)
5934  {
5935  /* Stale request, ignore it */
5936  return;
5937  }
5938 
5939  /*
5940  * We need to write this DB, so create a request.
5941  */
5942  pending_write_requests = lappend_oid(pending_write_requests,</