PostgreSQL Source Code  git master
pgstat.c
Go to the documentation of this file.
1 /* ----------
2  * pgstat.c
3  *
4  * All the statistics collector stuff hacked up in one big, ugly file.
5  *
6  * TODO: - Separate collector, postmaster and backend stuff
7  * into different files.
8  *
9  * - Add some automatic call for pgstat vacuuming.
10  *
11  * - Add a pgstat config column to pg_database, so this
12  * entire thing can be enabled/disabled on a per db basis.
13  *
14  * Copyright (c) 2001-2020, PostgreSQL Global Development Group
15  *
16  * src/backend/postmaster/pgstat.c
17  * ----------
18  */
19 #include "postgres.h"
20 
21 #include <unistd.h>
22 #include <fcntl.h>
23 #include <sys/param.h>
24 #include <sys/time.h>
25 #include <sys/socket.h>
26 #include <netdb.h>
27 #include <netinet/in.h>
28 #include <arpa/inet.h>
29 #include <signal.h>
30 #include <time.h>
31 #ifdef HAVE_SYS_SELECT_H
32 #include <sys/select.h>
33 #endif
34 
35 #include "access/heapam.h"
36 #include "access/htup_details.h"
37 #include "access/tableam.h"
38 #include "access/transam.h"
39 #include "access/twophase_rmgr.h"
40 #include "access/xact.h"
41 #include "catalog/pg_database.h"
42 #include "catalog/pg_proc.h"
43 #include "common/ip.h"
44 #include "libpq/libpq.h"
45 #include "libpq/pqsignal.h"
46 #include "mb/pg_wchar.h"
47 #include "miscadmin.h"
48 #include "pg_trace.h"
49 #include "pgstat.h"
50 #include "postmaster/autovacuum.h"
52 #include "postmaster/interrupt.h"
53 #include "postmaster/postmaster.h"
54 #include "replication/walsender.h"
55 #include "storage/backendid.h"
56 #include "storage/dsm.h"
57 #include "storage/fd.h"
58 #include "storage/ipc.h"
59 #include "storage/latch.h"
60 #include "storage/lmgr.h"
61 #include "storage/pg_shmem.h"
62 #include "storage/procsignal.h"
63 #include "storage/sinvaladt.h"
64 #include "utils/ascii.h"
65 #include "utils/guc.h"
66 #include "utils/memutils.h"
67 #include "utils/ps_status.h"
68 #include "utils/rel.h"
69 #include "utils/snapmgr.h"
70 #include "utils/timestamp.h"
71 
72 /* ----------
73  * Timer definitions.
74  * ----------
75  */
76 #define PGSTAT_STAT_INTERVAL 500 /* Minimum time between stats file
77  * updates; in milliseconds. */
78 
79 #define PGSTAT_RETRY_DELAY 10 /* How long to wait between checks for a
80  * new file; in milliseconds. */
81 
82 #define PGSTAT_MAX_WAIT_TIME 10000 /* Maximum time to wait for a stats
83  * file update; in milliseconds. */
84 
85 #define PGSTAT_INQ_INTERVAL 640 /* How often to ping the collector for a
86  * new file; in milliseconds. */
87 
88 #define PGSTAT_RESTART_INTERVAL 60 /* How often to attempt to restart a
89  * failed statistics collector; in
90  * seconds. */
91 
92 #define PGSTAT_POLL_LOOP_COUNT (PGSTAT_MAX_WAIT_TIME / PGSTAT_RETRY_DELAY)
93 #define PGSTAT_INQ_LOOP_COUNT (PGSTAT_INQ_INTERVAL / PGSTAT_RETRY_DELAY)
94 
95 /* Minimum receive buffer size for the collector's socket. */
96 #define PGSTAT_MIN_RCVBUF (100 * 1024)
97 
98 
99 /* ----------
100  * The initial size hints for the hash tables used in the collector.
101  * ----------
102  */
103 #define PGSTAT_DB_HASH_SIZE 16
104 #define PGSTAT_TAB_HASH_SIZE 512
105 #define PGSTAT_FUNCTION_HASH_SIZE 512
106 
107 
108 /* ----------
109  * Total number of backends including auxiliary
110  *
111  * We reserve a slot for each possible BackendId, plus one for each
112  * possible auxiliary process type. (This scheme assumes there is not
113  * more than one of any auxiliary process type at a time.) MaxBackends
114  * includes autovacuum workers and background workers as well.
115  * ----------
116  */
117 #define NumBackendStatSlots (MaxBackends + NUM_AUXPROCTYPES)
118 
119 
120 /* ----------
121  * GUC parameters
122  * ----------
123  */
125 bool pgstat_track_counts = false;
128 
129 /* ----------
130  * Built from GUC parameter
131  * ----------
132  */
134 char *pgstat_stat_filename = NULL;
135 char *pgstat_stat_tmpname = NULL;
136 
137 /*
138  * BgWriter global statistics counters (unused in other processes).
139  * Stored directly in a stats message structure so it can be sent
140  * without needing to copy things around. We assume this inits to zeroes.
141  */
143 
144 /* ----------
145  * Local data
146  * ----------
147  */
149 
151 
153 
154 static bool pgStatRunningInCollector = false;
155 
156 /*
157  * Structures in which backends store per-table info that's waiting to be
158  * sent to the collector.
159  *
160  * NOTE: once allocated, TabStatusArray structures are never moved or deleted
161  * for the life of the backend. Also, we zero out the t_id fields of the
162  * contained PgStat_TableStatus structs whenever they are not actively in use.
163  * This allows relcache pgstat_info pointers to be treated as long-lived data,
164  * avoiding repeated searches in pgstat_initstats() when a relation is
165  * repeatedly opened during a transaction.
166  */
167 #define TABSTAT_QUANTUM 100 /* we alloc this many at a time */
168 
169 typedef struct TabStatusArray
170 {
171  struct TabStatusArray *tsa_next; /* link to next array, if any */
172  int tsa_used; /* # entries currently used */
175 
177 
178 /*
179  * pgStatTabHash entry: map from relation OID to PgStat_TableStatus pointer
180  */
181 typedef struct TabStatHashEntry
182 {
186 
187 /*
188  * Hash table for O(1) t_id -> tsa_entry lookup
189  */
190 static HTAB *pgStatTabHash = NULL;
191 
192 /*
193  * Backends store per-function info that's waiting to be sent to the collector
194  * in this hash table (indexed by function OID).
195  */
196 static HTAB *pgStatFunctions = NULL;
197 
198 /*
199  * Indicates if backend has some function stats that it hasn't yet
200  * sent to the collector.
201  */
202 static bool have_function_stats = false;
203 
204 /*
205  * Tuple insertion/deletion counts for an open transaction can't be propagated
206  * into PgStat_TableStatus counters until we know if it is going to commit
207  * or abort. Hence, we keep these counts in per-subxact structs that live
208  * in TopTransactionContext. This data structure is designed on the assumption
209  * that subxacts won't usually modify very many tables.
210  */
211 typedef struct PgStat_SubXactStatus
212 {
213  int nest_level; /* subtransaction nest level */
214  struct PgStat_SubXactStatus *prev; /* higher-level subxact if any */
215  PgStat_TableXactStatus *first; /* head of list for this subxact */
217 
219 
220 static int pgStatXactCommit = 0;
221 static int pgStatXactRollback = 0;
224 
225 /* Record that's written to 2PC state file when pgstat state is persisted */
226 typedef struct TwoPhasePgStatRecord
227 {
228  PgStat_Counter tuples_inserted; /* tuples inserted in xact */
229  PgStat_Counter tuples_updated; /* tuples updated in xact */
230  PgStat_Counter tuples_deleted; /* tuples deleted in xact */
231  PgStat_Counter inserted_pre_trunc; /* tuples inserted prior to truncate */
232  PgStat_Counter updated_pre_trunc; /* tuples updated prior to truncate */
233  PgStat_Counter deleted_pre_trunc; /* tuples deleted prior to truncate */
234  Oid t_id; /* table's OID */
235  bool t_shared; /* is it a shared catalog? */
236  bool t_truncated; /* was the relation truncated? */
238 
239 /*
240  * Info about current "snapshot" of stats file
241  */
243 static HTAB *pgStatDBHash = NULL;
244 
245 /* Status for backends including auxiliary */
247 
248 /* Total number of backends including auxiliary */
249 static int localNumBackends = 0;
250 
251 /*
252  * Cluster wide statistics, kept in the stats collector.
253  * Contains statistics that are not collected per database
254  * or per table.
255  */
258 
259 /*
260  * List of OIDs of databases we need to write out. If an entry is InvalidOid,
261  * it means to write only the shared-catalog stats ("DB 0"); otherwise, we
262  * will write both that DB's data and the shared stats.
263  */
265 
266 /*
267  * Total time charged to functions so far in the current backend.
268  * We use this to help separate "self" and "other" time charges.
269  * (We assume this initializes to zero.)
270  */
272 
273 
274 /* ----------
275  * Local function forward declarations
276  * ----------
277  */
278 #ifdef EXEC_BACKEND
279 static pid_t pgstat_forkexec(void);
280 #endif
281 
282 NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn();
283 static void pgstat_beshutdown_hook(int code, Datum arg);
284 
285 static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create);
287  Oid tableoid, bool create);
288 static void pgstat_write_statsfiles(bool permanent, bool allDbs);
289 static void pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent);
290 static HTAB *pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep);
291 static void pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, bool permanent);
292 static void backend_read_statsfile(void);
293 static void pgstat_read_current_status(void);
294 
295 static bool pgstat_write_statsfile_needed(void);
296 static bool pgstat_db_requested(Oid databaseid);
297 
298 static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg);
299 static void pgstat_send_funcstats(void);
300 static HTAB *pgstat_collect_oids(Oid catalogid, AttrNumber anum_oid);
301 
302 static PgStat_TableStatus *get_tabstat_entry(Oid rel_id, bool isshared);
303 
304 static void pgstat_setup_memcxt(void);
305 
306 static const char *pgstat_get_wait_activity(WaitEventActivity w);
307 static const char *pgstat_get_wait_client(WaitEventClient w);
308 static const char *pgstat_get_wait_ipc(WaitEventIPC w);
309 static const char *pgstat_get_wait_timeout(WaitEventTimeout w);
310 static const char *pgstat_get_wait_io(WaitEventIO w);
311 
312 static void pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype);
313 static void pgstat_send(void *msg, int len);
314 
315 static void pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len);
316 static void pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len);
317 static void pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len);
318 static void pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len);
319 static void pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len);
322 static void pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len);
323 static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len);
324 static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
325 static void pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len);
326 static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
327 static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
328 static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
330 static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len);
332 static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
333 
334 /* ------------------------------------------------------------
335  * Public functions called from postmaster follow
336  * ------------------------------------------------------------
337  */
338 
339 /* ----------
340  * pgstat_init() -
341  *
342  * Called from postmaster at startup. Create the resources required
343  * by the statistics collector process. If unable to do so, do not
344  * fail --- better to let the postmaster start with stats collection
345  * disabled.
346  * ----------
347  */
348 void
350 {
351  ACCEPT_TYPE_ARG3 alen;
352  struct addrinfo *addrs = NULL,
353  *addr,
354  hints;
355  int ret;
356  fd_set rset;
357  struct timeval tv;
358  char test_byte;
359  int sel_res;
360  int tries = 0;
361 
362 #define TESTBYTEVAL ((char) 199)
363 
364  /*
365  * This static assertion verifies that we didn't mess up the calculations
366  * involved in selecting maximum payload sizes for our UDP messages.
367  * Because the only consequence of overrunning PGSTAT_MAX_MSG_SIZE would
368  * be silent performance loss from fragmentation, it seems worth having a
369  * compile-time cross-check that we didn't.
370  */
372  "maximum stats message size exceeds PGSTAT_MAX_MSG_SIZE");
373 
374  /*
375  * Create the UDP socket for sending and receiving statistic messages
376  */
377  hints.ai_flags = AI_PASSIVE;
378  hints.ai_family = AF_UNSPEC;
379  hints.ai_socktype = SOCK_DGRAM;
380  hints.ai_protocol = 0;
381  hints.ai_addrlen = 0;
382  hints.ai_addr = NULL;
383  hints.ai_canonname = NULL;
384  hints.ai_next = NULL;
385  ret = pg_getaddrinfo_all("localhost", NULL, &hints, &addrs);
386  if (ret || !addrs)
387  {
388  ereport(LOG,
389  (errmsg("could not resolve \"localhost\": %s",
390  gai_strerror(ret))));
391  goto startup_failed;
392  }
393 
394  /*
395  * On some platforms, pg_getaddrinfo_all() may return multiple addresses
396  * only one of which will actually work (eg, both IPv6 and IPv4 addresses
397  * when kernel will reject IPv6). Worse, the failure may occur at the
398  * bind() or perhaps even connect() stage. So we must loop through the
399  * results till we find a working combination. We will generate LOG
400  * messages, but no error, for bogus combinations.
401  */
402  for (addr = addrs; addr; addr = addr->ai_next)
403  {
404 #ifdef HAVE_UNIX_SOCKETS
405  /* Ignore AF_UNIX sockets, if any are returned. */
406  if (addr->ai_family == AF_UNIX)
407  continue;
408 #endif
409 
410  if (++tries > 1)
411  ereport(LOG,
412  (errmsg("trying another address for the statistics collector")));
413 
414  /*
415  * Create the socket.
416  */
417  if ((pgStatSock = socket(addr->ai_family, SOCK_DGRAM, 0)) == PGINVALID_SOCKET)
418  {
419  ereport(LOG,
421  errmsg("could not create socket for statistics collector: %m")));
422  continue;
423  }
424 
425  /*
426  * Bind it to a kernel assigned port on localhost and get the assigned
427  * port via getsockname().
428  */
429  if (bind(pgStatSock, addr->ai_addr, addr->ai_addrlen) < 0)
430  {
431  ereport(LOG,
433  errmsg("could not bind socket for statistics collector: %m")));
436  continue;
437  }
438 
439  alen = sizeof(pgStatAddr);
440  if (getsockname(pgStatSock, (struct sockaddr *) &pgStatAddr, &alen) < 0)
441  {
442  ereport(LOG,
444  errmsg("could not get address of socket for statistics collector: %m")));
447  continue;
448  }
449 
450  /*
451  * Connect the socket to its own address. This saves a few cycles by
452  * not having to respecify the target address on every send. This also
453  * provides a kernel-level check that only packets from this same
454  * address will be received.
455  */
456  if (connect(pgStatSock, (struct sockaddr *) &pgStatAddr, alen) < 0)
457  {
458  ereport(LOG,
460  errmsg("could not connect socket for statistics collector: %m")));
463  continue;
464  }
465 
466  /*
467  * Try to send and receive a one-byte test message on the socket. This
468  * is to catch situations where the socket can be created but will not
469  * actually pass data (for instance, because kernel packet filtering
470  * rules prevent it).
471  */
472  test_byte = TESTBYTEVAL;
473 
474 retry1:
475  if (send(pgStatSock, &test_byte, 1, 0) != 1)
476  {
477  if (errno == EINTR)
478  goto retry1; /* if interrupted, just retry */
479  ereport(LOG,
481  errmsg("could not send test message on socket for statistics collector: %m")));
484  continue;
485  }
486 
487  /*
488  * There could possibly be a little delay before the message can be
489  * received. We arbitrarily allow up to half a second before deciding
490  * it's broken.
491  */
492  for (;;) /* need a loop to handle EINTR */
493  {
494  FD_ZERO(&rset);
495  FD_SET(pgStatSock, &rset);
496 
497  tv.tv_sec = 0;
498  tv.tv_usec = 500000;
499  sel_res = select(pgStatSock + 1, &rset, NULL, NULL, &tv);
500  if (sel_res >= 0 || errno != EINTR)
501  break;
502  }
503  if (sel_res < 0)
504  {
505  ereport(LOG,
507  errmsg("select() failed in statistics collector: %m")));
510  continue;
511  }
512  if (sel_res == 0 || !FD_ISSET(pgStatSock, &rset))
513  {
514  /*
515  * This is the case we actually think is likely, so take pains to
516  * give a specific message for it.
517  *
518  * errno will not be set meaningfully here, so don't use it.
519  */
520  ereport(LOG,
521  (errcode(ERRCODE_CONNECTION_FAILURE),
522  errmsg("test message did not get through on socket for statistics collector")));
525  continue;
526  }
527 
528  test_byte++; /* just make sure variable is changed */
529 
530 retry2:
531  if (recv(pgStatSock, &test_byte, 1, 0) != 1)
532  {
533  if (errno == EINTR)
534  goto retry2; /* if interrupted, just retry */
535  ereport(LOG,
537  errmsg("could not receive test message on socket for statistics collector: %m")));
540  continue;
541  }
542 
543  if (test_byte != TESTBYTEVAL) /* strictly paranoia ... */
544  {
545  ereport(LOG,
546  (errcode(ERRCODE_INTERNAL_ERROR),
547  errmsg("incorrect test message transmission on socket for statistics collector")));
550  continue;
551  }
552 
553  /* If we get here, we have a working socket */
554  break;
555  }
556 
557  /* Did we find a working address? */
558  if (!addr || pgStatSock == PGINVALID_SOCKET)
559  goto startup_failed;
560 
561  /*
562  * Set the socket to non-blocking IO. This ensures that if the collector
563  * falls behind, statistics messages will be discarded; backends won't
564  * block waiting to send messages to the collector.
565  */
567  {
568  ereport(LOG,
570  errmsg("could not set statistics collector socket to nonblocking mode: %m")));
571  goto startup_failed;
572  }
573 
574  /*
575  * Try to ensure that the socket's receive buffer is at least
576  * PGSTAT_MIN_RCVBUF bytes, so that it won't easily overflow and lose
577  * data. Use of UDP protocol means that we are willing to lose data under
578  * heavy load, but we don't want it to happen just because of ridiculously
579  * small default buffer sizes (such as 8KB on older Windows versions).
580  */
581  {
582  int old_rcvbuf;
583  int new_rcvbuf;
584  ACCEPT_TYPE_ARG3 rcvbufsize = sizeof(old_rcvbuf);
585 
586  if (getsockopt(pgStatSock, SOL_SOCKET, SO_RCVBUF,
587  (char *) &old_rcvbuf, &rcvbufsize) < 0)
588  {
589  elog(LOG, "getsockopt(SO_RCVBUF) failed: %m");
590  /* if we can't get existing size, always try to set it */
591  old_rcvbuf = 0;
592  }
593 
594  new_rcvbuf = PGSTAT_MIN_RCVBUF;
595  if (old_rcvbuf < new_rcvbuf)
596  {
597  if (setsockopt(pgStatSock, SOL_SOCKET, SO_RCVBUF,
598  (char *) &new_rcvbuf, sizeof(new_rcvbuf)) < 0)
599  elog(LOG, "setsockopt(SO_RCVBUF) failed: %m");
600  }
601  }
602 
603  pg_freeaddrinfo_all(hints.ai_family, addrs);
604 
605  return;
606 
607 startup_failed:
608  ereport(LOG,
609  (errmsg("disabling statistics collector for lack of working socket")));
610 
611  if (addrs)
612  pg_freeaddrinfo_all(hints.ai_family, addrs);
613 
617 
618  /*
619  * Adjust GUC variables to suppress useless activity, and for debugging
620  * purposes (seeing track_counts off is a clue that we failed here). We
621  * use PGC_S_OVERRIDE because there is no point in trying to turn it back
622  * on from postgresql.conf without a restart.
623  */
624  SetConfigOption("track_counts", "off", PGC_INTERNAL, PGC_S_OVERRIDE);
625 }
626 
627 /*
628  * subroutine for pgstat_reset_all
629  */
/*
 * Remove statistics files found in "directory".
 *
 * Each directory entry named "global.tmp", "global.stat", "db_<digits>.tmp"
 * or "db_<digits>.stat" is unlinked; every other entry is left alone.
 * Nothing is returned, and the result of unlink() is not examined.
 */
630 static void
632 {
633  DIR *dir;
634  struct dirent *entry;
 /* buffer for the "directory/entry" path built below */
635  char fname[MAXPGPATH * 2];
636 
 /*
  * NOTE(review): dir is handed to ReadDir without a NULL check here;
  * presumably ReadDir copes with a failed AllocateDir -- confirm.
  */
637  dir = AllocateDir(directory);
638  while ((entry = ReadDir(dir, directory)) != NULL)
639  {
 /* nchars: length of the recognized file name prefix */
640  int nchars;
 /* tmp_oid only receives the %u conversion; its value is not used */
641  Oid tmp_oid;
642 
643  /*
644  * Skip directory entries that don't match the file names we write.
645  * See get_dbstat_filename for the database-specific pattern.
646  */
647  if (strncmp(entry->d_name, "global.", 7) == 0)
648  nchars = 7;
649  else
650  {
 /*
  * %n is stored only if the input matched all the way through the
  * '.', so presetting nchars to 0 lets us detect a failed match.
  */
651  nchars = 0;
652  (void) sscanf(entry->d_name, "db_%u.%n",
653  &tmp_oid, &nchars);
654  if (nchars <= 0)
655  continue;
656  /* %u allows leading whitespace, so reject that */
657  if (strchr("0123456789", entry->d_name[3]) == NULL)
658  continue;
659  }
660 
 /* What follows the prefix must be exactly "tmp" or "stat". */
661  if (strcmp(entry->d_name + nchars, "tmp") != 0 &&
662  strcmp(entry->d_name + nchars, "stat") != 0)
663  continue;
664 
 /* Build the full path and remove the file; unlink() failure is ignored. */
665  snprintf(fname, sizeof(fname), "%s/%s", directory,
666  entry->d_name);
667  unlink(fname);
668  }
669  FreeDir(dir);
670 }
671 
672 /*
673  * pgstat_reset_all() -
674  *
675  * Remove the stats files. This is currently used only if WAL
676  * recovery is needed after a crash.
677  */
678 void
680 {
683 }
684 
685 #ifdef EXEC_BACKEND
686 
687 /*
688  * pgstat_forkexec() -
689  *
690  * Format up the arglist for, then fork and exec, statistics collector process
691  */
692 static pid_t
693 pgstat_forkexec(void)
694 {
695  char *av[10];
696  int ac = 0;
697 
698  av[ac++] = "postgres";
699  av[ac++] = "--forkcol";
700  av[ac++] = NULL; /* filled in by postmaster_forkexec */
701 
702  av[ac] = NULL;
703  Assert(ac < lengthof(av));
704 
705  return postmaster_forkexec(ac, av);
706 }
707 #endif /* EXEC_BACKEND */
708 
709 
710 /*
711  * pgstat_start() -
712  *
713  * Called from postmaster at startup or after an existing collector
714  * died. Attempt to fire up a fresh statistics collector.
715  *
716  * Returns PID of child process, or 0 if fail.
717  *
718  * Note: if fail, we will be called again from the postmaster main loop.
719  */
720 int
722 {
723  time_t curtime;
724  pid_t pgStatPid;
725 
726  /*
727  * Check that the socket is there, else pgstat_init failed and we can do
728  * nothing useful.
729  */
731  return 0;
732 
733  /*
734  * Do nothing if too soon since last collector start. This is a safety
735  * valve to protect against continuous respawn attempts if the collector
736  * is dying immediately at launch. Note that since we will be re-called
737  * from the postmaster main loop, we will get another chance later.
738  */
739  curtime = time(NULL);
740  if ((unsigned int) (curtime - last_pgstat_start_time) <
741  (unsigned int) PGSTAT_RESTART_INTERVAL)
742  return 0;
743  last_pgstat_start_time = curtime;
744 
745  /*
746  * Okay, fork off the collector.
747  */
748 #ifdef EXEC_BACKEND
749  switch ((pgStatPid = pgstat_forkexec()))
750 #else
751  switch ((pgStatPid = fork_process()))
752 #endif
753  {
754  case -1:
755  ereport(LOG,
756  (errmsg("could not fork statistics collector: %m")));
757  return 0;
758 
759 #ifndef EXEC_BACKEND
760  case 0:
761  /* in postmaster child ... */
763 
764  /* Close the postmaster's sockets */
765  ClosePostmasterPorts(false);
766 
767  /* Drop our connection to postmaster's shared memory, as well */
768  dsm_detach_all();
770 
771  PgstatCollectorMain(0, NULL);
772  break;
773 #endif
774 
775  default:
776  return (int) pgStatPid;
777  }
778 
779  /* shouldn't get here */
780  return 0;
781 }
782 
783 void
785 {
787 }
788 
789 /* ------------------------------------------------------------
790  * Public functions used by backends follow
791  *------------------------------------------------------------
792  */
793 
794 
795 /* ----------
796  * pgstat_report_stat() -
797  *
798  * Must be called by processes that perform DML (tcop/postgres.c, logical
799  * receiver processes, SPI workers, etc.) to send the per-table and function
800  * usage statistics collected so far to the collector. Note that this
801  * is called only when not within a transaction, so it is fair to use
802  * transaction stop time as an approximation of current time.
803  * ----------
804  */
805 void
807 {
808  /* we assume this inits to all zeroes: */
809  static const PgStat_TableCounts all_zeroes;
810  static TimestampTz last_report = 0;
811 
813  PgStat_MsgTabstat regular_msg;
814  PgStat_MsgTabstat shared_msg;
815  TabStatusArray *tsa;
816  int i;
817 
818  /* Don't expend a clock check if nothing to do */
819  if ((pgStatTabList == NULL || pgStatTabList->tsa_used == 0) &&
820  pgStatXactCommit == 0 && pgStatXactRollback == 0 &&
822  return;
823 
824  /*
825  * Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL
826  * msec since we last sent one, or the caller wants to force stats out.
827  */
829  if (!force &&
831  return;
832  last_report = now;
833 
834  /*
835  * Destroy pgStatTabHash before we start invalidating PgStat_TableStatus
836  * entries it points to. (Should we fail partway through the loop below,
837  * it's okay to have removed the hashtable already --- the only
838  * consequence is we'd get multiple entries for the same table in the
839  * pgStatTabList, and that's safe.)
840  */
841  if (pgStatTabHash)
842  hash_destroy(pgStatTabHash);
843  pgStatTabHash = NULL;
844 
845  /*
846  * Scan through the TabStatusArray struct(s) to find tables that actually
847  * have counts, and build messages to send. We have to separate shared
848  * relations from regular ones because the databaseid field in the message
849  * header has to depend on that.
850  */
851  regular_msg.m_databaseid = MyDatabaseId;
852  shared_msg.m_databaseid = InvalidOid;
853  regular_msg.m_nentries = 0;
854  shared_msg.m_nentries = 0;
855 
856  for (tsa = pgStatTabList; tsa != NULL; tsa = tsa->tsa_next)
857  {
858  for (i = 0; i < tsa->tsa_used; i++)
859  {
860  PgStat_TableStatus *entry = &tsa->tsa_entries[i];
861  PgStat_MsgTabstat *this_msg;
862  PgStat_TableEntry *this_ent;
863 
864  /* Shouldn't have any pending transaction-dependent counts */
865  Assert(entry->trans == NULL);
866 
867  /*
868  * Ignore entries that didn't accumulate any actual counts, such
869  * as indexes that were opened by the planner but not used.
870  */
871  if (memcmp(&entry->t_counts, &all_zeroes,
872  sizeof(PgStat_TableCounts)) == 0)
873  continue;
874 
875  /*
876  * OK, insert data into the appropriate message, and send if full.
877  */
878  this_msg = entry->t_shared ? &shared_msg : &regular_msg;
879  this_ent = &this_msg->m_entry[this_msg->m_nentries];
880  this_ent->t_id = entry->t_id;
881  memcpy(&this_ent->t_counts, &entry->t_counts,
882  sizeof(PgStat_TableCounts));
883  if (++this_msg->m_nentries >= PGSTAT_NUM_TABENTRIES)
884  {
885  pgstat_send_tabstat(this_msg);
886  this_msg->m_nentries = 0;
887  }
888  }
889  /* zero out PgStat_TableStatus structs after use */
890  MemSet(tsa->tsa_entries, 0,
891  tsa->tsa_used * sizeof(PgStat_TableStatus));
892  tsa->tsa_used = 0;
893  }
894 
895  /*
896  * Send partial messages. Make sure that any pending xact commit/abort
897  * gets counted, even if there are no table stats to send.
898  */
899  if (regular_msg.m_nentries > 0 ||
901  pgstat_send_tabstat(&regular_msg);
902  if (shared_msg.m_nentries > 0)
903  pgstat_send_tabstat(&shared_msg);
904 
905  /* Now, send function statistics */
907 }
908 
909 /*
910  * Subroutine for pgstat_report_stat: finish and send a tabstat message
911  */
912 static void
914 {
915  int n;
916  int len;
917 
918  /* It's unlikely we'd get here with no socket, but maybe not impossible */
920  return;
921 
922  /*
923  * Report and reset accumulated xact commit/rollback and I/O timings
924  * whenever we send a normal tabstat message
925  */
926  if (OidIsValid(tsmsg->m_databaseid))
927  {
932  pgStatXactCommit = 0;
933  pgStatXactRollback = 0;
936  }
937  else
938  {
939  tsmsg->m_xact_commit = 0;
940  tsmsg->m_xact_rollback = 0;
941  tsmsg->m_block_read_time = 0;
942  tsmsg->m_block_write_time = 0;
943  }
944 
945  n = tsmsg->m_nentries;
946  len = offsetof(PgStat_MsgTabstat, m_entry[0]) +
947  n * sizeof(PgStat_TableEntry);
948 
950  pgstat_send(tsmsg, len);
951 }
952 
953 /*
954  * Subroutine for pgstat_report_stat: populate and send a function stat message
955  */
956 static void
958 {
959  /* we assume this inits to all zeroes: */
960  static const PgStat_FunctionCounts all_zeroes;
961 
962  PgStat_MsgFuncstat msg;
964  HASH_SEQ_STATUS fstat;
965 
966  if (pgStatFunctions == NULL)
967  return;
968 
971  msg.m_nentries = 0;
972 
973  hash_seq_init(&fstat, pgStatFunctions);
974  while ((entry = (PgStat_BackendFunctionEntry *) hash_seq_search(&fstat)) != NULL)
975  {
976  PgStat_FunctionEntry *m_ent;
977 
978  /* Skip it if no counts accumulated since last time */
979  if (memcmp(&entry->f_counts, &all_zeroes,
980  sizeof(PgStat_FunctionCounts)) == 0)
981  continue;
982 
983  /* need to convert format of time accumulators */
984  m_ent = &msg.m_entry[msg.m_nentries];
985  m_ent->f_id = entry->f_id;
986  m_ent->f_numcalls = entry->f_counts.f_numcalls;
989 
990  if (++msg.m_nentries >= PGSTAT_NUM_FUNCENTRIES)
991  {
992  pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) +
993  msg.m_nentries * sizeof(PgStat_FunctionEntry));
994  msg.m_nentries = 0;
995  }
996 
997  /* reset the entry's counts */
998  MemSet(&entry->f_counts, 0, sizeof(PgStat_FunctionCounts));
999  }
1000 
1001  if (msg.m_nentries > 0)
1002  pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) +
1003  msg.m_nentries * sizeof(PgStat_FunctionEntry));
1004 
1005  have_function_stats = false;
1006 }
1007 
1008 
1009 /* ----------
1010  * pgstat_vacuum_stat() -
1011  *
1012  * Tells the collector about objects it can get rid of.
1013  * ----------
1014  */
1015 void
1017 {
1018  HTAB *htab;
1019  PgStat_MsgTabpurge msg;
1020  PgStat_MsgFuncpurge f_msg;
1021  HASH_SEQ_STATUS hstat;
1022  PgStat_StatDBEntry *dbentry;
1023  PgStat_StatTabEntry *tabentry;
1024  PgStat_StatFuncEntry *funcentry;
1025  int len;
1026 
1028  return;
1029 
1030  /*
1031  * If not done for this transaction, read the statistics collector stats
1032  * file into some hash tables.
1033  */
1035 
1036  /*
1037  * Read pg_database and make a list of OIDs of all existing databases
1038  */
1039  htab = pgstat_collect_oids(DatabaseRelationId, Anum_pg_database_oid);
1040 
1041  /*
1042  * Search the database hash table for dead databases and tell the
1043  * collector to drop them.
1044  */
1045  hash_seq_init(&hstat, pgStatDBHash);
1046  while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
1047  {
1048  Oid dbid = dbentry->databaseid;
1049 
1051 
1052  /* the DB entry for shared tables (with InvalidOid) is never dropped */
1053  if (OidIsValid(dbid) &&
1054  hash_search(htab, (void *) &dbid, HASH_FIND, NULL) == NULL)
1055  pgstat_drop_database(dbid);
1056  }
1057 
1058  /* Clean up */
1059  hash_destroy(htab);
1060 
1061  /*
1062  * Lookup our own database entry; if not found, nothing more to do.
1063  */
1064  dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
1065  (void *) &MyDatabaseId,
1066  HASH_FIND, NULL);
1067  if (dbentry == NULL || dbentry->tables == NULL)
1068  return;
1069 
1070  /*
1071  * Similarly to above, make a list of all known relations in this DB.
1072  */
1073  htab = pgstat_collect_oids(RelationRelationId, Anum_pg_class_oid);
1074 
1075  /*
1076  * Initialize our messages table counter to zero
1077  */
1078  msg.m_nentries = 0;
1079 
1080  /*
1081  * Check for all tables listed in stats hashtable if they still exist.
1082  */
1083  hash_seq_init(&hstat, dbentry->tables);
1084  while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&hstat)) != NULL)
1085  {
1086  Oid tabid = tabentry->tableid;
1087 
1089 
1090  if (hash_search(htab, (void *) &tabid, HASH_FIND, NULL) != NULL)
1091  continue;
1092 
1093  /*
1094  * Not there, so add this table's Oid to the message
1095  */
1096  msg.m_tableid[msg.m_nentries++] = tabid;
1097 
1098  /*
1099  * If the message is full, send it out and reinitialize to empty
1100  */
1101  if (msg.m_nentries >= PGSTAT_NUM_TABPURGE)
1102  {
1103  len = offsetof(PgStat_MsgTabpurge, m_tableid[0])
1104  + msg.m_nentries * sizeof(Oid);
1105 
1107  msg.m_databaseid = MyDatabaseId;
1108  pgstat_send(&msg, len);
1109 
1110  msg.m_nentries = 0;
1111  }
1112  }
1113 
1114  /*
1115  * Send the rest
1116  */
1117  if (msg.m_nentries > 0)
1118  {
1119  len = offsetof(PgStat_MsgTabpurge, m_tableid[0])
1120  + msg.m_nentries * sizeof(Oid);
1121 
1123  msg.m_databaseid = MyDatabaseId;
1124  pgstat_send(&msg, len);
1125  }
1126 
1127  /* Clean up */
1128  hash_destroy(htab);
1129 
1130  /*
1131  * Now repeat the above steps for functions. However, we needn't bother
1132  * in the common case where no function stats are being collected.
1133  */
1134  if (dbentry->functions != NULL &&
1135  hash_get_num_entries(dbentry->functions) > 0)
1136  {
1137  htab = pgstat_collect_oids(ProcedureRelationId, Anum_pg_proc_oid);
1138 
1140  f_msg.m_databaseid = MyDatabaseId;
1141  f_msg.m_nentries = 0;
1142 
1143  hash_seq_init(&hstat, dbentry->functions);
1144  while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&hstat)) != NULL)
1145  {
1146  Oid funcid = funcentry->functionid;
1147 
1149 
1150  if (hash_search(htab, (void *) &funcid, HASH_FIND, NULL) != NULL)
1151  continue;
1152 
1153  /*
1154  * Not there, so add this function's Oid to the message
1155  */
1156  f_msg.m_functionid[f_msg.m_nentries++] = funcid;
1157 
1158  /*
1159  * If the message is full, send it out and reinitialize to empty
1160  */
1161  if (f_msg.m_nentries >= PGSTAT_NUM_FUNCPURGE)
1162  {
1163  len = offsetof(PgStat_MsgFuncpurge, m_functionid[0])
1164  + f_msg.m_nentries * sizeof(Oid);
1165 
1166  pgstat_send(&f_msg, len);
1167 
1168  f_msg.m_nentries = 0;
1169  }
1170  }
1171 
1172  /*
1173  * Send the rest
1174  */
1175  if (f_msg.m_nentries > 0)
1176  {
1177  len = offsetof(PgStat_MsgFuncpurge, m_functionid[0])
1178  + f_msg.m_nentries * sizeof(Oid);
1179 
1180  pgstat_send(&f_msg, len);
1181  }
1182 
1183  hash_destroy(htab);
1184  }
1185 }
1186 
1187 
1188 /* ----------
1189  * pgstat_collect_oids() -
1190  *
1191  * Collect the OIDs of all objects listed in the specified system catalog
1192  * into a temporary hash table. Caller should hash_destroy the result
1193  * when done with it. (However, we make the table in CurrentMemoryContext
1194  * so that it will be freed properly in event of an error.)
1195  * ----------
1196  */
1197 static HTAB *
1198 pgstat_collect_oids(Oid catalogid, AttrNumber anum_oid)
1199 {
1200  HTAB *htab;
1201  HASHCTL hash_ctl;
1202  Relation rel;
1203  TableScanDesc scan;
1204  HeapTuple tup;
1205  Snapshot snapshot;
1206 
1207  memset(&hash_ctl, 0, sizeof(hash_ctl));
1208  hash_ctl.keysize = sizeof(Oid);
1209  hash_ctl.entrysize = sizeof(Oid);
1210  hash_ctl.hcxt = CurrentMemoryContext;
1211  htab = hash_create("Temporary table of OIDs",
1213  &hash_ctl,
1215 
1216  rel = table_open(catalogid, AccessShareLock);
1217  snapshot = RegisterSnapshot(GetLatestSnapshot());
1218  scan = table_beginscan(rel, snapshot, 0, NULL);
1219  while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
1220  {
1221  Oid thisoid;
1222  bool isnull;
1223 
1224  thisoid = heap_getattr(tup, anum_oid, RelationGetDescr(rel), &isnull);
1225  Assert(!isnull);
1226 
1228 
1229  (void) hash_search(htab, (void *) &thisoid, HASH_ENTER, NULL);
1230  }
1231  table_endscan(scan);
1232  UnregisterSnapshot(snapshot);
1234 
1235  return htab;
1236 }
1237 
1238 
1239 /* ----------
1240  * pgstat_drop_database() -
1241  *
1242  * Tell the collector that we just dropped a database.
1243  * (If the message gets lost, we will still clean the dead DB eventually
1244  * via future invocations of pgstat_vacuum_stat().)
1245  * ----------
1246  */
1247 void
1249 {
1250  PgStat_MsgDropdb msg;
1251 
1253  return;
1254 
1256  msg.m_databaseid = databaseid;
1257  pgstat_send(&msg, sizeof(msg));
1258 }
1259 
1260 
/* ----------
 * pgstat_drop_relation() -
 *
 *	Tell the collector that we just dropped a relation.
 *	(If the message gets lost, we will still clean the dead entry eventually
 *	via future invocations of pgstat_vacuum_stat().)
 *
 *	Currently not used for lack of any good place to call it; we rely
 *	entirely on pgstat_vacuum_stat() to clean out stats for dead rels.
 * ----------
 */
#ifdef NOT_USED
void
pgstat_drop_relation(Oid relid)
{
	PgStat_MsgTabpurge msg;
	int			len;

	if (pgStatSock == PGINVALID_SOCKET)
		return;

	/* A purge message carrying exactly one table OID */
	msg.m_tableid[0] = relid;
	msg.m_nentries = 1;

	/* Send only the used part of the OID array */
	len = offsetof(PgStat_MsgTabpurge, m_tableid[0]) + sizeof(Oid);

	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
	msg.m_databaseid = MyDatabaseId;
	pgstat_send(&msg, len);
}
#endif							/* NOT_USED */
1292 
1293 
1294 /* ----------
1295  * pgstat_reset_counters() -
1296  *
1297  * Tell the statistics collector to reset counters for our database.
1298  *
1299  * Permission checking for this function is managed through the normal
1300  * GRANT system.
1301  * ----------
1302  */
1303 void
1305 {
1307 
1309  return;
1310 
1312  msg.m_databaseid = MyDatabaseId;
1313  pgstat_send(&msg, sizeof(msg));
1314 }
1315 
1316 /* ----------
1317  * pgstat_reset_shared_counters() -
1318  *
1319  * Tell the statistics collector to reset cluster-wide shared counters.
1320  *
1321  * Permission checking for this function is managed through the normal
1322  * GRANT system.
1323  * ----------
1324  */
1325 void
1326 pgstat_reset_shared_counters(const char *target)
1327 {
1329 
1331  return;
1332 
1333  if (strcmp(target, "archiver") == 0)
1335  else if (strcmp(target, "bgwriter") == 0)
1337  else
1338  ereport(ERROR,
1339  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1340  errmsg("unrecognized reset target: \"%s\"", target),
1341  errhint("Target must be \"archiver\" or \"bgwriter\".")));
1342 
1344  pgstat_send(&msg, sizeof(msg));
1345 }
1346 
1347 /* ----------
1348  * pgstat_reset_single_counter() -
1349  *
1350  * Tell the statistics collector to reset a single counter.
1351  *
1352  * Permission checking for this function is managed through the normal
1353  * GRANT system.
1354  * ----------
1355  */
1356 void
1358 {
1360 
1362  return;
1363 
1365  msg.m_databaseid = MyDatabaseId;
1366  msg.m_resettype = type;
1367  msg.m_objectid = objoid;
1368 
1369  pgstat_send(&msg, sizeof(msg));
1370 }
1371 
1372 /* ----------
1373  * pgstat_report_autovac() -
1374  *
1375  * Called from autovacuum.c to report startup of an autovacuum process.
1376  * We are called before InitPostgres is done, so can't rely on MyDatabaseId;
1377  * the db OID must be passed in, instead.
1378  * ----------
1379  */
1380 void
1382 {
1384 
1386  return;
1387 
1389  msg.m_databaseid = dboid;
1391 
1392  pgstat_send(&msg, sizeof(msg));
1393 }
1394 
1395 
1396 /* ---------
1397  * pgstat_report_vacuum() -
1398  *
1399  * Tell the collector about the table we just vacuumed.
1400  * ---------
1401  */
1402 void
1403 pgstat_report_vacuum(Oid tableoid, bool shared,
1404  PgStat_Counter livetuples, PgStat_Counter deadtuples)
1405 {
1406  PgStat_MsgVacuum msg;
1407 
1409  return;
1410 
1412  msg.m_databaseid = shared ? InvalidOid : MyDatabaseId;
1413  msg.m_tableoid = tableoid;
1416  msg.m_live_tuples = livetuples;
1417  msg.m_dead_tuples = deadtuples;
1418  pgstat_send(&msg, sizeof(msg));
1419 }
1420 
1421 /* --------
1422  * pgstat_report_analyze() -
1423  *
1424  * Tell the collector about the table we just analyzed.
1425  *
1426  * Caller must provide new live- and dead-tuples estimates, as well as a
1427  * flag indicating whether to reset the changes_since_analyze counter.
1428  * --------
1429  */
1430 void
1432  PgStat_Counter livetuples, PgStat_Counter deadtuples,
1433  bool resetcounter)
1434 {
1435  PgStat_MsgAnalyze msg;
1436 
1438  return;
1439 
1440  /*
1441  * Unlike VACUUM, ANALYZE might be running inside a transaction that has
1442  * already inserted and/or deleted rows in the target table. ANALYZE will
1443  * have counted such rows as live or dead respectively. Because we will
1444  * report our counts of such rows at transaction end, we should subtract
1445  * off these counts from what we send to the collector now, else they'll
1446  * be double-counted after commit. (This approach also ensures that the
1447  * collector ends up with the right numbers if we abort instead of
1448  * committing.)
1449  */
1450  if (rel->pgstat_info != NULL)
1451  {
1453 
1454  for (trans = rel->pgstat_info->trans; trans; trans = trans->upper)
1455  {
1456  livetuples -= trans->tuples_inserted - trans->tuples_deleted;
1457  deadtuples -= trans->tuples_updated + trans->tuples_deleted;
1458  }
1459  /* count stuff inserted by already-aborted subxacts, too */
1460  deadtuples -= rel->pgstat_info->t_counts.t_delta_dead_tuples;
1461  /* Since ANALYZE's counts are estimates, we could have underflowed */
1462  livetuples = Max(livetuples, 0);
1463  deadtuples = Max(deadtuples, 0);
1464  }
1465 
1467  msg.m_databaseid = rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId;
1468  msg.m_tableoid = RelationGetRelid(rel);
1470  msg.m_resetcounter = resetcounter;
1472  msg.m_live_tuples = livetuples;
1473  msg.m_dead_tuples = deadtuples;
1474  pgstat_send(&msg, sizeof(msg));
1475 }
1476 
1477 /* --------
1478  * pgstat_report_recovery_conflict() -
1479  *
1480  * Tell the collector about a Hot Standby recovery conflict.
1481  * --------
1482  */
1483 void
1485 {
1487 
1489  return;
1490 
1492  msg.m_databaseid = MyDatabaseId;
1493  msg.m_reason = reason;
1494  pgstat_send(&msg, sizeof(msg));
1495 }
1496 
1497 /* --------
1498  * pgstat_report_deadlock() -
1499  *
1500  * Tell the collector about a deadlock detected.
1501  * --------
1502  */
1503 void
1505 {
1506  PgStat_MsgDeadlock msg;
1507 
1509  return;
1510 
1512  msg.m_databaseid = MyDatabaseId;
1513  pgstat_send(&msg, sizeof(msg));
1514 }
1515 
1516 
1517 
1518 /* --------
1519  * pgstat_report_checksum_failures_in_db() -
1520  *
1521  * Tell the collector about one or more checksum failures.
1522  * --------
1523  */
1524 void
1526 {
1528 
1530  return;
1531 
1533  msg.m_databaseid = dboid;
1534  msg.m_failurecount = failurecount;
1536 
1537  pgstat_send(&msg, sizeof(msg));
1538 }
1539 
1540 /* --------
1541  * pgstat_report_checksum_failure() -
1542  *
1543  * Tell the collector about a checksum failure.
1544  * --------
1545  */
1546 void
1548 {
1550 }
1551 
1552 /* --------
1553  * pgstat_report_tempfile() -
1554  *
1555  * Tell the collector about a temporary file.
1556  * --------
1557  */
1558 void
1559 pgstat_report_tempfile(size_t filesize)
1560 {
1561  PgStat_MsgTempFile msg;
1562 
1564  return;
1565 
1567  msg.m_databaseid = MyDatabaseId;
1568  msg.m_filesize = filesize;
1569  pgstat_send(&msg, sizeof(msg));
1570 }
1571 
1572 
1573 /* ----------
1574  * pgstat_ping() -
1575  *
1576  * Send some junk data to the collector to increase traffic.
1577  * ----------
1578  */
1579 void
1581 {
1582  PgStat_MsgDummy msg;
1583 
1585  return;
1586 
1588  pgstat_send(&msg, sizeof(msg));
1589 }
1590 
1591 /* ----------
1592  * pgstat_send_inquiry() -
1593  *
1594  * Notify collector that we need fresh data.
1595  * ----------
1596  */
1597 static void
1598 pgstat_send_inquiry(TimestampTz clock_time, TimestampTz cutoff_time, Oid databaseid)
1599 {
1600  PgStat_MsgInquiry msg;
1601 
1603  msg.clock_time = clock_time;
1604  msg.cutoff_time = cutoff_time;
1605  msg.databaseid = databaseid;
1606  pgstat_send(&msg, sizeof(msg));
1607 }
1608 
1609 
1610 /*
1611  * Initialize function call usage data.
1612  * Called by the executor before invoking a function.
1613  */
1614 void
1617 {
1618  PgStat_BackendFunctionEntry *htabent;
1619  bool found;
1620 
1621  if (pgstat_track_functions <= fcinfo->flinfo->fn_stats)
1622  {
1623  /* stats not wanted */
1624  fcu->fs = NULL;
1625  return;
1626  }
1627 
1628  if (!pgStatFunctions)
1629  {
1630  /* First time through - initialize function stat table */
1631  HASHCTL hash_ctl;
1632 
1633  memset(&hash_ctl, 0, sizeof(hash_ctl));
1634  hash_ctl.keysize = sizeof(Oid);
1635  hash_ctl.entrysize = sizeof(PgStat_BackendFunctionEntry);
1636  pgStatFunctions = hash_create("Function stat entries",
1638  &hash_ctl,
1639  HASH_ELEM | HASH_BLOBS);
1640  }
1641 
1642  /* Get the stats entry for this function, create if necessary */
1643  htabent = hash_search(pgStatFunctions, &fcinfo->flinfo->fn_oid,
1644  HASH_ENTER, &found);
1645  if (!found)
1646  MemSet(&htabent->f_counts, 0, sizeof(PgStat_FunctionCounts));
1647 
1648  fcu->fs = &htabent->f_counts;
1649 
1650  /* save stats for this function, later used to compensate for recursion */
1651  fcu->save_f_total_time = htabent->f_counts.f_total_time;
1652 
1653  /* save current backend-wide total time */
1654  fcu->save_total = total_func_time;
1655 
1656  /* get clock time as of function start */
1658 }
1659 
1660 /*
1661  * find_funcstat_entry - find any existing PgStat_BackendFunctionEntry entry
1662  * for specified function
1663  *
1664  * If no entry, return NULL, don't create a new one
1665  */
1668 {
1669  if (pgStatFunctions == NULL)
1670  return NULL;
1671 
1672  return (PgStat_BackendFunctionEntry *) hash_search(pgStatFunctions,
1673  (void *) &func_id,
1674  HASH_FIND, NULL);
1675 }
1676 
1677 /*
1678  * Calculate function call usage and update stat counters.
1679  * Called by the executor after invoking a function.
1680  *
1681  * In the case of a set-returning function that runs in value-per-call mode,
1682  * we will see multiple pgstat_init_function_usage/pgstat_end_function_usage
1683  * calls for what the user considers a single call of the function. The
1684  * finalize flag should be TRUE on the last call.
1685  */
1686 void
1688 {
1689  PgStat_FunctionCounts *fs = fcu->fs;
1690  instr_time f_total;
1691  instr_time f_others;
1692  instr_time f_self;
1693 
1694  /* stats not wanted? */
1695  if (fs == NULL)
1696  return;
1697 
1698  /* total elapsed time in this function call */
1699  INSTR_TIME_SET_CURRENT(f_total);
1700  INSTR_TIME_SUBTRACT(f_total, fcu->f_start);
1701 
1702  /* self usage: elapsed minus anything already charged to other calls */
1703  f_others = total_func_time;
1704  INSTR_TIME_SUBTRACT(f_others, fcu->save_total);
1705  f_self = f_total;
1706  INSTR_TIME_SUBTRACT(f_self, f_others);
1707 
1708  /* update backend-wide total time */
1710 
1711  /*
1712  * Compute the new f_total_time as the total elapsed time added to the
1713  * pre-call value of f_total_time. This is necessary to avoid
1714  * double-counting any time taken by recursive calls of myself. (We do
1715  * not need any similar kluge for self time, since that already excludes
1716  * any recursive calls.)
1717  */
1718  INSTR_TIME_ADD(f_total, fcu->save_f_total_time);
1719 
1720  /* update counters in function stats table */
1721  if (finalize)
1722  fs->f_numcalls++;
1723  fs->f_total_time = f_total;
1724  INSTR_TIME_ADD(fs->f_self_time, f_self);
1725 
1726  /* indicate that we have something to send */
1727  have_function_stats = true;
1728 }
1729 
1730 
1731 /* ----------
1732  * pgstat_initstats() -
1733  *
1734  * Initialize a relcache entry to count access statistics.
1735  * Called whenever a relation is opened.
1736  *
1737  * We assume that a relcache entry's pgstat_info field is zeroed by
1738  * relcache.c when the relcache entry is made; thereafter it is long-lived
1739  * data. We can avoid repeated searches of the TabStatus arrays when the
1740  * same relation is touched repeatedly within a transaction.
1741  * ----------
1742  */
1743 void
1745 {
1746  Oid rel_id = rel->rd_id;
1747  char relkind = rel->rd_rel->relkind;
1748 
1749  /* We only count stats for things that have storage */
1750  if (!(relkind == RELKIND_RELATION ||
1751  relkind == RELKIND_MATVIEW ||
1752  relkind == RELKIND_INDEX ||
1753  relkind == RELKIND_TOASTVALUE ||
1754  relkind == RELKIND_SEQUENCE))
1755  {
1756  rel->pgstat_info = NULL;
1757  return;
1758  }
1759 
1761  {
1762  /* We're not counting at all */
1763  rel->pgstat_info = NULL;
1764  return;
1765  }
1766 
1767  /*
1768  * If we already set up this relation in the current transaction, nothing
1769  * to do.
1770  */
1771  if (rel->pgstat_info != NULL &&
1772  rel->pgstat_info->t_id == rel_id)
1773  return;
1774 
1775  /* Else find or make the PgStat_TableStatus entry, and update link */
1776  rel->pgstat_info = get_tabstat_entry(rel_id, rel->rd_rel->relisshared);
1777 }
1778 
1779 /*
1780  * get_tabstat_entry - find or create a PgStat_TableStatus entry for rel
1781  */
1782 static PgStat_TableStatus *
1783 get_tabstat_entry(Oid rel_id, bool isshared)
1784 {
1785  TabStatHashEntry *hash_entry;
1786  PgStat_TableStatus *entry;
1787  TabStatusArray *tsa;
1788  bool found;
1789 
1790  /*
1791  * Create hash table if we don't have it already.
1792  */
1793  if (pgStatTabHash == NULL)
1794  {
1795  HASHCTL ctl;
1796 
1797  memset(&ctl, 0, sizeof(ctl));
1798  ctl.keysize = sizeof(Oid);
1799  ctl.entrysize = sizeof(TabStatHashEntry);
1800 
1801  pgStatTabHash = hash_create("pgstat TabStatusArray lookup hash table",
1803  &ctl,
1804  HASH_ELEM | HASH_BLOBS);
1805  }
1806 
1807  /*
1808  * Find an entry or create a new one.
1809  */
1810  hash_entry = hash_search(pgStatTabHash, &rel_id, HASH_ENTER, &found);
1811  if (!found)
1812  {
1813  /* initialize new entry with null pointer */
1814  hash_entry->tsa_entry = NULL;
1815  }
1816 
1817  /*
1818  * If entry is already valid, we're done.
1819  */
1820  if (hash_entry->tsa_entry)
1821  return hash_entry->tsa_entry;
1822 
1823  /*
1824  * Locate the first pgStatTabList entry with free space, making a new list
1825  * entry if needed. Note that we could get an OOM failure here, but if so
1826  * we have left the hashtable and the list in a consistent state.
1827  */
1828  if (pgStatTabList == NULL)
1829  {
1830  /* Set up first pgStatTabList entry */
1831  pgStatTabList = (TabStatusArray *)
1833  sizeof(TabStatusArray));
1834  }
1835 
1836  tsa = pgStatTabList;
1837  while (tsa->tsa_used >= TABSTAT_QUANTUM)
1838  {
1839  if (tsa->tsa_next == NULL)
1840  tsa->tsa_next = (TabStatusArray *)
1842  sizeof(TabStatusArray));
1843  tsa = tsa->tsa_next;
1844  }
1845 
1846  /*
1847  * Allocate a PgStat_TableStatus entry within this list entry. We assume
1848  * the entry was already zeroed, either at creation or after last use.
1849  */
1850  entry = &tsa->tsa_entries[tsa->tsa_used++];
1851  entry->t_id = rel_id;
1852  entry->t_shared = isshared;
1853 
1854  /*
1855  * Now we can fill the entry in pgStatTabHash.
1856  */
1857  hash_entry->tsa_entry = entry;
1858 
1859  return entry;
1860 }
1861 
1862 /*
1863  * find_tabstat_entry - find any existing PgStat_TableStatus entry for rel
1864  *
1865  * If no entry, return NULL, don't create a new one
1866  *
1867  * Note: if we got an error in the most recent execution of pgstat_report_stat,
1868  * it's possible that an entry exists but there's no hashtable entry for it.
1869  * That's okay, we'll treat this case as "doesn't exist".
1870  */
1873 {
1874  TabStatHashEntry *hash_entry;
1875 
1876  /* If hashtable doesn't exist, there are no entries at all */
1877  if (!pgStatTabHash)
1878  return NULL;
1879 
1880  hash_entry = hash_search(pgStatTabHash, &rel_id, HASH_FIND, NULL);
1881  if (!hash_entry)
1882  return NULL;
1883 
1884  /* Note that this step could also return NULL, but that's correct */
1885  return hash_entry->tsa_entry;
1886 }
1887 
1888 /*
1889  * get_tabstat_stack_level - add a new (sub)transaction stack entry if needed
1890  */
1891 static PgStat_SubXactStatus *
1893 {
1894  PgStat_SubXactStatus *xact_state;
1895 
1896  xact_state = pgStatXactStack;
1897  if (xact_state == NULL || xact_state->nest_level != nest_level)
1898  {
1899  xact_state = (PgStat_SubXactStatus *)
1901  sizeof(PgStat_SubXactStatus));
1902  xact_state->nest_level = nest_level;
1903  xact_state->prev = pgStatXactStack;
1904  xact_state->first = NULL;
1905  pgStatXactStack = xact_state;
1906  }
1907  return xact_state;
1908 }
1909 
1910 /*
1911  * add_tabstat_xact_level - add a new (sub)transaction state record
1912  */
1913 static void
1914 add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level)
1915 {
1916  PgStat_SubXactStatus *xact_state;
1918 
1919  /*
1920  * If this is the first rel to be modified at the current nest level, we
1921  * first have to push a transaction stack entry.
1922  */
1923  xact_state = get_tabstat_stack_level(nest_level);
1924 
1925  /* Now make a per-table stack entry */
1926  trans = (PgStat_TableXactStatus *)
1928  sizeof(PgStat_TableXactStatus));
1929  trans->nest_level = nest_level;
1930  trans->upper = pgstat_info->trans;
1931  trans->parent = pgstat_info;
1932  trans->next = xact_state->first;
1933  xact_state->first = trans;
1934  pgstat_info->trans = trans;
1935 }
1936 
1937 /*
1938  * pgstat_count_heap_insert - count a tuple insertion of n tuples
1939  */
1940 void
1942 {
1943  PgStat_TableStatus *pgstat_info = rel->pgstat_info;
1944 
1945  if (pgstat_info != NULL)
1946  {
1947  /* We have to log the effect at the proper transactional level */
1948  int nest_level = GetCurrentTransactionNestLevel();
1949 
1950  if (pgstat_info->trans == NULL ||
1951  pgstat_info->trans->nest_level != nest_level)
1952  add_tabstat_xact_level(pgstat_info, nest_level);
1953 
1954  pgstat_info->trans->tuples_inserted += n;
1955  }
1956 }
1957 
1958 /*
1959  * pgstat_count_heap_update - count a tuple update
1960  */
1961 void
1963 {
1964  PgStat_TableStatus *pgstat_info = rel->pgstat_info;
1965 
1966  if (pgstat_info != NULL)
1967  {
1968  /* We have to log the effect at the proper transactional level */
1969  int nest_level = GetCurrentTransactionNestLevel();
1970 
1971  if (pgstat_info->trans == NULL ||
1972  pgstat_info->trans->nest_level != nest_level)
1973  add_tabstat_xact_level(pgstat_info, nest_level);
1974 
1975  pgstat_info->trans->tuples_updated++;
1976 
1977  /* t_tuples_hot_updated is nontransactional, so just advance it */
1978  if (hot)
1979  pgstat_info->t_counts.t_tuples_hot_updated++;
1980  }
1981 }
1982 
1983 /*
1984  * pgstat_count_heap_delete - count a tuple deletion
1985  */
1986 void
1988 {
1989  PgStat_TableStatus *pgstat_info = rel->pgstat_info;
1990 
1991  if (pgstat_info != NULL)
1992  {
1993  /* We have to log the effect at the proper transactional level */
1994  int nest_level = GetCurrentTransactionNestLevel();
1995 
1996  if (pgstat_info->trans == NULL ||
1997  pgstat_info->trans->nest_level != nest_level)
1998  add_tabstat_xact_level(pgstat_info, nest_level);
1999 
2000  pgstat_info->trans->tuples_deleted++;
2001  }
2002 }
2003 
2004 /*
2005  * pgstat_truncate_save_counters
2006  *
2007  * Whenever a table is truncated, we save its i/u/d counters so that they can
2008  * be cleared, and if the (sub)xact that executed the truncate later aborts,
2009  * the counters can be restored to the saved (pre-truncate) values. Note we do
2010  * this on the first truncate in any particular subxact level only.
2011  */
2012 static void
2014 {
2015  if (!trans->truncated)
2016  {
2017  trans->inserted_pre_trunc = trans->tuples_inserted;
2018  trans->updated_pre_trunc = trans->tuples_updated;
2019  trans->deleted_pre_trunc = trans->tuples_deleted;
2020  trans->truncated = true;
2021  }
2022 }
2023 
2024 /*
2025  * pgstat_truncate_restore_counters - restore counters when a truncate aborts
2026  */
2027 static void
2029 {
2030  if (trans->truncated)
2031  {
2032  trans->tuples_inserted = trans->inserted_pre_trunc;
2033  trans->tuples_updated = trans->updated_pre_trunc;
2034  trans->tuples_deleted = trans->deleted_pre_trunc;
2035  }
2036 }
2037 
2038 /*
2039  * pgstat_count_truncate - update tuple counters due to truncate
2040  */
2041 void
2043 {
2044  PgStat_TableStatus *pgstat_info = rel->pgstat_info;
2045 
2046  if (pgstat_info != NULL)
2047  {
2048  /* We have to log the effect at the proper transactional level */
2049  int nest_level = GetCurrentTransactionNestLevel();
2050 
2051  if (pgstat_info->trans == NULL ||
2052  pgstat_info->trans->nest_level != nest_level)
2053  add_tabstat_xact_level(pgstat_info, nest_level);
2054 
2055  pgstat_truncate_save_counters(pgstat_info->trans);
2056  pgstat_info->trans->tuples_inserted = 0;
2057  pgstat_info->trans->tuples_updated = 0;
2058  pgstat_info->trans->tuples_deleted = 0;
2059  }
2060 }
2061 
2062 /*
2063  * pgstat_update_heap_dead_tuples - update dead-tuples count
2064  *
2065  * The semantics of this are that we are reporting the nontransactional
2066  * recovery of "delta" dead tuples; so t_delta_dead_tuples decreases
2067  * rather than increasing, and the change goes straight into the per-table
2068  * counter, not into transactional state.
2069  */
2070 void
2072 {
2073  PgStat_TableStatus *pgstat_info = rel->pgstat_info;
2074 
2075  if (pgstat_info != NULL)
2076  pgstat_info->t_counts.t_delta_dead_tuples -= delta;
2077 }
2078 
2079 
2080 /* ----------
2081  * AtEOXact_PgStat
2082  *
2083  * Called from access/transam/xact.c at top-level transaction commit/abort.
2084  * ----------
2085  */
2086 void
2087 AtEOXact_PgStat(bool isCommit, bool parallel)
2088 {
2089  PgStat_SubXactStatus *xact_state;
2090 
2091  /* Don't count parallel worker transaction stats */
2092  if (!parallel)
2093  {
2094  /*
2095  * Count transaction commit or abort. (We use counters, not just
2096  * bools, in case the reporting message isn't sent right away.)
2097  */
2098  if (isCommit)
2099  pgStatXactCommit++;
2100  else
2102  }
2103 
2104  /*
2105  * Transfer transactional insert/update counts into the base tabstat
2106  * entries. We don't bother to free any of the transactional state, since
2107  * it's all in TopTransactionContext and will go away anyway.
2108  */
2109  xact_state = pgStatXactStack;
2110  if (xact_state != NULL)
2111  {
2113 
2114  Assert(xact_state->nest_level == 1);
2115  Assert(xact_state->prev == NULL);
2116  for (trans = xact_state->first; trans != NULL; trans = trans->next)
2117  {
2118  PgStat_TableStatus *tabstat;
2119 
2120  Assert(trans->nest_level == 1);
2121  Assert(trans->upper == NULL);
2122  tabstat = trans->parent;
2123  Assert(tabstat->trans == trans);
2124  /* restore pre-truncate stats (if any) in case of aborted xact */
2125  if (!isCommit)
2127  /* count attempted actions regardless of commit/abort */
2128  tabstat->t_counts.t_tuples_inserted += trans->tuples_inserted;
2129  tabstat->t_counts.t_tuples_updated += trans->tuples_updated;
2130  tabstat->t_counts.t_tuples_deleted += trans->tuples_deleted;
2131  if (isCommit)
2132  {
2133  tabstat->t_counts.t_truncated = trans->truncated;
2134  if (trans->truncated)
2135  {
2136  /* forget live/dead stats seen by backend thus far */
2137  tabstat->t_counts.t_delta_live_tuples = 0;
2138  tabstat->t_counts.t_delta_dead_tuples = 0;
2139  }
2140  /* insert adds a live tuple, delete removes one */
2141  tabstat->t_counts.t_delta_live_tuples +=
2142  trans->tuples_inserted - trans->tuples_deleted;
2143  /* update and delete each create a dead tuple */
2144  tabstat->t_counts.t_delta_dead_tuples +=
2145  trans->tuples_updated + trans->tuples_deleted;
2146  /* insert, update, delete each count as one change event */
2147  tabstat->t_counts.t_changed_tuples +=
2148  trans->tuples_inserted + trans->tuples_updated +
2149  trans->tuples_deleted;
2150  }
2151  else
2152  {
2153  /* inserted tuples are dead, deleted tuples are unaffected */
2154  tabstat->t_counts.t_delta_dead_tuples +=
2155  trans->tuples_inserted + trans->tuples_updated;
2156  /* an aborted xact generates no changed_tuple events */
2157  }
2158  tabstat->trans = NULL;
2159  }
2160  }
2161  pgStatXactStack = NULL;
2162 
2163  /* Make sure any stats snapshot is thrown away */
2165 }
2166 
2167 /* ----------
2168  * AtEOSubXact_PgStat
2169  *
2170  * Called from access/transam/xact.c at subtransaction commit/abort.
2171  * ----------
2172  */
2173 void
2174 AtEOSubXact_PgStat(bool isCommit, int nestDepth)
2175 {
2176  PgStat_SubXactStatus *xact_state;
2177 
2178  /*
2179  * Transfer transactional insert/update counts into the next higher
2180  * subtransaction state.
2181  */
2182  xact_state = pgStatXactStack;
2183  if (xact_state != NULL &&
2184  xact_state->nest_level >= nestDepth)
2185  {
2187  PgStat_TableXactStatus *next_trans;
2188 
2189  /* delink xact_state from stack immediately to simplify reuse case */
2190  pgStatXactStack = xact_state->prev;
2191 
2192  for (trans = xact_state->first; trans != NULL; trans = next_trans)
2193  {
2194  PgStat_TableStatus *tabstat;
2195 
2196  next_trans = trans->next;
2197  Assert(trans->nest_level == nestDepth);
2198  tabstat = trans->parent;
2199  Assert(tabstat->trans == trans);
2200  if (isCommit)
2201  {
2202  if (trans->upper && trans->upper->nest_level == nestDepth - 1)
2203  {
2204  if (trans->truncated)
2205  {
2206  /* propagate the truncate status one level up */
2208  /* replace upper xact stats with ours */
2209  trans->upper->tuples_inserted = trans->tuples_inserted;
2210  trans->upper->tuples_updated = trans->tuples_updated;
2211  trans->upper->tuples_deleted = trans->tuples_deleted;
2212  }
2213  else
2214  {
2215  trans->upper->tuples_inserted += trans->tuples_inserted;
2216  trans->upper->tuples_updated += trans->tuples_updated;
2217  trans->upper->tuples_deleted += trans->tuples_deleted;
2218  }
2219  tabstat->trans = trans->upper;
2220  pfree(trans);
2221  }
2222  else
2223  {
2224  /*
2225  * When there isn't an immediate parent state, we can just
2226  * reuse the record instead of going through a
2227  * palloc/pfree pushup (this works since it's all in
2228  * TopTransactionContext anyway). We have to re-link it
2229  * into the parent level, though, and that might mean
2230  * pushing a new entry into the pgStatXactStack.
2231  */
2232  PgStat_SubXactStatus *upper_xact_state;
2233 
2234  upper_xact_state = get_tabstat_stack_level(nestDepth - 1);
2235  trans->next = upper_xact_state->first;
2236  upper_xact_state->first = trans;
2237  trans->nest_level = nestDepth - 1;
2238  }
2239  }
2240  else
2241  {
2242  /*
2243  * On abort, update top-level tabstat counts, then forget the
2244  * subtransaction
2245  */
2246 
2247  /* first restore values obliterated by truncate */
2249  /* count attempted actions regardless of commit/abort */
2250  tabstat->t_counts.t_tuples_inserted += trans->tuples_inserted;
2251  tabstat->t_counts.t_tuples_updated += trans->tuples_updated;
2252  tabstat->t_counts.t_tuples_deleted += trans->tuples_deleted;
2253  /* inserted tuples are dead, deleted tuples are unaffected */
2254  tabstat->t_counts.t_delta_dead_tuples +=
2255  trans->tuples_inserted + trans->tuples_updated;
2256  tabstat->trans = trans->upper;
2257  pfree(trans);
2258  }
2259  }
2260  pfree(xact_state);
2261  }
2262 }
2263 
2264 
2265 /*
2266  * AtPrepare_PgStat
2267  * Save the transactional stats state at 2PC transaction prepare.
2268  *
2269  * In this phase we just generate 2PC records for all the pending
2270  * transaction-dependent stats work.
2271  */
2272 void
2274 {
2275  PgStat_SubXactStatus *xact_state;
2276 
2277  xact_state = pgStatXactStack;
2278  if (xact_state != NULL)
2279  {
2281 
2282  Assert(xact_state->nest_level == 1);
2283  Assert(xact_state->prev == NULL);
2284  for (trans = xact_state->first; trans != NULL; trans = trans->next)
2285  {
2286  PgStat_TableStatus *tabstat;
2287  TwoPhasePgStatRecord record;
2288 
2289  Assert(trans->nest_level == 1);
2290  Assert(trans->upper == NULL);
2291  tabstat = trans->parent;
2292  Assert(tabstat->trans == trans);
2293 
2294  record.tuples_inserted = trans->tuples_inserted;
2295  record.tuples_updated = trans->tuples_updated;
2296  record.tuples_deleted = trans->tuples_deleted;
2297  record.inserted_pre_trunc = trans->inserted_pre_trunc;
2298  record.updated_pre_trunc = trans->updated_pre_trunc;
2299  record.deleted_pre_trunc = trans->deleted_pre_trunc;
2300  record.t_id = tabstat->t_id;
2301  record.t_shared = tabstat->t_shared;
2302  record.t_truncated = trans->truncated;
2303 
2305  &record, sizeof(TwoPhasePgStatRecord));
2306  }
2307  }
2308 }
2309 
2310 /*
2311  * PostPrepare_PgStat
2312  * Clean up after successful PREPARE.
2313  *
2314  * All we need do here is unlink the transaction stats state from the
2315  * nontransactional state. The nontransactional action counts will be
2316  * reported to the stats collector immediately, while the effects on live
2317  * and dead tuple counts are preserved in the 2PC state file.
2318  *
2319  * Note: AtEOXact_PgStat is not called during PREPARE.
2320  */
2321 void
2323 {
2324  PgStat_SubXactStatus *xact_state;
2325 
2326  /*
2327  * We don't bother to free any of the transactional state, since it's all
2328  * in TopTransactionContext and will go away anyway.
2329  */
2330  xact_state = pgStatXactStack;
2331  if (xact_state != NULL)
2332  {
2334 
2335  for (trans = xact_state->first; trans != NULL; trans = trans->next)
2336  {
2337  PgStat_TableStatus *tabstat;
2338 
 /* Detach the table's entry from its transactional state */
2339  tabstat = trans->parent;
2340  tabstat->trans = NULL;
2341  }
2342  }
 /* Forget the whole stack; it is reset even if it was already NULL */
2343  pgStatXactStack = NULL;
2344 
2345  /* Make sure any stats snapshot is thrown away */
2347 }
2348 
2349 /*
2350  * 2PC processing routine for COMMIT PREPARED case.
2351  *
2352  * Load the saved counts into our local pgstats state.
 *
 * recdata is interpreted as a TwoPhasePgStatRecord.  Its insert/update/delete
 * counts are added to the table's action counters, and the live/dead/changed
 * tuple deltas are adjusted as:
 *   live    += inserted - deleted
 *   dead    += updated + deleted
 *   changed += inserted + updated + deleted
 * If the record says the table was truncated, the live/dead deltas
 * accumulated so far are zeroed before those adjustments are applied.
2353  */
2354 void
2356  void *recdata, uint32 len)
2357 {
2358  TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
2359  PgStat_TableStatus *pgstat_info;
2360 
2361  /* Find or create a tabstat entry for the rel */
2362  pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared);
2363 
2364  /* Same math as in AtEOXact_PgStat, commit case */
2365  pgstat_info->t_counts.t_tuples_inserted += rec->tuples_inserted;
2366  pgstat_info->t_counts.t_tuples_updated += rec->tuples_updated;
2367  pgstat_info->t_counts.t_tuples_deleted += rec->tuples_deleted;
 /* The truncated flag is overwritten, not OR'ed, with the record's value */
2368  pgstat_info->t_counts.t_truncated = rec->t_truncated;
2369  if (rec->t_truncated)
2370  {
2371  /* forget live/dead stats seen by backend thus far */
2372  pgstat_info->t_counts.t_delta_live_tuples = 0;
2373  pgstat_info->t_counts.t_delta_dead_tuples = 0;
2374  }
2375  pgstat_info->t_counts.t_delta_live_tuples +=
2376  rec->tuples_inserted - rec->tuples_deleted;
2377  pgstat_info->t_counts.t_delta_dead_tuples +=
2378  rec->tuples_updated + rec->tuples_deleted;
2379  pgstat_info->t_counts.t_changed_tuples +=
2380  rec->tuples_inserted + rec->tuples_updated +
2381  rec->tuples_deleted;
2382 }
2383 
2384 /*
2385  * 2PC processing routine for ROLLBACK PREPARED case.
2386  *
2387  * Load the saved counts into our local pgstats state, but treat them
2388  * as aborted.
 *
 * recdata is interpreted as a TwoPhasePgStatRecord.  If the record is marked
 * truncated, its counts are first replaced (in the record itself) by the
 * pre-truncate values.  The counts are then added to the table's action
 * counters, and inserted + updated tuples are counted as dead.  The live
 * tuple delta is not touched here.
2389  */
2390 void
2392  void *recdata, uint32 len)
2393 {
2394  TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
2395  PgStat_TableStatus *pgstat_info;
2396 
2397  /* Find or create a tabstat entry for the rel */
2398  pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared);
2399 
2400  /* Same math as in AtEOXact_PgStat, abort case */
2401  if (rec->t_truncated)
2402  {
 /* go back to the counts saved from before the truncation */
2403  rec->tuples_inserted = rec->inserted_pre_trunc;
2404  rec->tuples_updated = rec->updated_pre_trunc;
2405  rec->tuples_deleted = rec->deleted_pre_trunc;
2406  }
2407  pgstat_info->t_counts.t_tuples_inserted += rec->tuples_inserted;
2408  pgstat_info->t_counts.t_tuples_updated += rec->tuples_updated;
2409  pgstat_info->t_counts.t_tuples_deleted += rec->tuples_deleted;
2410  pgstat_info->t_counts.t_delta_dead_tuples +=
2411  rec->tuples_inserted + rec->tuples_updated;
2412 }
2413 
2414 
2415 /* ----------
2416  * pgstat_fetch_stat_dbentry() -
2417  *
2418  * Support function for the SQL-callable pgstat* functions. Returns
2419  * the collected statistics for one database or NULL. NULL doesn't mean
2420  * that the database doesn't exist, it is just not yet known by the
2421  * collector, so the caller is better off to report ZERO instead.
 *
 * The entry is looked up in pgStatDBHash with dbid as the key, using
 * HASH_FIND.
2422  * ----------
2423  */
2426 {
2427  /*
2428  * If not done for this transaction, read the statistics collector stats
2429  * file into some hash tables.
2430  */
2432 
2433  /*
2434  * Lookup the requested database; return NULL if not found
2435  */
2436  return (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
2437  (void *) &dbid,
2438  HASH_FIND, NULL);
2439 }
2440 
2441 
2442 /* ----------
2443  * pgstat_fetch_stat_tabentry() -
2444  *
2445  * Support function for the SQL-callable pgstat* functions. Returns
2446  * the collected statistics for one table or NULL. NULL doesn't mean
2447  * that the table doesn't exist, it is just not yet known by the
2448  * collector, so the caller is better off to report ZERO instead.
 *
 * The table is searched for first under the current database
 * (MyDatabaseId) and then under the InvalidOid database entry, which is
 * where shared tables are looked for.  The first match wins.
2449  * ----------
2450  */
2453 {
2454  Oid dbid;
2455  PgStat_StatDBEntry *dbentry;
2456  PgStat_StatTabEntry *tabentry;
2457 
2458  /*
2459  * If not done for this transaction, read the statistics collector stats
2460  * file into some hash tables.
2461  */
2463 
2464  /*
2465  * Lookup our database, then look in its table hash table.
2466  */
2467  dbid = MyDatabaseId;
2468  dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
2469  (void *) &dbid,
2470  HASH_FIND, NULL);
2471  if (dbentry != NULL && dbentry->tables != NULL)
2472  {
2473  tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
2474  (void *) &relid,
2475  HASH_FIND, NULL);
2476  if (tabentry)
2477  return tabentry;
2478  }
2479 
2480  /*
2481  * If we didn't find it, maybe it's a shared table.
2482  */
2483  dbid = InvalidOid;
2484  dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
2485  (void *) &dbid,
2486  HASH_FIND, NULL);
2487  if (dbentry != NULL && dbentry->tables != NULL)
2488  {
2489  tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
2490  (void *) &relid,
2491  HASH_FIND, NULL);
2492  if (tabentry)
2493  return tabentry;
2494  }
2495 
 /* Not known under either database entry */
2496  return NULL;
2497 }
2498 
2499 
2500 /* ----------
2501  * pgstat_fetch_stat_funcentry() -
2502  *
2503  * Support function for the SQL-callable pgstat* functions. Returns
2504  * the collected statistics for one function or NULL.
 *
 * NULL is returned when there is no database entry, when that entry has
 * no function hash table, or when func_id is not present in the table.
2505  * ----------
2506  */
2509 {
2510  PgStat_StatDBEntry *dbentry;
2511  PgStat_StatFuncEntry *funcentry = NULL;
2512 
2513  /* load the stats file if needed */
2515 
2516  /* Lookup our database, then find the requested function. */
2518  if (dbentry != NULL && dbentry->functions != NULL)
2519  {
2520  funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions,
2521  (void *) &func_id,
2522  HASH_FIND, NULL);
2523  }
2524 
2525  return funcentry;
2526 }
2527 
2528 
2529 /* ----------
2530  * pgstat_fetch_stat_beentry() -
2531  *
2532  * Support function for the SQL-callable pgstat* functions. Returns
2533  * our local copy of the current-activity entry for one backend.
 *
 * beid counts from 1 and indexes the local status table, so valid values
 * are 1 .. localNumBackends.  NULL is returned for anything outside that
 * range.
2534  *
2535  * NB: caller is responsible for a check if the user is permitted to see
2536  * this info (especially the querystring).
2537  * ----------
2538  */
2541 {
2543 
2544  if (beid < 1 || beid > localNumBackends)
2545  return NULL;
2546 
2547  return &localBackendStatusTable[beid - 1].backendStatus;
2548 }
2549 
2550 
2551 /* ----------
2552  * pgstat_fetch_stat_local_beentry() -
2553  *
2554  * Like pgstat_fetch_stat_beentry() but with locally computed additions (like
2555  * xid and xmin values of the backend)
 *
 * Returns a pointer to the whole local table element rather than only its
 * backendStatus member.  beid counts from 1; NULL is returned when it is
 * outside 1 .. localNumBackends.
2556  *
2557  * NB: caller is responsible for a check if the user is permitted to see
2558  * this info (especially the querystring).
2559  * ----------
2560  */
2563 {
2565 
2566  if (beid < 1 || beid > localNumBackends)
2567  return NULL;
2568 
2569  return &localBackendStatusTable[beid - 1];
2570 }
2571 
2572 
2573 /* ----------
2574  * pgstat_fetch_stat_numbackends() -
2575  *
2576  * Support function for the SQL-callable pgstat* functions. Returns
2577  * the number of entries in the local backend status table
 * (localNumBackends), which is also the largest id accepted by
 * pgstat_fetch_stat_beentry().
2578  * ----------
2579  */
2580 int
2582 {
2584 
2585  return localNumBackends;
2586 }
2587 
2588 /*
2589  * ---------
2590  * pgstat_fetch_stat_archiver() -
2591  *
2592  * Support function for the SQL-callable pgstat* functions. Returns
2593  * a pointer to the archiver statistics struct.
 *
 * The pointer refers to the file-level archiverStats variable, so the
 * caller must not free it.
2594  * ---------
2595  */
2598 {
2600 
2601  return &archiverStats;
2602 }
2603 
2604 
2605 /*
2606  * ---------
2607  * pgstat_fetch_global() -
2608  *
2609  * Support function for the SQL-callable pgstat* functions. Returns
2610  * a pointer to the global statistics struct.
 *
 * The pointer refers to the file-level globalStats variable, so the
 * caller must not free it.
2611  * ---------
2612  */
2615 {
2617 
2618  return &globalStats;
2619 }
2620 
2621 
2622 /* ------------------------------------------------------------
2623  * Functions for management of the shared-memory PgBackendStatus array
2624  * ------------------------------------------------------------
2625  */
2626 
 /*
  * Base addresses of the out-of-line areas that the status array entries
  * point into.  They are assigned in CreateSharedBackendStatus.
  */
 /* application names, NAMEDATALEN bytes per slot */
2629 static char *BackendAppnameBuffer = NULL;
 /* client host names, NAMEDATALEN bytes per slot */
2630 static char *BackendClientHostnameBuffer = NULL;
 /* raw activity (query) strings */
2631 static char *BackendActivityBuffer = NULL;
2633 #ifdef USE_SSL
 /* one PgBackendSSLStatus per slot */
2634 static PgBackendSSLStatus *BackendSslStatusBuffer = NULL;
2635 #endif
2636 #ifdef ENABLE_GSS
 /* one PgBackendGSSStatus per slot */
2637 static PgBackendGSSStatus *BackendGssStatusBuffer = NULL;
2638 #endif
2639 
2640 
2641 /*
2642  * Report shared-memory space needed by CreateSharedBackendStatus.
2643  */
2644 Size
2646 {
2647  Size size;
2648 
2649  /* BackendStatusArray: */
2650  size = mul_size(sizeof(PgBackendStatus), NumBackendStatSlots);
2651  /* BackendAppnameBuffer: */
2652  size = add_size(size,
2654  /* BackendClientHostnameBuffer: */
2655  size = add_size(size,
2657  /* BackendActivityBuffer: */
2658  size = add_size(size,
2660 #ifdef USE_SSL
2661  /* BackendSslStatusBuffer: */
2662  size = add_size(size,
2664 #endif
2665  return size;
2666 }
2667 
2668 /*
2669  * Initialize the shared status array and several string buffers
2670  * during postmaster startup.
 *
 * Each region is obtained with ShmemInitStruct.  Only when the region was
 * not found already (i.e. we are the first to set it up) is it zeroed and
 * are the per-slot pointers in BackendStatusArray set to point into it.
2671  */
2672 void
2674 {
2675  Size size;
2676  bool found;
2677  int i;
2678  char *buffer;
2679 
2680  /* Create or attach to the shared array */
2681  size = mul_size(sizeof(PgBackendStatus), NumBackendStatSlots);
2682  BackendStatusArray = (PgBackendStatus *)
2683  ShmemInitStruct("Backend Status Array", size, &found);
2684 
2685  if (!found)
2686  {
2687  /*
2688  * We're the first - initialize.
2689  */
2690  MemSet(BackendStatusArray, 0, size);
2691  }
2692 
2693  /* Create or attach to the shared appname buffer */
2695  BackendAppnameBuffer = (char *)
2696  ShmemInitStruct("Backend Application Name Buffer", size, &found);
2697 
2698  if (!found)
2699  {
2700  MemSet(BackendAppnameBuffer, 0, size);
2701 
2702  /* Initialize st_appname pointers. */
2703  buffer = BackendAppnameBuffer;
2704  for (i = 0; i < NumBackendStatSlots; i++)
2705  {
 /* each slot gets its own NAMEDATALEN-byte piece of the buffer */
2706  BackendStatusArray[i].st_appname = buffer;
2707  buffer += NAMEDATALEN;
2708  }
2709  }
2710 
2711  /* Create or attach to the shared client hostname buffer */
2713  BackendClientHostnameBuffer = (char *)
2714  ShmemInitStruct("Backend Client Host Name Buffer", size, &found);
2715 
2716  if (!found)
2717  {
2719 
2720  /* Initialize st_clienthostname pointers. */
2721  buffer = BackendClientHostnameBuffer;
2722  for (i = 0; i < NumBackendStatSlots; i++)
2723  {
 /* each slot gets its own NAMEDATALEN-byte piece of the buffer */
2724  BackendStatusArray[i].st_clienthostname = buffer;
2725  buffer += NAMEDATALEN;
2726  }
2727  }
2728 
2729  /* Create or attach to the shared activity buffer */
2732  BackendActivityBuffer = (char *)
2733  ShmemInitStruct("Backend Activity Buffer",
2735  &found);
2736 
2737  if (!found)
2738  {
2740 
2741  /* Initialize st_activity pointers. */
2742  buffer = BackendActivityBuffer;
2743  for (i = 0; i < NumBackendStatSlots; i++)
2744  {
2745  BackendStatusArray[i].st_activity_raw = buffer;
2747  }
2748  }
2749 
2750 #ifdef USE_SSL
2751  /* Create or attach to the shared SSL status buffer */
2753  BackendSslStatusBuffer = (PgBackendSSLStatus *)
2754  ShmemInitStruct("Backend SSL Status Buffer", size, &found);
2755 
2756  if (!found)
2757  {
2758  PgBackendSSLStatus *ptr;
2759 
2760  MemSet(BackendSslStatusBuffer, 0, size);
2761 
2762  /* Initialize st_sslstatus pointers. */
2763  ptr = BackendSslStatusBuffer;
2764  for (i = 0; i < NumBackendStatSlots; i++)
2765  {
 /* one struct per slot, in slot order */
2766  BackendStatusArray[i].st_sslstatus = ptr;
2767  ptr++;
2768  }
2769  }
2770 #endif
2771 
2772 #ifdef ENABLE_GSS
2773  /* Create or attach to the shared GSSAPI status buffer */
2775  BackendGssStatusBuffer = (PgBackendGSSStatus *)
2776  ShmemInitStruct("Backend GSS Status Buffer", size, &found);
2777 
2778  if (!found)
2779  {
2780  PgBackendGSSStatus *ptr;
2781 
2782  MemSet(BackendGssStatusBuffer, 0, size);
2783 
2784  /* Initialize st_gssstatus pointers. */
2785  ptr = BackendGssStatusBuffer;
2786  for (i = 0; i < NumBackendStatSlots; i++)
2787  {
 /* one struct per slot, in slot order */
2788  BackendStatusArray[i].st_gssstatus = ptr;
2789  ptr++;
2790  }
2791  }
2792 #endif
2793 }
2794 
2795 
2796 /* ----------
2797  * pgstat_initialize() -
2798  *
2799  * Initialize pgstats state, and set up our on-proc-exit hook.
2800  * Called from InitPostgres and AuxiliaryProcessMain. For auxiliary process,
2801  * MyBackendId is invalid. Otherwise, MyBackendId must be set,
2802  * but we must not have started any transaction yet (since the
2803  * exit hook must run after the last transaction exit).
2804  * NOTE: MyDatabaseId isn't set yet; so the shutdown hook has to be careful.
2805  * ----------
2806  */
2807 void
2809 {
2810  /* Initialize MyBEEntry */
2812  {
 /* backend ids count from 1, array elements from 0 */
2814  MyBEEntry = &BackendStatusArray[MyBackendId - 1];
2815  }
2816  else
2817  {
2818  /* Must be an auxiliary process */
2820 
2821  /*
2822  * Assign the MyBEEntry for an auxiliary process. Since it doesn't
2823  * have a BackendId, the slot is statically allocated based on the
2824  * auxiliary process type (MyAuxProcType). Backends use slots numbered
2825  * in the range from 1 to MaxBackends (inclusive), so we use slot number
2826  * MaxBackends + MyAuxProcType + 1 for an auxiliary process, which is
2827  * array element MaxBackends + MyAuxProcType (elements count from 0).
2828  */
2829  MyBEEntry = &BackendStatusArray[MaxBackends + MyAuxProcType];
2830  }
2831 
2832  /* Set up a process-exit hook to clean up */
2834 }
2835 
2836 /* ----------
2837  * pgstat_bestart() -
2838  *
2839  * Initialize this backend's entry in the PgBackendStatus array.
2840  * Called from InitPostgres.
2841  *
2842  * Apart from auxiliary processes, MyBackendId, MyDatabaseId,
2843  * session userid, and application_name must be set for a
2844  * backend (hence, this cannot be combined with pgstat_initialize).
2845  * Note also that we must be inside a transaction if this isn't an aux
2846  * process, as we may need to do encoding conversion on some strings.
 *
 * The entry is built in a local copy and written back to shared memory in
 * one step, bracketed by PGSTAT_BEGIN_WRITE_ACTIVITY and
 * PGSTAT_END_WRITE_ACTIVITY.
2847  * ----------
2848  */
2849 void
2851 {
2852  volatile PgBackendStatus *vbeentry = MyBEEntry;
2853  PgBackendStatus lbeentry;
2854 #ifdef USE_SSL
2855  PgBackendSSLStatus lsslstatus;
2856 #endif
2857 #ifdef ENABLE_GSS
2858  PgBackendGSSStatus lgssstatus;
2859 #endif
2860 
2861  /* pgstats state must be initialized from pgstat_initialize() */
2862  Assert(vbeentry != NULL);
2863 
2864  /*
2865  * To minimize the time spent modifying the PgBackendStatus entry, and
2866  * avoid risk of errors inside the critical section, we first copy the
2867  * shared-memory struct to a local variable, then modify the data in the
2868  * local variable, then copy the local variable back to shared memory.
2869  * Only the last step has to be inside the critical section.
2870  *
2871  * Most of the data we copy from shared memory is just going to be
2872  * overwritten, but the struct's not so large that it's worth the
2873  * maintenance hassle to copy only the needful fields.
2874  */
2875  memcpy(&lbeentry,
2876  unvolatize(PgBackendStatus *, vbeentry),
2877  sizeof(PgBackendStatus));
2878 
2879  /* These structs can just start from zeroes each time, though */
2880 #ifdef USE_SSL
2881  memset(&lsslstatus, 0, sizeof(lsslstatus));
2882 #endif
2883 #ifdef ENABLE_GSS
2884  memset(&lgssstatus, 0, sizeof(lgssstatus));
2885 #endif
2886 
2887  /*
2888  * Now fill in all the fields of lbeentry, except for strings that are
2889  * out-of-line data. Those have to be handled separately, below.
2890  */
2891  lbeentry.st_procpid = MyProcPid;
2892 
 /* Classify the process; the first matching test decides st_backendType */
2894  {
2896  {
2897  /* Autovacuum Launcher */
2899  }
2900  else if (IsAutoVacuumWorkerProcess())
2901  {
2902  /* Autovacuum Worker */
2903  lbeentry.st_backendType = B_AUTOVAC_WORKER;
2904  }
2905  else if (am_walsender)
2906  {
2907  /* Wal sender */
2908  lbeentry.st_backendType = B_WAL_SENDER;
2909  }
2910  else if (IsBackgroundWorker)
2911  {
2912  /* bgworker */
2913  lbeentry.st_backendType = B_BG_WORKER;
2914  }
2915  else
2916  {
2917  /* client-backend */
2918  lbeentry.st_backendType = B_BACKEND;
2919  }
2920  }
2921  else
2922  {
2923  /* Must be an auxiliary process */
2925  switch (MyAuxProcType)
2926  {
2927  case StartupProcess:
2928  lbeentry.st_backendType = B_STARTUP;
2929  break;
2930  case BgWriterProcess:
2931  lbeentry.st_backendType = B_BG_WRITER;
2932  break;
2933  case CheckpointerProcess:
2934  lbeentry.st_backendType = B_CHECKPOINTER;
2935  break;
2936  case WalWriterProcess:
2937  lbeentry.st_backendType = B_WAL_WRITER;
2938  break;
2939  case WalReceiverProcess:
2940  lbeentry.st_backendType = B_WAL_RECEIVER;
2941  break;
2942  default:
 /* any other auxiliary process type is reported at FATAL level */
2943  elog(FATAL, "unrecognized process type: %d",
2944  (int) MyAuxProcType);
2945  }
2946  }
2947 
 /* No activity, state change or transaction has been reported yet */
2949  lbeentry.st_activity_start_timestamp = 0;
2950  lbeentry.st_state_start_timestamp = 0;
2951  lbeentry.st_xact_start_timestamp = 0;
2952  lbeentry.st_databaseid = MyDatabaseId;
2953 
2954  /* We have userid for client-backends, wal-sender and bgworker processes */
2955  if (lbeentry.st_backendType == B_BACKEND
2956  || lbeentry.st_backendType == B_WAL_SENDER
2957  || lbeentry.st_backendType == B_BG_WORKER)
2958  lbeentry.st_userid = GetSessionUserId();
2959  else
2960  lbeentry.st_userid = InvalidOid;
2961 
2962  /*
2963  * We may not have a MyProcPort (eg, if this is the autovacuum process).
2964  * If so, use all-zeroes client address, which is dealt with specially in
2965  * pg_stat_get_backend_client_addr and pg_stat_get_backend_client_port.
2966  */
2967  if (MyProcPort)
2968  memcpy(&lbeentry.st_clientaddr, &MyProcPort->raddr,
2969  sizeof(lbeentry.st_clientaddr));
2970  else
2971  MemSet(&lbeentry.st_clientaddr, 0, sizeof(lbeentry.st_clientaddr));
2972 
2973 #ifdef USE_SSL
2974  if (MyProcPort && MyProcPort->ssl != NULL)
2975  {
2976  lbeentry.st_ssl = true;
2984  }
2985  else
2986  {
2987  lbeentry.st_ssl = false;
2988  }
2989 #else
2990  lbeentry.st_ssl = false;
2991 #endif
2992 
2993 #ifdef ENABLE_GSS
2994  if (MyProcPort && MyProcPort->gss != NULL)
2995  {
2996  lbeentry.st_gss = true;
2997  lgssstatus.gss_auth = be_gssapi_get_auth(MyProcPort);
2998  lgssstatus.gss_enc = be_gssapi_get_enc(MyProcPort);
2999 
3000  if (lgssstatus.gss_auth)
3002  }
3003  else
3004  {
3005  lbeentry.st_gss = false;
3006  }
3007 #else
3008  lbeentry.st_gss = false;
3009 #endif
3010 
3011  lbeentry.st_state = STATE_UNDEFINED;
3014 
3015  /*
3016  * we don't zero st_progress_param here to save cycles; nobody should
3017  * examine it until st_progress_command has been set to something other
3018  * than PROGRESS_COMMAND_INVALID
3019  */
3020 
3021  /*
3022  * We're ready to enter the critical section that fills the shared-memory
3023  * status entry. We follow the protocol of bumping st_changecount before
3024  * and after; and make sure it's even afterwards. We use a volatile
3025  * pointer here to ensure the compiler doesn't try to get cute.
3026  */
3027  PGSTAT_BEGIN_WRITE_ACTIVITY(vbeentry);
3028 
3029  /* make sure we'll memcpy the same st_changecount back */
3030  lbeentry.st_changecount = vbeentry->st_changecount;
3031 
3032  memcpy(unvolatize(PgBackendStatus *, vbeentry),
3033  &lbeentry,
3034  sizeof(PgBackendStatus));
3035 
3036  /*
3037  * We can write the out-of-line strings and structs using the pointers
3038  * that are in lbeentry; this saves some de-volatilizing messiness.
3039  */
3040  lbeentry.st_appname[0] = '\0';
3043  NAMEDATALEN);
3044  else
3045  lbeentry.st_clienthostname[0] = '\0';
3046  lbeentry.st_activity_raw[0] = '\0';
3047  /* Also make sure the last byte in each string area is always 0 */
3048  lbeentry.st_appname[NAMEDATALEN - 1] = '\0';
3049  lbeentry.st_clienthostname[NAMEDATALEN - 1] = '\0';
3051 
 /* Publish the locally assembled SSL/GSS details through the entry's pointers */
3052 #ifdef USE_SSL
3053  memcpy(lbeentry.st_sslstatus, &lsslstatus, sizeof(PgBackendSSLStatus));
3054 #endif
3055 #ifdef ENABLE_GSS
3056  memcpy(lbeentry.st_gssstatus, &lgssstatus, sizeof(PgBackendGSSStatus));
3057 #endif
3058 
3059  PGSTAT_END_WRITE_ACTIVITY(vbeentry);
3060 
3061  /* Update app name to current GUC setting */
3062  if (application_name)
3064 }
3065 
3066 /*
3067  * Shut down a single backend's statistics reporting at process exit.
3068  *
3069  * Flush any remaining statistics counts out to the collector.
3070  * Without this, operations triggered during backend exit (such as
3071  * temp table deletions) won't be counted.
3072  *
3073  * Lastly, clear out our entry in the PgBackendStatus array.
 * Clearing means setting st_procpid to 0; the other fields of the entry
 * are left as they are.
3074  */
3075 static void
3077 {
3078  volatile PgBackendStatus *beentry = MyBEEntry;
3079 
3080  /*
3081  * If we got as far as discovering our own database ID, we can report what
3082  * we did to the collector. Otherwise, we'd be sending an invalid
3083  * database ID, so forget it. (This means that accesses to pg_database
3084  * during failed backend starts might never get counted.)
3085  */
3086  if (OidIsValid(MyDatabaseId))
3087  pgstat_report_stat(true);
3088 
3089  /*
3090  * Clear my status entry, following the protocol of bumping st_changecount
3091  * before and after. We use a volatile pointer here to ensure the
3092  * compiler doesn't try to get cute.
3093  */
3094  PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
3095 
3096  beentry->st_procpid = 0; /* mark invalid */
3097 
3098  PGSTAT_END_WRITE_ACTIVITY(beentry);
3099 }
3100 
3101 
3102 /* ----------
3103  * pgstat_report_activity() -
3104  *
3105  * Called from tcop/postgres.c to report what the backend is actually doing
3106  * (but note cmd_str can be NULL for certain cases).
 *
 * When cmd_str is NULL only the state and its start timestamp are updated;
 * the stored activity string and its start timestamp are left alone.
 * A non-NULL cmd_str is cut to at most
 * pgstat_track_activity_query_size - 1 bytes before being stored.
 * Nothing is done if this process has no status entry.
3107  *
3108  * All updates of the status entry follow the protocol of bumping
3109  * st_changecount before and after. We use a volatile pointer here to
3110  * ensure the compiler doesn't try to get cute.
3111  * ----------
3112  */
3113 void
3115 {
3116  volatile PgBackendStatus *beentry = MyBEEntry;
3117  TimestampTz start_timestamp;
3118  TimestampTz current_timestamp;
3119  int len = 0;
3120 
3121  TRACE_POSTGRESQL_STATEMENT_STATUS(cmd_str);
3122 
3123  if (!beentry)
3124  return;
3125 
3127  {
3128  if (beentry->st_state != STATE_DISABLED)
3129  {
3130  volatile PGPROC *proc = MyProc;
3131 
3132  /*
3133  * track_activities is disabled, but we last reported a
3134  * non-disabled state. As our final update, change the state and
3135  * clear fields we will not be updating anymore.
3136  */
3137  PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
3138  beentry->st_state = STATE_DISABLED;
3139  beentry->st_state_start_timestamp = 0;
3140  beentry->st_activity_raw[0] = '\0';
3141  beentry->st_activity_start_timestamp = 0;
3142  /* st_xact_start_timestamp and wait_event_info are also disabled */
3143  beentry->st_xact_start_timestamp = 0;
3144  proc->wait_event_info = 0;
3145  PGSTAT_END_WRITE_ACTIVITY(beentry);
3146  }
3147  return;
3148  }
3149 
3150  /*
3151  * To minimize the time spent modifying the entry, and avoid risk of
3152  * errors inside the critical section, fetch all the needed data first.
3153  */
3154  start_timestamp = GetCurrentStatementStartTimestamp();
3155  if (cmd_str != NULL)
3156  {
3157  /*
3158  * Compute length of to-be-stored string unaware of multi-byte
3159  * characters. For speed reasons that'll get corrected on read, rather
3160  * than computed every write.
3161  */
3162  len = Min(strlen(cmd_str), pgstat_track_activity_query_size - 1);
3163  }
3164  current_timestamp = GetCurrentTimestamp();
3165 
3166  /*
3167  * Now update the status entry
3168  */
3169  PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
3170 
3171  beentry->st_state = state;
3172  beentry->st_state_start_timestamp = current_timestamp;
3173 
3174  if (cmd_str != NULL)
3175  {
 /* store the (possibly shortened) string and terminate it ourselves */
3176  memcpy((char *) beentry->st_activity_raw, cmd_str, len);
3177  beentry->st_activity_raw[len] = '\0';
3178  beentry->st_activity_start_timestamp = start_timestamp;
3179  }
3180 
3181  PGSTAT_END_WRITE_ACTIVITY(beentry);
3182 }
3183 
3184 /*-----------
3185  * pgstat_progress_start_command() -
3186  *
3187  * Set st_progress_command (and st_progress_command_target) in own backend
3188  * entry. Also, zero-initialize st_progress_param array.
 *
 * Does nothing when this process has no status entry or when
 * pgstat_track_activities is off.
3189  *-----------
3190  */
3191 void
3193 {
3194  volatile PgBackendStatus *beentry = MyBEEntry;
3195 
3196  if (!beentry || !pgstat_track_activities)
3197  return;
3198 
 /* all three fields change inside one changecount-protected write */
3199  PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
3200  beentry->st_progress_command = cmdtype;
3201  beentry->st_progress_command_target = relid;
3202  MemSet(&beentry->st_progress_param, 0, sizeof(beentry->st_progress_param));
3203  PGSTAT_END_WRITE_ACTIVITY(beentry);
3204 }
3205 
3206 /*-----------
3207  * pgstat_progress_update_param() -
3208  *
3209  * Update index'th member in st_progress_param[] of own backend entry.
 *
 * index must be in the range 0 .. PGSTAT_NUM_PROGRESS_PARAM - 1; this is
 * only checked by an assertion.  Does nothing when this process has no
 * status entry or when pgstat_track_activities is off.
3210  *-----------
3211  */
3212 void
3214 {
3215  volatile PgBackendStatus *beentry = MyBEEntry;
3216 
3217  Assert(index >= 0 && index < PGSTAT_NUM_PROGRESS_PARAM);
3218 
3219  if (!beentry || !pgstat_track_activities)
3220  return;
3221 
3222  PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
3223  beentry->st_progress_param[index] = val;
3224  PGSTAT_END_WRITE_ACTIVITY(beentry);
3225 }
3226 
3227 /*-----------
3228  * pgstat_progress_update_multi_param() -
3229  *
3230  * Update multiple members in st_progress_param[] of own backend entry.
3231  * This is atomic; readers won't see intermediate states.
 *
 * index[i] names the member that receives val[i], for i in 0 .. nparam - 1.
 * Each index[i] must be in the range 0 .. PGSTAT_NUM_PROGRESS_PARAM - 1;
 * this is only checked by an assertion.  Does nothing when nparam is 0,
 * when this process has no status entry, or when pgstat_track_activities
 * is off.
3232  *-----------
3233  */
3234 void
3236  const int64 *val)
3237 {
3238  volatile PgBackendStatus *beentry = MyBEEntry;
3239  int i;
3240 
3241  if (!beentry || !pgstat_track_activities || nparam == 0)
3242  return;
3243 
 /* all assignments happen inside a single changecount-protected write */
3244  PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
3245 
3246  for (i = 0; i < nparam; ++i)
3247  {
3248  Assert(index[i] >= 0 && index[i] < PGSTAT_NUM_PROGRESS_PARAM);
3249 
3250  beentry->st_progress_param[index[i]] = val[i];
3251  }
3252 
3253  PGSTAT_END_WRITE_ACTIVITY(beentry);
3254 }
3255 
3256 /*-----------
3257  * pgstat_progress_end_command() -
3258  *
3259  * Reset st_progress_command (and st_progress_command_target) in own backend
3260  * entry. This signals the end of the command.
 *
 * Does nothing when this process has no status entry or when
 * pgstat_track_activities is off.
3261  *-----------
3262  */
3263 void
3265 {
3266  volatile PgBackendStatus *beentry = MyBEEntry;
3267 
3268  if (!beentry || !pgstat_track_activities)
3269  return;
3270 
3272  return;
3273 
3274  PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
3277  PGSTAT_END_WRITE_ACTIVITY(beentry);
3278 }
3279 
3280 /* ----------
3281  * pgstat_report_appname() -
3282  *
3283  * Called to update our application name.
3284  * ----------
3285  */
3286 void
3287 pgstat_report_appname(const char *appname)
3288 {
3289  volatile PgBackendStatus *beentry = MyBEEntry;
3290  int len;
3291 
3292  if (!beentry)
3293  return;
3294 
3295  /* This should be unnecessary if GUC did its job, but be safe */
3296  len = pg_mbcliplen(appname, strlen(appname), NAMEDATALEN - 1);
3297 
3298  /*
3299  * Update my status entry, following the protocol of bumping
3300  * st_changecount before and after. We use a volatile pointer here to
3301  * ensure the compiler doesn't try to get cute.
3302  */
3303  PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
3304 
3305  memcpy((char *) beentry->st_appname, appname, len);
3306  beentry->st_appname[len] = '\0';
3307 
3308  PGSTAT_END_WRITE_ACTIVITY(beentry);
3309 }
3310 
3311 /*
3312  * Report current transaction start timestamp as the specified value.
3313  * Zero means there is no active transaction.
 *
 * Does nothing when pgstat_track_activities is off or when this process
 * has no status entry.
3314  */
3315 void
3317 {
3318  volatile PgBackendStatus *beentry = MyBEEntry;
3319 
3320  if (!pgstat_track_activities || !beentry)
3321  return;
3322 
3323  /*
3324  * Update my status entry, following the protocol of bumping
3325  * st_changecount before and after. We use a volatile pointer here to
3326  * ensure the compiler doesn't try to get cute.
3327  */
3328  PGSTAT_BEGIN_WRITE_ACTIVITY(beentry);
3329 
3330  beentry->st_xact_start_timestamp = tstamp;
3331 
3332  PGSTAT_END_WRITE_ACTIVITY(beentry);
3333 }
3334 
3335 /* ----------
3336  * pgstat_read_current_status() -
3337  *
3338  * Copy the current contents of the PgBackendStatus array to local memory,
3339  * if not already done in this transaction.
 *
 * Only slots whose st_procpid is greater than zero are kept, and they are
 * packed together at the front of the local table.  localNumBackends is
 * set to the number of slots kept.  localBackendStatusTable is assigned
 * only after the whole table has been built.
3340  * ----------
3341  */
3342 static void
3344 {
3345  volatile PgBackendStatus *beentry;
3346  LocalPgBackendStatus *localtable;
3347  LocalPgBackendStatus *localentry;
3348  char *localappname,
3349  *localclienthostname,
3350  *localactivity;
3351 #ifdef USE_SSL
3352  PgBackendSSLStatus *localsslstatus;
3353 #endif
3354 #ifdef ENABLE_GSS
3355  PgBackendGSSStatus *localgssstatus;
3356 #endif
3357  int i;
3358 
3360  if (localBackendStatusTable)
3361  return; /* already done */
3362 
3364 
3365  /*
3366  * Allocate storage for local copy of state data. We can presume that
3367  * none of these requests overflow size_t, because we already calculated
3368  * the same values using mul_size during shmem setup. However, with
3369  * probably-silly values of pgstat_track_activity_query_size and
3370  * max_connections, the localactivity buffer could exceed 1GB, so use
3371  * "huge" allocation for that one.
3372  */
3373  localtable = (LocalPgBackendStatus *)
3374  MemoryContextAlloc(pgStatLocalContext,
3376  localappname = (char *)
3377  MemoryContextAlloc(pgStatLocalContext,
3379  localclienthostname = (char *)
3380  MemoryContextAlloc(pgStatLocalContext,
3381  NAMEDATALEN * NumBackendStatSlots);
3382  localactivity = (char *)
3383  MemoryContextAllocHuge(pgStatLocalContext,
3384  pgstat_track_activity_query_size * NumBackendStatSlots);
3385 #ifdef USE_SSL
3386  localsslstatus = (PgBackendSSLStatus *)
3387  MemoryContextAlloc(pgStatLocalContext,
3389 #endif
3390 #ifdef ENABLE_GSS
3391  localgssstatus = (PgBackendGSSStatus *)
3392  MemoryContextAlloc(pgStatLocalContext,
3394 #endif
3395 
3396  localNumBackends = 0;
3397 
 /* Walk every shared slot; the local cursors move only for slots we keep */
3398  beentry = BackendStatusArray;
3399  localentry = localtable;
3400  for (i = 1; i <= NumBackendStatSlots; i++)
3401  {
3402  /*
3403  * Follow the protocol of retrying if st_changecount changes while we
3404  * copy the entry, or if it's odd. (The check for odd is needed to
3405  * cover the case where we are able to completely copy the entry while
3406  * the source backend is between increment steps.) We use a volatile
3407  * pointer here to ensure the compiler doesn't try to get cute.
3408  */
3409  for (;;)
3410  {
3411  int before_changecount;
3412  int after_changecount;
3413 
3414  pgstat_begin_read_activity(beentry, before_changecount);
3415 
3416  localentry->backendStatus.st_procpid = beentry->st_procpid;
3417  /* Skip all the data-copying work if entry is not in use */
3418  if (localentry->backendStatus.st_procpid > 0)
3419  {
3420  memcpy(&localentry->backendStatus, unvolatize(PgBackendStatus *, beentry), sizeof(PgBackendStatus));
3421 
3422  /*
3423  * For each PgBackendStatus field that is a pointer, copy the
3424  * pointed-to data, then adjust the local copy of the pointer
3425  * field to point at the local copy of the data.
3426  *
3427  * strcpy is safe even if the string is modified concurrently,
3428  * because there's always a \0 at the end of the buffer.
3429  */
3430  strcpy(localappname, (char *) beentry->st_appname);
3431  localentry->backendStatus.st_appname = localappname;
3432  strcpy(localclienthostname, (char *) beentry->st_clienthostname);
3433  localentry->backendStatus.st_clienthostname = localclienthostname;
3434  strcpy(localactivity, (char *) beentry->st_activity_raw);
3435  localentry->backendStatus.st_activity_raw = localactivity;
3436 #ifdef USE_SSL
 /* the SSL details are copied only for entries flagged as using SSL */
3437  if (beentry->st_ssl)
3438  {
3439  memcpy(localsslstatus, beentry->st_sslstatus, sizeof(PgBackendSSLStatus));
3440  localentry->backendStatus.st_sslstatus = localsslstatus;
3441  }
3442 #endif
3443 #ifdef ENABLE_GSS
 /* likewise, GSS details are copied only for entries flagged as using GSS */
3444  if (beentry->st_gss)
3445  {
3446  memcpy(localgssstatus, beentry->st_gssstatus, sizeof(PgBackendGSSStatus));
3447  localentry->backendStatus.st_gssstatus = localgssstatus;
3448  }
3449 #endif
3450  }
3451 
3452  pgstat_end_read_activity(beentry, after_changecount);
3453 
3454  if (pgstat_read_activity_complete(before_changecount,
3455  after_changecount))
3456  break;
3457 
3458  /* Make sure we can break out of loop if stuck... */
3460  }
3461 
3462  beentry++;
3463  /* Only valid entries get included into the local array */
3464  if (localentry->backendStatus.st_procpid > 0)
3465  {
3467  &localentry->backend_xid,
3468  &localentry->backend_xmin);
3469 
 /* keep this entry: advance every local cursor by one slot */
3470  localentry++;
3471  localappname += NAMEDATALEN;
3472  localclienthostname += NAMEDATALEN;
3473  localactivity += pgstat_track_activity_query_size;
3474 #ifdef USE_SSL
3475  localsslstatus++;
3476 #endif
3477 #ifdef ENABLE_GSS
3478  localgssstatus++;
3479 #endif
3480  localNumBackends++;
3481  }
3482  }
3483 
3484  /* Set the pointer only after completion of a valid table */
3485  localBackendStatusTable = localtable;
3486 }
3487 
3488 /* ----------
3489  * pgstat_get_wait_event_type() -
3490  *
3491  * Return a string naming the type (class) of wait event that the
3492  * backend is currently waiting on.
 *
 * The class is extracted from wait_event_info with the mask 0xFF000000.
 * Returns NULL when wait_event_info is 0 (not waiting) and "???" when the
 * class matches none of the PG_WAIT_* cases handled below.
 *
 * NOTE(review): the embedded listing numbers jump from 3494 to 3496, so the
 * line carrying the function name and parameter list appears to be missing
 * from this copy.
 * ----------
3493  */
3494 const char *
3496 {
3497  uint32 classId;
3498  const char *event_type;
3499 
3500  /* Zero means the process is not waiting: report that as NULL. */
3501  if (wait_event_info == 0)
3502  return NULL;
3503 
 /* Keep only the class bits; the remaining bits are not needed here. */
3504  classId = wait_event_info & 0xFF000000;
3505 
3506  switch (classId)
3507  {
3508  case PG_WAIT_LWLOCK:
3509  event_type = "LWLock";
3510  break;
3511  case PG_WAIT_LOCK:
3512  event_type = "Lock";
3513  break;
3514  case PG_WAIT_BUFFER_PIN:
3515  event_type = "BufferPin";
3516  break;
3517  case PG_WAIT_ACTIVITY:
3518  event_type = "Activity";
3519  break;
3520  case PG_WAIT_CLIENT:
3521  event_type = "Client";
3522  break;
3523  case PG_WAIT_EXTENSION:
3524  event_type = "Extension";
3525  break;
3526  case PG_WAIT_IPC:
3527  event_type = "IPC";
3528  break;
3529  case PG_WAIT_TIMEOUT:
3530  event_type = "Timeout";
3531  break;
3532  case PG_WAIT_IO:
3533  event_type = "IO";
3534  break;
3535  default:
 /* Unrecognized class: return a placeholder rather than NULL. */
3536  event_type = "???";
3537  break;
3538  }
3539 
3540  return event_type;
3541 }
3542 
3543 /* ----------
3544  * pgstat_get_wait_event() -
3545  *
3546  * Return a string naming the specific wait event that the backend is
3547  * currently waiting on.
 *
 * wait_event_info is split into a class (mask 0xFF000000) and an event id
 * (mask 0x0000FFFF). Returns NULL when wait_event_info is 0 (not waiting)
 * and "unknown wait event" when the class matches none of the cases below.
 *
 * NOTE(review): the embedded listing numbers jump from 3549 to 3551, so the
 * line carrying the function name and parameter list appears to be missing
 * from this copy.
 * ----------
3548  */
3549 const char *
3551 {
3552  uint32 classId;
3553  uint16 eventId;
3554  const char *event_name;
3555 
3556  /* Zero means the process is not waiting: report that as NULL. */
3557  if (wait_event_info == 0)
3558  return NULL;
3559 
3560  classId = wait_event_info & 0xFF000000;
3561  eventId = wait_event_info & 0x0000FFFF;
3562 
3563  switch (classId)
3564  {
3565  case PG_WAIT_LWLOCK:
3566  event_name = GetLWLockIdentifier(classId, eventId);
3567  break;
3568  case PG_WAIT_LOCK:
3569  event_name = GetLockNameFromTagType(eventId);
3570  break;
3571  case PG_WAIT_BUFFER_PIN:
 /* This class has a single fixed name; eventId is not consulted. */
3572  event_name = "BufferPin";
3573  break;
3574  case PG_WAIT_ACTIVITY:
3575  {
 /*
  * Here and in the similar cases below, the complete
  * wait_event_info value (class and event bits together) is
  * cast to the per-class enum, not just eventId.
  */
3576  WaitEventActivity w = (WaitEventActivity) wait_event_info;
3577 
3578  event_name = pgstat_get_wait_activity(w);
3579  break;
3580  }
3581  case PG_WAIT_CLIENT:
3582  {
3583  WaitEventClient w = (WaitEventClient) wait_event_info;
3584 
3585  event_name = pgstat_get_wait_client(w);
3586  break;
3587  }
3588  case PG_WAIT_EXTENSION:
 /* This class has a single fixed name; eventId is not consulted. */
3589  event_name = "Extension";
3590  break;
3591  case PG_WAIT_IPC:
3592  {
3593  WaitEventIPC w = (WaitEventIPC) wait_event_info;
3594 
3595  event_name = pgstat_get_wait_ipc(w);
3596  break;
3597  }
3598  case PG_WAIT_TIMEOUT:
3599  {
3600  WaitEventTimeout w = (WaitEventTimeout) wait_event_info;
3601 
3602  event_name = pgstat_get_wait_timeout(w);
3603  break;
3604  }
3605  case PG_WAIT_IO:
3606  {
3607  WaitEventIO w = (WaitEventIO) wait_event_info;
3608 
3609  event_name = pgstat_get_wait_io(w);
3610  break;
3611  }
3612  default:
3613  event_name = "unknown wait event";
3614  break;
3615  }
3616 
3617  return event_name;
3618 }
3619 
3620 /* ----------
3621  * pgstat_get_wait_activity() -
3622  *
3623  * Convert WaitEventActivity to string.
 *
 * Returns the name assigned by the matching case, or "unknown wait event"
 * (the initial value of event_name) when no case matches.
 *
 * NOTE(review): in this copy the embedded listing numbers skip 3627 (the
 * function name/parameter line) and one line before every assignment in
 * the switch, so the "case" labels are not visible here.
3624  * ----------
3625  */
3626 static const char *
3628 {
 /* Fallback result, kept if the switch assigns nothing. */
3629  const char *event_name = "unknown wait event";
3630 
3631  switch (w)
3632  {
3634  event_name = "ArchiverMain";
3635  break;
3637  event_name = "AutoVacuumMain";
3638  break;
3640  event_name = "BgWriterHibernate";
3641  break;
3643  event_name = "BgWriterMain";
3644  break;
3646  event_name = "CheckpointerMain";
3647  break;
3649  event_name = "LogicalApplyMain";
3650  break;
3652  event_name = "LogicalLauncherMain";
3653  break;
3655  event_name = "PgStatMain";
3656  break;
3658  event_name = "RecoveryWalAll";
3659  break;
3661  event_name = "RecoveryWalStream";
3662  break;
3664  event_name = "SysLoggerMain";
3665  break;
3667  event_name = "WalReceiverMain";
3668  break;
3670  event_name = "WalSenderMain";
3671  break;
3673  event_name = "WalWriterMain";
3674  break;
3675  /* no default case, so that compiler will warn */
3676  }
3677 
3678  return event_name;
3679 }
3680 
3681 /* ----------
3682  * pgstat_get_wait_client() -
3683  *
3684  * Convert WaitEventClient to string.
 *
 * Returns the name assigned by the matching case, or "unknown wait event"
 * (the initial value of event_name) when no case matches.
 *
 * NOTE(review): in this copy the embedded listing numbers skip 3688 (the
 * function name/parameter line) and one line before every assignment in
 * the switch, so the "case" labels are not visible here.
3685  * ----------
3686  */
3687 static const char *
3689 {
 /* Fallback result, kept if the switch assigns nothing. */
3690  const char *event_name = "unknown wait event";
3691 
3692  switch (w)
3693  {
3695  event_name = "ClientRead";
3696  break;
3698  event_name = "ClientWrite";
3699  break;
3701  event_name = "LibPQWalReceiverConnect";
3702  break;
3704  event_name = "LibPQWalReceiverReceive";
3705  break;
3707  event_name = "SSLOpenServer";
3708  break;
3710  event_name = "WalReceiverWaitStart";
3711  break;
3713  event_name = "WalSenderWaitForWAL";
3714  break;
3716  event_name = "WalSenderWriteData";
3717  break;
3719  event_name = "GSSOpenServer";
3720  break;
3721  /* no default case, so that compiler will warn */
3722  }
3723 
3724  return event_name;
3725 }
3726 
3727 /* ----------
3728  * pgstat_get_wait_ipc() -
3729  *
3730  * Convert WaitEventIPC to string.
 *
 * Returns the name assigned by the matching case, or "unknown wait event"
 * (the initial value of event_name) when no case matches.
 *
 * NOTE(review): in this copy the embedded listing numbers skip 3734 (the
 * function name/parameter line) and the line before most assignments in
 * the switch, so only a few "case" labels are visible here.
3731  * ----------
3732  */
3733 static const char *
3735 {
 /* Fallback result, kept if the switch assigns nothing. */
3736  const char *event_name = "unknown wait event";
3737 
3738  switch (w)
3739  {
3741  event_name = "BgWorkerShutdown";
3742  break;
3744  event_name = "BgWorkerStartup";
3745  break;
3746  case WAIT_EVENT_BTREE_PAGE:
3747  event_name = "BtreePage";
3748  break;
3750  event_name = "CheckpointDone";
3751  break;
3753  event_name = "CheckpointStart";
3754  break;
3756  event_name = "ClogGroupUpdate";
3757  break;
3759  event_name = "ExecuteGather";
3760  break;
3762  event_name = "Hash/Batch/Allocating";
3763  break;
3765  event_name = "Hash/Batch/Electing";
3766  break;
3768  event_name = "Hash/Batch/Loading";
3769  break;
3771  event_name = "Hash/Build/Allocating";
3772  break;
3774  event_name = "Hash/Build/Electing";
3775  break;
3777  event_name = "Hash/Build/HashingInner";
3778  break;
3780  event_name = "Hash/Build/HashingOuter";
3781  break;
3783  event_name = "Hash/GrowBatches/Allocating";
3784  break;
3786  event_name = "Hash/GrowBatches/Deciding";
3787  break;
3789  event_name = "Hash/GrowBatches/Electing";
3790  break;
3792  event_name = "Hash/GrowBatches/Finishing";
3793  break;
3795  event_name = "Hash/GrowBatches/Repartitioning";
3796  break;
3798  event_name = "Hash/GrowBuckets/Allocating";
3799  break;
3801  event_name = "Hash/GrowBuckets/Electing";
3802  break;
3804  event_name = "Hash/GrowBuckets/Reinserting";
3805  break;
3807  event_name = "LogicalSyncData";
3808  break;
3810  event_name = "LogicalSyncStateChange";
3811  break;
3813  event_name = "MessageQueueInternal";
3814  break;
3816  event_name = "MessageQueuePutMessage";
3817  break;
3818  case WAIT_EVENT_MQ_RECEIVE:
3819  event_name = "MessageQueueReceive";
3820  break;
3821  case WAIT_EVENT_MQ_SEND:
3822  event_name = "MessageQueueSend";
3823  break;
3825  event_name = "ParallelBitmapScan";
3826  break;
3828  event_name = "ParallelCreateIndexScan";
3829  break;
3831  event_name = "ParallelFinish";
3832  break;
3834  event_name = "ProcArrayGroupUpdate";
3835  break;
3836  case WAIT_EVENT_PROMOTE:
3837  event_name = "Promote";
3838  break;
3840  event_name = "ReplicationOriginDrop";
3841  break;
3843  event_name = "ReplicationSlotDrop";
3844  break;
3846  event_name = "SafeSnapshot";
3847  break;
3848  case WAIT_EVENT_SYNC_REP:
3849  event_name = "SyncRep";
3850  break;
3851  /* no default case, so that compiler will warn */
3852  }
3853 
3854  return event_name;
3855 }
3856 
3857 /* ----------
3858  * pgstat_get_wait_timeout() -
3859  *
3860  * Convert WaitEventTimeout to string.
 *
 * Returns the name assigned by the matching case, or "unknown wait event"
 * (the initial value of event_name) when no case matches.
 *
 * NOTE(review): in this copy the embedded listing numbers skip 3864 (the
 * function name/parameter line) as well as 3870 and 3876, so two of the
 * three "case" labels are not visible here.
3861  * ----------
3862  */
3863 static const char *
3865 {
 /* Fallback result, kept if the switch assigns nothing. */
3866  const char *event_name = "unknown wait event";
3867 
3868  switch (w)
3869  {
3871  event_name = "BaseBackupThrottle";
3872  break;
3873  case WAIT_EVENT_PG_SLEEP:
3874  event_name = "PgSleep";
3875  break;
3877  event_name = "RecoveryApplyDelay";
3878  break;
3879  /* no default case, so that compiler will warn */
3880  }
3881 
3882  return event_name;
3883 }
3884 
3885 /* ----------
3886  * pgstat_get_wait_io() -
3887  *
3888  * Convert WaitEventIO to string.
 *
 * Returns the name assigned by the matching case, or "unknown wait event"
 * (the initial value of event_name) when no case matches.
 *
 * NOTE(review): in this copy the embedded listing numbers skip 3892 (the
 * function name/parameter line) and the line before most assignments in
 * the switch, so only a few "case" labels are visible here.
3889  * ----------
3890  */
3891 static const char *
3893 {
 /* Fallback result, kept if the switch assigns nothing. */
3894  const char *event_name = "unknown wait event";
3895 
3896  switch (w)
3897  {
3899  event_name = "BufFileRead";
3900  break;
3902  event_name = "BufFileWrite";
3903  break;
3905  event_name = "ControlFileRead";
3906  break;
3908  event_name = "ControlFileSync";
3909  break;
3911  event_name = "ControlFileSyncUpdate";
3912  break;
3914  event_name = "ControlFileWrite";
3915  break;
3917  event_name = "ControlFileWriteUpdate";
3918  break;
3920  event_name = "CopyFileRead";
3921  break;
3923  event_name = "CopyFileWrite";
3924  break;
3926  event_name = "DataFileExtend";
3927  break;
3929  event_name = "DataFileFlush";
3930  break;
3932  event_name = "DataFileImmediateSync";
3933  break;
3935  event_name = "DataFilePrefetch";
3936  break;
3938  event_name = "DataFileRead";
3939  break;
3941  event_name = "DataFileSync";
3942  break;
3944  event_name = "DataFileTruncate";
3945  break;
3947  event_name = "DataFileWrite";
3948  break;
3950  event_name = "DSMFillZeroWrite";
3951  break;
3953  event_name = "LockFileAddToDataDirRead";
3954  break;
3956  event_name = "LockFileAddToDataDirSync";
3957  break;
3959  event_name = "LockFileAddToDataDirWrite";
3960  break;
3962  event_name = "LockFileCreateRead";
3963  break;
3965  event_name = "LockFileCreateSync";
3966  break;
3968  event_name = "LockFileCreateWrite";
3969  break;
3971  event_name = "LockFileReCheckDataDirRead";
3972  break;
3974  event_name = "LogicalRewriteCheckpointSync";
3975  break;
3977  event_name = "LogicalRewriteMappingSync";
3978  break;
3980  event_name = "LogicalRewriteMappingWrite";
3981  break;
3983  event_name = "LogicalRewriteSync";
3984  break;
3986  event_name = "LogicalRewriteTruncate";
3987  break;
3989  event_name = "LogicalRewriteWrite";
3990  break;
3992  event_name = "ProcSignalBarrier";
3993  break;
3995  event_name = "RelationMapRead";
3996  break;
3998  event_name = "RelationMapSync";
3999  break;
4001  event_name = "RelationMapWrite";
4002  break;
4004  event_name = "ReorderBufferRead";
4005  break;
4007  event_name = "ReorderBufferWrite";
4008  break;
4010  event_name = "ReorderLogicalMappingRead";
4011  break;
4013  event_name = "ReplicationSlotRead";
4014  break;
4016  event_name = "ReplicationSlotRestoreSync";
4017  break;
4019  event_name = "ReplicationSlotSync";
4020  break;
4022  event_name = "ReplicationSlotWrite";
4023  break;
4025  event_name = "SLRUFlushSync";
4026  break;
4027  case WAIT_EVENT_SLRU_READ:
4028  event_name = "SLRURead";
4029  break;
4030  case WAIT_EVENT_SLRU_SYNC:
4031  event_name = "SLRUSync";
4032  break;
4033  case WAIT_EVENT_SLRU_WRITE:
4034  event_name = "SLRUWrite";
4035  break;
4037  event_name = "SnapbuildRead";
4038  break;
4040  event_name = "SnapbuildSync";
4041  break;
4043  event_name = "SnapbuildWrite";
4044  break;
4046  event_name = "TimelineHistoryFileSync";
4047  break;
4049  event_name = "TimelineHistoryFileWrite";
4050  break;
4052  event_name = "TimelineHistoryRead";
4053  break;
4055  event_name = "TimelineHistorySync";
4056  break;
4058  event_name = "TimelineHistoryWrite";
4059  break;
4061  event_name = "TwophaseFileRead";
4062  break;
4064  event_name = "TwophaseFileSync";
4065  break;
4067  event_name = "TwophaseFileWrite";
4068  break;
4070  event_name = "WALSenderTimelineHistoryRead";
4071  break;
4073  event_name = "WALBootstrapSync";
4074  break;
4076  event_name = "WALBootstrapWrite";
4077  break;
4079  event_name = "WALCopyRead";
4080  break;
4082  event_name = "WALCopySync";
4083  break;
4085  event_name = "WALCopyWrite";
4086  break;
4088  event_name = "WALInitSync";
4089  break;
4091  event_name = "WALInitWrite";
4092  break;
4093  case WAIT_EVENT_WAL_READ:
4094  event_name = "WALRead";
4095  break;
4096  case WAIT_EVENT_WAL_SYNC:
4097  event_name = "WALSync";
4098  break;
4100  event_name = "WALSyncMethodAssign";
4101  break;
4102  case WAIT_EVENT_WAL_WRITE:
4103  event_name = "WALWrite";
4104  break;
4105 
4106  /* no default case, so that compiler will warn */
4107  }
4108 
4109  return event_name;
4110 }
4111 
4112 
4113 /* ----------
4114  * pgstat_get_backend_current_activity() -
4115  *
4116  * Return a string representing the current activity of the backend with
4117  * the specified PID. This looks directly at the BackendStatusArray,
4118  * and so will provide current information regardless of the age of our
4119  * transaction's snapshot of the status array.
4120  *
4121  * It is the caller's responsibility to invoke this only for backends whose
4122  * state is expected to remain stable while the result is in use. The
4123  * only current use is in deadlock reporting, where we can expect that
4124  * the target backend is blocked on a lock. (There are corner cases
4125  * where the target's wait could get aborted while we are looking at it,
4126  * but the very worst consequence is to return a pointer to a string
4127  * that's been changed, so we won't worry too much.)
4128  *
 * pid: process ID to look for among the MaxBackends status entries.
 * checkUser: if true, a caller that is not a superuser and whose user ID
 * differs from the entry's st_userid gets "<insufficient privilege>"
 * instead of the activity text.
 *
 * Result is one of: "<insufficient privilege>", "<command string not
 * enabled>" (the entry's activity string is empty), the value returned by
 * pgstat_clip_activity() for the entry's activity string, or
 * "<backend information not available>" when no entry has the given PID.
 *
4129  * Note: return strings for special cases match pg_stat_get_backend_activity.
4130  * ----------
4131  */
4132 const char *
4133 pgstat_get_backend_current_activity(int pid, bool checkUser)
4134 {
4135  PgBackendStatus *beentry;
4136  int i;
4137 
4138  beentry = BackendStatusArray;
4139  for (i = 1; i <= MaxBackends; i++)
4140  {
4141  /*
4142  * Although we expect the target backend's entry to be stable, that
4143  * doesn't imply that anyone else's is. To avoid identifying the
4144  * wrong backend, while we check for a match to the desired PID we
4145  * must follow the protocol of retrying if st_changecount changes
4146  * while we examine the entry, or if it's odd. (This might be
4147  * unnecessary, since fetching or storing an int is almost certainly
4148  * atomic, but let's play it safe.) We use a volatile pointer here to
4149  * ensure the compiler doesn't try to get cute.
4150  */
4151  volatile PgBackendStatus *vbeentry = beentry;
4152  bool found;
4153 
 /* Retry the PID comparison until a consistent read is obtained. */
4154  for (;;)
4155  {
4156  int before_changecount;
4157  int after_changecount;
4158 
4159  pgstat_begin_read_activity(vbeentry, before_changecount);
4160 
4161  found = (vbeentry->st_procpid == pid);
4162 
4163  pgstat_end_read_activity(vbeentry, after_changecount);
4164 
4165  if (pgstat_read_activity_complete(before_changecount,
4166  after_changecount))
4167  break;
4168 
4169  /* Make sure we can break out of loop if stuck... */
 /*
  * NOTE(review): the embedded listing numbers skip 4170 here, so
  * the statement this comment introduces appears to be missing
  * from this copy.
  */
4171  }
4172 
4173  if (found)
4174  {
4175  /* Now it is safe to use the non-volatile pointer */
4176  if (checkUser && !superuser() && beentry->st_userid != GetUserId())
4177  return "<insufficient privilege>";
4178  else if (*(beentry->st_activity_raw) == '\0')
4179  return "<command string not enabled>";
4180  else
4181  {
4182  /* this'll leak a bit of memory, but that seems acceptable */
4183  return pgstat_clip_activity(beentry->st_activity_raw);
4184  }
4185  }
4186 
4187  beentry++;
4188  }
4189 
4190  /* If we get here, caller is in error ... */
4191  return "<backend information not available>";
4192 }
4193 
4194 /* ----------
4195  * pgstat_get_crashed_backend_activity() -
4196  *
4197  * Return a string representing the current activity of the backend with
4198  * the specified PID. Like the function above, but reads shared memory with
4199  * the expectation that it may be corrupt. On success, copy the string
4200  * into the "buffer" argument and return that pointer. On failure,
4201  * return NULL.
4202  *
4203  * This function is only intended to be used by the postmaster to report the
4204  * query that crashed a backend. In particular, no attempt is made to
4205  * follow the correct concurrency protocol when accessing the
4206  * BackendStatusArray. But that's OK, in the worst case we'll return a
4207  * corrupted message. We also must take care not to trip on ereport(ERROR).
 *
 * pid: process ID to look for among the MaxBackends status entries.
 * buffer, buflen: caller-supplied output buffer and its size; the copy is
 * limited to Min(buflen, pgstat_track_activity_query_size).
 *
 * NULL is returned when BackendStatusArray or BackendActivityBuffer is
 * NULL, when the entry's activity pointer fails the range check, when the
 * activity string is empty, or when no entry has the given PID.
4208  * ----------
4209  */
4210 const char *
4211 pgstat_get_crashed_backend_activity(int pid, char *buffer, int buflen)
4212 {
4213  volatile PgBackendStatus *beentry;
4214  int i;
4215 
4216  beentry = BackendStatusArray;
4217 
4218  /*
4219  * We probably shouldn't get here before shared memory has been set up,
4220  * but be safe.
4221  */
4222  if (beentry == NULL || BackendActivityBuffer == NULL)
4223  return NULL;
4224 
4225  for (i = 1; i <= MaxBackends; i++)
4226  {
4227  if (beentry->st_procpid == pid)
4228  {
4229  /* Read pointer just once, so it can't change after validation */
4230  const char *activity = beentry->st_activity_raw;
4231  const char *activity_last;
4232 
4233  /*
4234  * We mustn't access activity string before we verify that it
4235  * falls within the BackendActivityBuffer. To make sure that the
4236  * entire string including its ending is contained within the
4237  * buffer, subtract one activity length from the buffer size.
4238  */
 /*
  * NOTE(review): the embedded listing numbers skip 4239-4240 here,
  * so the assignment to activity_last appears to be missing from
  * this copy; as shown, activity_last is compared below without a
  * visible initialization.
  */
4241 
4242  if (activity < BackendActivityBuffer ||
4243  activity > activity_last)
4244  return NULL;
4245 
4246  /* If no string available, no point in a report */
4247  if (activity[0] == '\0')
4248  return NULL;
4249 
4250  /*
4251  * Copy only ASCII-safe characters so we don't run into encoding
4252  * problems when reporting the message; and be sure not to run off
4253  * the end of memory. As only ASCII characters are reported, it
4254  * doesn't seem necessary to perform multibyte aware clipping.
4255  */
4256  ascii_safe_strlcpy(buffer, activity,
4257  Min(buflen, pgstat_track_activity_query_size));
4258 
4259  return buffer;
4260  }
4261 
4262  beentry++;
4263  }
4264 
4265  /* PID not found */
4266  return NULL;
4267 }
4268 
/*
 * Map a backend type value (backendType) to a short human-readable
 * description string.
 *
 * Returns "unknown process type" (the initial value of backendDesc) when
 * backendType matches none of the cases below.
 *
 * NOTE(review): the embedded listing numbers jump from 4269 to 4271, so the
 * line carrying the function name and parameter list appears to be missing
 * from this copy.
 */
4269 const char *
4271 {
 /* Fallback result, kept if the switch assigns nothing. */
4272  const char *backendDesc = "unknown process type";
4273 
4274  switch (backendType)
4275  {
4276  case B_AUTOVAC_LAUNCHER:
4277  backendDesc = "autovacuum launcher";
4278  break;
4279  case B_AUTOVAC_WORKER:
4280  backendDesc = "autovacuum worker";
4281  break;
4282  case B_BACKEND:
4283  backendDesc = "client backend";
4284  break;
4285  case B_BG_WORKER:
4286  backendDesc = "background worker";
4287  break;
4288  case B_BG_WRITER:
4289  backendDesc = "background writer";
4290  break;
4291  case B_CHECKPOINTER:
4292  backendDesc = "checkpointer";
4293  break;
4294  case B_STARTUP:
4295  backendDesc = "startup";
4296  break;
4297  case B_WAL_RECEIVER:
4298  backendDesc = "walreceiver";
4299  break;
4300  case B_WAL_SENDER:
4301  backendDesc = "walsender";
4302  break;
4303  case B_WAL_WRITER:
4304  backendDesc = "walwriter";
4305  break;
4306  }
4307 
4308  return backendDesc;
4309 }
4310 
4311 /* ------------------------------------------------------------
4312  * Local support functions follow
4313  * ------------------------------------------------------------
4314  */
4315 
4316 
4317 /* ----------
4318  * pgstat_setheader() -
4319  *
4320  * Set common header fields in a statistics message
 *
 * As shown here, the only field written is hdr->m_type, which is set to
 * mtype.
 *
 * NOTE(review): the embedded listing numbers jump from 4323 to 4325, so the
 * line carrying the function name and parameter list appears to be missing
 * from this copy.
4321  * ----------
4322  */
4323 static void
4325 {
4326  hdr->m_type = mtype;
4327 }
4328 
4329 
4330 /* ----------
4331  * pgstat_send() -
4332  *
4333  * Send out one statistics message to the collector
 *
 * msg: message buffer; it is cast to PgStat_MsgHdr * so that the m_size
 * field can be filled in with len before sending.
 * len: number of bytes to send from msg.
 *
 * The message is passed to send() on pgStatSock. The call is retried while
 * it fails with EINTR; any other failure is ignored, except that builds
 * with USE_ASSERT_CHECKING log it.
4334  * ----------
4335  */
4336 static void
4337 pgstat_send(void *msg, int len)
4338 {
4339  int rc;
4340 
 /*
  * NOTE(review): the embedded listing numbers skip 4341 here, so the
  * condition guarding this early return appears to be missing from this
  * copy.
  */
4342  return;
4343 
 /* Record the message length in the common header. */
4344  ((PgStat_MsgHdr *) msg)->m_size = len;
4345 
4346  /* We'll retry after EINTR, but ignore all other failures */
4347  do
4348  {
4349  rc = send(pgStatSock, msg, len, 0);
4350  } while (rc < 0 && errno == EINTR);
4351 
4352 #ifdef USE_ASSERT_CHECKING
4353  /* In debug builds, log send failures ... */
4354  if (rc < 0)
4355  elog(LOG, "could not send to statistics collector: %m");
4356 #endif
4357 }
4358 
4359 /* ----------
4360  * pgstat_send_archiver() -
4361  *
4362  * Tell the collector about the WAL file that we successfully
4363  * archived or failed to archive.
 *
 * xlog: name of the WAL file; copied into msg.m_xlog with StrNCpy, bounded
 * by sizeof(msg.m_xlog).
 * failed: stored in msg.m_failed as-is.
 *
 * NOTE(review): the embedded listing numbers skip 4374 and 4377 below, so
 * two statements of the message setup appear to be missing from this copy.
4364  * ----------
4365  */
4366 void
4367 pgstat_send_archiver(const char *xlog, bool failed)
4368 {
4369  PgStat_MsgArchiver msg;
4370 
4371  /*
4372  * Prepare and send the message
4373  */
4375  msg.m_failed = failed;
4376  StrNCpy(msg.m_xlog, xlog, sizeof(msg.m_xlog));
4378  pgstat_send(&msg, sizeof(msg));
4379 }
4380 
4381 /* ----------
4382  * pgstat_send_bgwriter() -
4383  *
4384  * Send bgwriter statistics to the collector
 *
 * The BgWriterStats buffer is sent only if it differs bytewise from an
 * all-zeroes PgStat_MsgBgWriter; after sending, the buffer is cleared.
 *
 * NOTE(review): the embedded listing numbers jump from 4387 to 4389, so the
 * line carrying the function name and parameter list appears to be missing
 * from this copy.
4385  * ----------
4386  */
4387 void
4389 {
4390  /* We assume this initializes to zeroes */
4391  static const PgStat_MsgBgWriter all_zeroes;
4392 
4393  /*
4394  * This function can be called even if nothing at all has happened. In
4395  * this case, avoid sending a completely empty message to the stats
4396  * collector.
4397  */
4398  if (memcmp(&BgWriterStats, &all_zeroes, sizeof(PgStat_MsgBgWriter)) == 0)
4399  return;
4400 
4401  /*
4402  * Prepare and send the message
4403  */
4404  pgstat_setheader(&BgWriterStats.m_hdr, PGSTAT_MTYPE_BGWRITER);
4405  pgstat_send(&BgWriterStats, sizeof(BgWriterStats));
4406 
4407  /*
4408  * Clear out the statistics buffer, so it can be re-used.
4409  */
4410  MemSet(&BgWriterStats, 0, sizeof(BgWriterStats));
4411 }
4412 
4413 
4414 /* ----------
4415  * PgstatCollectorMain() -
4416  *
4417  * Start up the statistics collector process. This is the body of the
4418  * postmaster child process.
4419  *
4420  * The argc/argv parameters are valid only in EXEC_BACKEND case.
 *
 * Overall flow as visible here: set up signal handling, load the existing
 * stats files, then loop receiving messages from pgStatSock and
 * dispatching them by msg_hdr.m_type to the pgstat_recv_* routines. When
 * the loop ends, the stats files are written with both arguments true and
 * the process calls exit(0).
 *
 * NOTE(review): the embedded listing numbers skip many lines inside this
 * function (several signal setup calls, the loop's latch and shutdown
 * tests, a number of "case" labels and call heads, and the start of both
 * wait calls), so those statements are not visible in this copy.
4421  * ----------
4422  */
4423 NON_EXEC_STATIC void
4424 PgstatCollectorMain(int argc, char *argv[])
4425 {
4426  int len;
4427  PgStat_Msg msg;
4428  int wr;
4429 
4430  /*
4431  * Ignore all signals usually bound to some action in the postmaster,
4432  * except SIGHUP and SIGQUIT. Note we don't need a SIGUSR1 handler to
4433  * support latch operations, because we only use a local latch.
4434  */
4436  pqsignal(SIGINT, SIG_IGN);
4437  pqsignal(SIGTERM, SIG_IGN);
4443  /* Reset some signals that are accepted by postmaster but not here */
4446 
4447  /*
4448  * Identify myself via ps
4449  */
4450  init_ps_display("stats collector", "", "", "");
4451 
4452  /*
4453  * Read in existing stats files or initialize the stats to zero.
4454  */
4455  pgStatRunningInCollector = true;
4456  pgStatDBHash = pgstat_read_statsfiles(InvalidOid, true, true);
4457 
4458  /*
4459  * Loop to process messages until we get SIGQUIT or detect ungraceful
4460  * death of our parent postmaster.
4461  *
4462  * For performance reasons, we don't want to do ResetLatch/WaitLatch after
4463  * every message; instead, do that only after a recv() fails to obtain a
4464  * message. (This effectively means that if backends are sending us stuff
4465  * like mad, we won't notice postmaster death until things slack off a
4466  * bit; which seems fine.) To do that, we have an inner loop that
4467  * iterates as long as recv() succeeds. We do check ConfigReloadPending
4468  * inside the inner loop, which means that such interrupts will get
4469  * serviced but the latch won't get cleared until next time there is a
4470  * break in the action.
4471  */
4472  for (;;)
4473  {
4474  /* Clear any already-pending wakeups */
4476 
4477  /*
4478  * Quit if we get SIGQUIT from the postmaster.
4479  */
4481  break;
4482 
4483  /*
4484  * Inner loop iterates as long as we keep getting messages, or until
4485  * ShutdownRequestPending becomes set.
4486  */
4487  while (!ShutdownRequestPending)
4488  {
4489  /*
4490  * Reload configuration if we got SIGHUP from the postmaster.
4491  */
4492  if (ConfigReloadPending)
4493  {
4494  ConfigReloadPending = false;
4496  }
4497 
4498  /*
4499  * Write the stats file(s) if a new request has arrived that is
4500  * not satisfied by existing file(s).
4501  */
4503  pgstat_write_statsfiles(false, false);
4504 
4505  /*
4506  * Try to receive and process a message. This will not block,
4507  * since the socket is set to non-blocking mode.
4508  *
4509  * XXX On Windows, we have to force pgwin32_recv to cooperate,
4510  * despite the previous use of pg_set_noblock() on the socket.
4511  * This is extremely broken and should be fixed someday.
4512  */
4513 #ifdef WIN32
4514  pgwin32_noblock = 1;
4515 #endif
4516 
4517  len = recv(pgStatSock, (char *) &msg,
4518  sizeof(PgStat_Msg), 0);
4519 
4520 #ifdef WIN32
4521  pgwin32_noblock = 0;
4522 #endif
4523 
4524  if (len < 0)
4525  {
 /*
  * Nothing to read right now, or the call was interrupted: leave
  * the inner loop and go wait. Any other errno is reported.
  */
4526  if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)
4527  break; /* out of inner loop */
4528  ereport(ERROR,
4530  errmsg("could not read statistics message: %m")));
4531  }
4532 
4533  /*
4534  * We ignore messages that are smaller than our common header
4535  */
4536  if (len < sizeof(PgStat_MsgHdr))
4537  continue;
4538 
4539  /*
4540  * The received length must match the length in the header
4541  */
4542  if (msg.msg_hdr.m_size != len)
4543  continue;
4544 
4545  /*
4546  * O.K. - we accept this message. Process it.
 * Each handler receives the matching member of the message
 * union plus the received length; unknown types are dropped.
4547  */
4548  switch (msg.msg_hdr.m_type)
4549  {
4550  case PGSTAT_MTYPE_DUMMY:
4551  break;
4552 
4553  case PGSTAT_MTYPE_INQUIRY:
4554  pgstat_recv_inquiry(&msg.msg_inquiry, len);
4555  break;
4556 
4557  case PGSTAT_MTYPE_TABSTAT:
4558  pgstat_recv_tabstat(&msg.msg_tabstat, len);
4559  break;
4560 
4561  case PGSTAT_MTYPE_TABPURGE:
4562  pgstat_recv_tabpurge(&msg.msg_tabpurge, len);
4563  break;
4564 
4565  case PGSTAT_MTYPE_DROPDB:
4566  pgstat_recv_dropdb(&msg.msg_dropdb, len);
4567  break;
4568 
4570  pgstat_recv_resetcounter(&msg.msg_resetcounter, len);
4571  break;
4572 
4575  &msg.msg_resetsharedcounter,
4576  len);
4577  break;
4578 
4581  &msg.msg_resetsinglecounter,
4582  len);
4583  break;
4584 
4586  pgstat_recv_autovac(&msg.msg_autovacuum_start, len);
4587  break;
4588 
4589  case PGSTAT_MTYPE_VACUUM:
4590  pgstat_recv_vacuum(&msg.msg_vacuum, len);
4591  break;
4592 
4593  case PGSTAT_MTYPE_ANALYZE:
4594  pgstat_recv_analyze(&msg.msg_analyze, len);
4595  break;
4596 
4597  case PGSTAT_MTYPE_ARCHIVER:
4598  pgstat_recv_archiver(&msg.msg_archiver, len);
4599  break;
4600 
4601  case PGSTAT_MTYPE_BGWRITER:
4602  pgstat_recv_bgwriter(&msg.msg_bgwriter, len);
4603  break;
4604 
4605  case PGSTAT_MTYPE_FUNCSTAT:
4606  pgstat_recv_funcstat(&msg.msg_funcstat, len);
4607  break;
4608 
4610  pgstat_recv_funcpurge(&msg.msg_funcpurge, len);
4611  break;
4612 
4615  &msg.msg_recoveryconflict,
4616  len);
4617  break;
4618 
4619  case PGSTAT_MTYPE_DEADLOCK:
4620  pgstat_recv_deadlock(&msg.msg_deadlock, len);
4621  break;
4622 
4623  case PGSTAT_MTYPE_TEMPFILE:
4624  pgstat_recv_tempfile(&msg.msg_tempfile, len);
4625  break;
4626 
4629  &msg.msg_checksumfailure,
4630  len);
4631  break;
4632 
4633  default:
4634  break;
4635  }
4636  } /* end of inner message-processing loop */
4637 
4638  /* Sleep until there's something to do */
4639 #ifndef WIN32
4642  pgStatSock, -1L,
4644 #else
4645 
4646  /*
4647  * Windows, at least in its Windows Server 2003 R2 incarnation,
4648  * sometimes loses FD_READ events. Waking up and retrying the recv()
4649  * fixes that, so don't sleep indefinitely. This is a crock of the
4650  * first water, but until somebody wants to debug exactly what's
4651  * happening there, this is the best we can do. The two-second
4652  * timeout matches our pre-9.2 behavior, and needs to be short enough
4653  * to not provoke "using stale statistics" complaints from
4654  * backend_read_statsfile.
4655  */
4658  pgStatSock,
4659  2 * 1000L /* msec */ ,
4661 #endif
4662 
4663  /*
4664  * Emergency bailout if postmaster has died. This is to avoid the
4665  * necessity for manual cleanup of all postmaster children.
4666  */
4667  if (wr & WL_POSTMASTER_DEATH)
4668  break;
4669  } /* end of outer loop */
4670 
4671  /*
4672  * Save the final stats to reuse at next startup.
4673  */
4674  pgstat_write_statsfiles(true, true);
4675 
4676  exit(0);
4677 }
4678 
4679 /*
4680  * Subroutine to clear stats in a database entry
4681  *
4682  * Tables and functions hashes are initialized to empty.
 *
 * All counters and timestamps of dbentry listed below are set to zero, and
 * dbentry->tables / dbentry->functions are assigned freshly created hash
 * tables keyed by Oid. Nothing here releases hash tables the entry may
 * already reference.
 *
 * NOTE(review): the embedded listing numbers skip 4685 (the function
 * name/parameter line), 4712 (one statement) and 4719/4726 (one argument
 * line in each hash_create call), so those lines are not visible in this
 * copy.
4683  */
4684 static void
4686 {
4687  HASHCTL hash_ctl;
4688 
4689  dbentry->n_xact_commit = 0;
4690  dbentry->n_xact_rollback = 0;
4691  dbentry->n_blocks_fetched = 0;
4692  dbentry->n_blocks_hit = 0;
4693  dbentry->n_tuples_returned = 0;
4694  dbentry->n_tuples_fetched = 0;
4695  dbentry->n_tuples_inserted = 0;
4696  dbentry->n_tuples_updated = 0;
4697  dbentry->n_tuples_deleted = 0;
4698  dbentry->last_autovac_time = 0;
4699  dbentry->n_conflict_tablespace = 0;
4700  dbentry->n_conflict_lock = 0;
4701  dbentry->n_conflict_snapshot = 0;
4702  dbentry->n_conflict_bufferpin = 0;
4703  dbentry->n_conflict_startup_deadlock = 0;
4704  dbentry->n_temp_files = 0;
4705  dbentry->n_temp_bytes = 0;
4706  dbentry->n_deadlocks = 0;
4707  dbentry->n_checksum_failures = 0;
4708  dbentry->last_checksum_failure = 0;
4709  dbentry->n_block_read_time = 0;
4710  dbentry->n_block_write_time = 0;
4711 
4713  dbentry->stats_timestamp = 0;
4714 
 /* Hash of per-table stats entries, keyed by Oid. */
4715  memset(&hash_ctl, 0, sizeof(hash_ctl));
4716  hash_ctl.keysize = sizeof(Oid);
4717  hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
4718  dbentry->tables = hash_create("Per-database table",
4720  &hash_ctl,
4721  HASH_ELEM | HASH_BLOBS);
4722 
 /* Hash of per-function stats entries; hash_ctl is reused, not re-zeroed. */
4723  hash_ctl.keysize = sizeof(Oid);
4724  hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry);
4725  dbentry->functions = hash_create("Per-database function",
4727  &hash_ctl,
4728  HASH_ELEM | HASH_BLOBS);
4729 }
4730 
4731 /*
4732  * Lookup the hash table entry for the specified database. If no hash
4733  * table entry exists, initialize it, if the create parameter is true.
4734  * Else, return NULL.
4735  */
4736 static PgStat_StatDBEntry *
4737 pgstat_get_db_entry(Oid databaseid, bool create)
4738 {
4739  PgStat_StatDBEntry *result;
4740  bool found;
4741  HASHACTION action = (create ? HASH_ENTER : HASH_FIND);
4742 
4743  /* Lookup or create the hash table entry for this database */
4744  result = (PgStat_StatDBEntry *) hash_search(pgStatDBHash,
4745  &databaseid,
4746  action, &found);
4747 
4748  if (!create && !found)
4749  return NULL;
4750 
4751  /*
4752  * If not found, initialize the new one. This creates empty hash tables
4753  * for tables and functions, too.
4754  */
4755  if (!found)
4756  reset_dbentry_counters(result);
4757 
4758  return result;
4759 }
4760 
4761 
4762 /*
4763  * Lookup the hash table entry for the specified table. If no hash
4764  * table entry exists, initialize it, if the create parameter is true.
4765  * Else, return NULL.
4766  */
4767 static PgStat_StatTabEntry *
4768 pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create)
4769 {
4770  PgStat_StatTabEntry *result;
4771  bool found;
4772  HASHACTION action = (create ? HASH_ENTER : HASH_FIND);
4773 
4774  /* Lookup or create the hash table entry for this table */
4775  result = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
4776  &tableoid,
4777  action, &found);
4778 
4779  if (!create && !found)
4780  return NULL;
4781 
4782  /* If not found, initialize the new one. */
4783  if (!found)
4784  {
4785  result->numscans = 0;
4786  result->tuples_returned = 0;
4787  result->tuples_fetched = 0;
4788  result->tuples_inserted = 0;
4789  result->tuples_updated = 0;
4790  result->tuples_deleted = 0;
4791  result->tuples_hot_updated = 0;
4792  result->n_live_tuples = 0;
4793  result->n_dead_tuples = 0;
4794  result->changes_since_analyze = 0;
4795  result->blocks_fetched = 0;
4796  result->blocks_hit = 0;
4797  result->vacuum_timestamp = 0;
4798  result->vacuum_count = 0;
4799  result->autovac_vacuum_timestamp = 0;
4800  result->autovac_vacuum_count = 0;
4801  result->analyze_timestamp = 0;
4802  result->analyze_count = 0;
4803  result->autovac_analyze_timestamp = 0;
4804  result->autovac_analyze_count = 0;
4805  }
4806 
4807  return result;
4808 }
4809 
4810 
4811 /* ----------
4812  * pgstat_write_statsfiles() -
4813  * Write the global statistics file, as well as requested DB files.
4814  *
4815  * 'permanent' specifies writing to the permanent files not temporary ones.
4816  * When true (happens only when the collector is shutting down), also remove
4817  * the temporary files so that backends starting up under a new postmaster
4818  * can't read old data before the new collector is ready.
4819  *
4820  * When 'allDbs' is false, only the requested databases (listed in
4821  * pending_write_requests) will be written; otherwise, all databases
4822  * will be written.
 *
 * Layout of the global file, in the order it is written below:
 *   - an int32 format ID (PGSTAT_FILE_FORMAT_ID)
 *   - the globalStats struct
 *   - the archiverStats struct
 *   - for every database in pgStatDBHash: the byte 'D' followed by the
 *     part of its PgStat_StatDBEntry that precedes the 'tables' member
 *   - the byte 'E' as end marker
 *
 * The data goes to a temp file that is renamed over the target only if
 * all writes and the close succeeded; otherwise the problem is logged at
 * LOG level and the temp file is removed. Nothing is reported back to the
 * caller either way.
 *
 * NOTE(review): this listing skips original lines 4845, 4912, 4921 and
 * 4929 (each directly after an "ereport(LOG," line), so those calls are
 * incomplete as shown here -- restore them from the upstream file.
4823  * ----------
4824  */
4825 static void
4826 pgstat_write_statsfiles(bool permanent, bool allDbs)
4827 {
4828  HASH_SEQ_STATUS hstat;
4829  PgStat_StatDBEntry *dbentry;
4830  FILE *fpout;
4831  int32 format_id;
4832  const char *tmpfile = permanent ? PGSTAT_STAT_PERMANENT_TMPFILE : pgstat_stat_tmpname;
4833  const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
4834  int rc;
4835
4836  elog(DEBUG2, "writing stats file \"%s\"", statfile);
4837
4838  /*
4839  * Open the statistics temp file to write out the current values.
4840  */
4841  fpout = AllocateFile(tmpfile, PG_BINARY_W);
4842  if (fpout == NULL)
4843  {
4844  ereport(LOG,
4846  errmsg("could not open temporary statistics file \"%s\": %m",
4847  tmpfile)));
 /* Early exit: pending_write_requests is left exactly as it was */
4848  return;
4849  }
4850
4851  /*
4852  * Set the timestamp of the stats file.
4853  */
4854  globalStats.stats_timestamp = GetCurrentTimestamp();
4855
4856  /*
4857  * Write the file header --- currently just a format ID.
4858  */
4859  format_id = PGSTAT_FILE_FORMAT_ID;
4860  rc = fwrite(&format_id, sizeof(format_id), 1, fpout);
4861  (void) rc; /* we'll check for error with ferror */
4862
4863  /*
4864  * Write global stats struct
4865  */
4866  rc = fwrite(&globalStats, sizeof(globalStats), 1, fpout);
4867  (void) rc; /* we'll check for error with ferror */
4868
4869  /*
4870  * Write archiver stats struct
4871  */
4872  rc = fwrite(&archiverStats, sizeof(archiverStats), 1, fpout);
4873  (void) rc; /* we'll check for error with ferror */
4874
4875  /*
4876  * Walk through the database table.
4877  */
4878  hash_seq_init(&hstat, pgStatDBHash);
4879  while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL)
4880  {
4881  /*
4882  * Write out the table and function stats for this DB into the
4883  * appropriate per-DB stat file, if required.
4884  */
4885  if (allDbs || pgstat_db_requested(dbentry->databaseid))
4886  {
4887  /* Make DB's timestamp consistent with the global stats */
4888  dbentry->stats_timestamp = globalStats.stats_timestamp;
4889
4890  pgstat_write_db_statsfile(dbentry, permanent);
4891  }
4892
4893  /*
4894  * Write out the DB entry. We don't write the tables or functions
4895  * pointers, since they're of no use to any other process.
 * Every database gets a 'D' record here, including those whose
 * per-DB file was not rewritten in this round.
4896  */
4897  fputc('D', fpout);
4898  rc = fwrite(dbentry, offsetof(PgStat_StatDBEntry, tables), 1, fpout);
4899  (void) rc; /* we'll check for error with ferror */
4900  }
4901
4902  /*
4903  * No more output to be done. Close the temp file and replace the old
4904  * pgstat.stat with it. The ferror() check replaces testing for error
4905  * after each individual fputc or fwrite above.
4906  */
4907  fputc('E', fpout);
4908
4909  if (ferror(fpout))
4910  {
4911  ereport(LOG,
4913  errmsg("could not write temporary statistics file \"%s\": %m",
4914  tmpfile)));
4915  FreeFile(fpout);
4916  unlink(tmpfile);
4917  }
4918  else if (FreeFile(fpout) < 0)
4919  {
4920  ereport(LOG,
4922  errmsg("could not close temporary statistics file \"%s\": %m",
4923  tmpfile)));
4924  unlink(tmpfile);
4925  }
4926  else if (rename(tmpfile, statfile) < 0)
4927  {
4928  ereport(LOG,
4930  errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m",
4931  tmpfile, statfile)));
4932  unlink(tmpfile);
4933  }
4934
 /*
 * This sits outside the if/else chain above, so the non-permanent file
 * is removed whether or not writing the permanent one succeeded.
 */
4935  if (permanent)
4936  unlink(pgstat_stat_filename);
4937
4938  /*
4939  * Now throw away the list of requests. Note that requests sent after we
4940  * started the write are still waiting on the network socket.
4941  */
4942  list_free(pending_write_requests);
4943  pending_write_requests = NIL;
4944 }
4945 
4946 /*
4947  * return the filename for a DB stat file; filename is the output buffer,
4948  * of length len.
 *
 * The name is built as "<directory>/db_<databaseid>.<suffix>":
 *   - 'permanent' true selects PGSTAT_STAT_PERMANENT_DIRECTORY as the
 *     directory; the alternative used when it is false is on a line this
 *     listing dropped (original line 4959) -- NOTE(review): restore it
 *     from the upstream file.
 *   - 'tempname' true selects the suffix "tmp", false selects "stat".
 *
 * If the formatted name does not fit into 'len' bytes (snprintf reports a
 * length >= len), elog(ERROR) is raised with "overlength pgstat path".
4949  */
4950 static void
4951 get_dbstat_filename(bool permanent, bool tempname, Oid databaseid,
4952  char *filename, int len)
4953 {
4954  int printed;
4955
4956  /* NB -- pgstat_reset_remove_files knows about the pattern this uses */
4957  printed = snprintf(filename, len, "%s/db_%u.%s",
4958  permanent ? PGSTAT_STAT_PERMANENT_DIRECTORY :
4960  databaseid,
4961  tempname ? "tmp" : "stat");
 /* snprintf returns the untruncated length, so >= len means truncation */
4962  if (printed >= len)
4963  elog(ERROR, "overlength pgstat path");
4964 }
4965 
4966 /* ----------
4967  * pgstat_write_db_statsfile() -
4968  * Write the stat file for a single database.
4969  *
4970  * If writing to the permanent file (happens when the collector is
4971  * shutting down only), remove the temporary file so that backends
4972  * starting up under a new postmaster can't read the old data before
4973  * the new collector is ready.
 *
 * Layout of the per-database file, in the order it is written below:
 *   - an int32 format ID (PGSTAT_FILE_FORMAT_ID)
 *   - for each entry of dbentry->tables: the byte 'T' followed by the
 *     whole PgStat_StatTabEntry
 *   - for each entry of dbentry->functions: the byte 'F' followed by the
 *     whole PgStat_StatFuncEntry
 *   - the byte 'E' as end marker
 *
 * Both file names come from get_dbstat_filename(). Failures are logged at
 * LOG level and the temp file is removed; nothing is reported back to the
 * caller.
 *
 * NOTE(review): the line carrying the function name and parameter list
 * (original line 4977) is absent from this listing, as are lines 5002,
 * 5047, 5056 and 5064 inside the ereport calls. The body uses 'dbentry'
 * and 'permanent' as parameters -- restore the missing lines from the
 * upstream file.
4974  * ----------
4975  */
4976 static void
4978 {
4979  HASH_SEQ_STATUS tstat;
4980  HASH_SEQ_STATUS fstat;
4981  PgStat_StatTabEntry *tabentry;
4982  PgStat_StatFuncEntry *funcentry;
4983  FILE *fpout;
4984  int32 format_id;
4985  Oid dbid = dbentry->databaseid;
4986  int rc;
4987  char tmpfile[MAXPGPATH];
4988  char statfile[MAXPGPATH];
4989
 /* Same directory for both names; only the suffix (tmp vs. stat) differs */
4990  get_dbstat_filename(permanent, true, dbid, tmpfile, MAXPGPATH);
4991  get_dbstat_filename(permanent, false, dbid, statfile, MAXPGPATH);
4992
4993  elog(DEBUG2, "writing stats file \"%s\"", statfile);
4994
4995  /*
4996  * Open the statistics temp file to write out the current values.
4997  */
4998  fpout = AllocateFile(tmpfile, PG_BINARY_W);
4999  if (fpout == NULL)
5000  {
5001  ereport(LOG,
5003  errmsg("could not open temporary statistics file \"%s\": %m",
5004  tmpfile)));
5005  return;
5006  }
5007
5008  /*
5009  * Write the file header --- currently just a format ID.
5010  */
5011  format_id = PGSTAT_FILE_FORMAT_ID;
5012  rc = fwrite(&format_id, sizeof(format_id), 1, fpout);
5013  (void) rc; /* we'll check for error with ferror */
5014
5015  /*
5016  * Walk through the database's access stats per table.
5017  */
5018  hash_seq_init(&tstat, dbentry->tables);
5019  while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&tstat)) != NULL)
5020  {
5021  fputc('T', fpout);
5022  rc = fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout);
5023  (void) rc; /* we'll check for error with ferror */
5024  }
5025
5026  /*
5027  * Walk through the database's function stats table.
5028  */
5029  hash_seq_init(&fstat, dbentry->functions);
5030  while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&fstat)) != NULL)
5031  {
5032  fputc('F', fpout);
5033  rc = fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout);
5034  (void) rc; /* we'll check for error with ferror */
5035  }
5036
5037  /*
5038  * No more output to be done. Close the temp file and replace the old
5039  * per-database stats file (statfile) with it. The ferror() check replaces
5040  * testing for error after each individual fputc or fwrite above.
5041  */
5042  fputc('E', fpout);
5043
5044  if (ferror(fpout))
5045  {
5046  ereport(LOG,
5048  errmsg("could not write temporary statistics file \"%s\": %m",
5049  tmpfile)));
5050  FreeFile(fpout);
5051  unlink(tmpfile);
5052  }
5053  else if (FreeFile(fpout) < 0)
5054  {
5055  ereport(LOG,
5057  errmsg("could not close temporary statistics file \"%s\": %m",
5058  tmpfile)));
5059  unlink(tmpfile);
5060  }
5061  else if (rename(tmpfile, statfile) < 0)
5062  {
5063  ereport(LOG,
5065  errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m",
5066  tmpfile, statfile)));
5067  unlink(tmpfile);
5068  }
5069
 /*
 * When the permanent file was the target, drop the non-permanent "stat"
 * file of this database as well. 'statfile' is reused for that name.
 * This runs regardless of whether the write above succeeded.
 */
5070  if (permanent)
5071  {
5072  get_dbstat_filename(false, false, dbid, statfile, MAXPGPATH);
5073
5074  elog(DEBUG2, "removing temporary stats file \"%s\"", statfile);
5075  unlink(statfile);
5076  }
5077 }
5078 
5079 /* ----------
5080  * pgstat_read_statsfiles() -
5081  *
5082  * Reads in some existing statistics collector files and returns the
5083  * databases hash table that is the top level of the data.
5084  *
5085  * If 'onlydb' is not InvalidOid, it means we only want data for that DB
5086  * plus the shared catalogs ("DB 0"). We'll still populate the DB hash
5087  * table for all databases, but we don't bother even creating table/function
5088  * hash tables for other databases.
5089  *
5090  * 'permanent' specifies reading from the permanent files not temporary ones.
5091  * When true (happens only when the collector is starting up), remove the
5092  * files after reading; the in-memory status is now authoritative, and the
5093  * files would be out of date in case somebody else reads them.
5094  *
5095  * If a 'deep' read is requested, table/function stats are read, otherwise
5096  * the table/function hash tables remain empty.
 *
 * A hash table is always returned. If the file cannot be opened it is
 * empty; if the file turns out to be corrupted part-way through, it holds
 * the database entries read up to that point.
 *
 * Side effects: the file-level variables globalStats and archiverStats
 * are zeroed and then overwritten with what the file contains.
 *
 * NOTE(review): this listing dropped a number of original lines in this
 * function (for example 5114, 5124, 5152-5153, 5188, 5249, 5285 and the
 * first line of each "corrupted statistics file" report). Several
 * statements therefore appear without their opening call or guarding
 * condition -- restore them from the upstream file before relying on the
 * exact control flow.
5097  * ----------
5098  */
5099 static HTAB *
5100 pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
5101 {
5102  PgStat_StatDBEntry *dbentry;
5103  PgStat_StatDBEntry dbbuf;
5104  HASHCTL hash_ctl;
5105  HTAB *dbhash;
5106  FILE *fpin;
5107  int32 format_id;
5108  bool found;
5109  const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
5110
5111  /*
5112  * The tables will live in pgStatLocalContext.
5113  */
5115
5116  /*
5117  * Create the DB hashtable
5118  */
5119  memset(&hash_ctl, 0, sizeof(hash_ctl));
5120  hash_ctl.keysize = sizeof(Oid);
5121  hash_ctl.entrysize = sizeof(PgStat_StatDBEntry);
5122  hash_ctl.hcxt = pgStatLocalContext;
5123  dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl,
5125
5126  /*
5127  * Clear out global and archiver statistics so they start from zero in
5128  * case we can't load an existing statsfile.
5129  */
5130  memset(&globalStats, 0, sizeof(globalStats));
5131  memset(&archiverStats, 0, sizeof(archiverStats));
5132
5133  /*
5134  * Set the current timestamp (will be kept only in case we can't load an
5135  * existing statsfile).
5136  */
5137  globalStats.stat_reset_timestamp = GetCurrentTimestamp();
5138  archiverStats.stat_reset_timestamp = globalStats.stat_reset_timestamp;
5139
5140  /*
5141  * Try to open the stats file. If it doesn't exist, the backends simply
5142  * return zero for anything and the collector simply starts from scratch
5143  * with empty counters.
5144  *
5145  * ENOENT is a possibility if the stats collector is not running or has
5146  * not yet written the stats file the first time. Any other failure
5147  * condition is suspicious.
5148  */
5149  if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
5150  {
5151  if (errno != ENOENT)
5154  errmsg("could not open statistics file \"%s\": %m",
5155  statfile)));
 /* Returns before the "done" label, so nothing is unlinked here */
5156  return dbhash;
5157  }
5158
5159  /*
5160  * Verify it's of the expected format.
5161  */
5162  if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
5163  format_id != PGSTAT_FILE_FORMAT_ID)
5164  {
5166  (errmsg("corrupted statistics file \"%s\"", statfile)));
5167  goto done;
5168  }
5169
5170  /*
5171  * Read global stats struct
5172  */
5173  if (fread(&globalStats, 1, sizeof(globalStats), fpin) != sizeof(globalStats))
5174  {
5176  (errmsg("corrupted statistics file \"%s\"", statfile)));
 /* Discard the partial read; this also clears stat_reset_timestamp */
5177  memset(&globalStats, 0, sizeof(globalStats));
5178  goto done;
5179  }
5180
5181  /*
5182  * In the collector, disregard the timestamp we read from the permanent
5183  * stats file; we should be willing to write a temp stats file immediately
5184  * upon the first request from any backend. This only matters if the old
5185  * file's timestamp is less than PGSTAT_STAT_INTERVAL ago, but that's not
5186  * an unusual scenario.
5187  */
5189  globalStats.stats_timestamp = 0;
5190
5191  /*
5192  * Read archiver stats struct
5193  */
5194  if (fread(&archiverStats, 1, sizeof(archiverStats), fpin) != sizeof(archiverStats))
5195  {
5197  (errmsg("corrupted statistics file \"%s\"", statfile)));
5198  memset(&archiverStats, 0, sizeof(archiverStats));
5199  goto done;
5200  }
5201
5202  /*
5203  * We found an existing collector stats file. Read it and put all the
5204  * hashtable entries into place.
 * Each record starts with a one-byte tag; anything other than 'D' or
 * 'E' (including the EOF value returned by fgetc) lands in the default
 * branch and is treated as corruption.
5205  */
5206  for (;;)
5207  {
5208  switch (fgetc(fpin))
5209  {
5210  /*
5211  * 'D' A PgStat_StatDBEntry struct describing a database
5212  * follows.
5213  */
5214  case 'D':
5215  if (fread(&dbbuf, 1, offsetof(PgStat_StatDBEntry, tables),
5216  fpin) != offsetof(PgStat_StatDBEntry, tables))
5217  {
5219  (errmsg("corrupted statistics file \"%s\"",
5220  statfile)));
5221  goto done;
5222  }
5223
5224  /*
5225  * Add to the DB hash
 * (a database ID that is already present counts as corruption)
5226  */
5227  dbentry = (PgStat_StatDBEntry *) hash_search(dbhash,
5228  (void *) &dbbuf.databaseid,
5229  HASH_ENTER,
5230  &found);
5231  if (found)
5232  {
5234  (errmsg("corrupted statistics file \"%s\"",
5235  statfile)));
5236  goto done;
5237  }
5238
 /*
 * Only the bytes before 'tables' were read into dbbuf, yet the
 * whole struct is copied; the tables and functions pointers are
 * therefore overwritten with NULL right away.
 */
5239  memcpy(dbentry, &dbbuf, sizeof(PgStat_StatDBEntry));
5240  dbentry->tables = NULL;
5241  dbentry->functions = NULL;
5242
5243  /*
5244  * In the collector, disregard the timestamp we read from the
5245  * permanent stats file; we should be willing to write a temp
5246  * stats file immediately upon the first request from any
5247  * backend.
5248  */
5250  dbentry->stats_timestamp = 0;
5251
5252  /*
5253  * Don't create tables/functions hashtables for uninteresting
5254  * databases.
 * The break below leaves only the switch; the enclosing loop
 * then continues with the next record.
5255  */
5256  if (onlydb != InvalidOid)
5257  {
5258  if (dbbuf.databaseid != onlydb &&
5259  dbbuf.databaseid != InvalidOid)
5260  break;
5261  }
5262
5263  memset(&hash_ctl, 0, sizeof(hash_ctl));
5264  hash_ctl.keysize = sizeof(Oid);
5265  hash_ctl.entrysize = sizeof(PgStat_StatTabEntry);
5266  hash_ctl.hcxt = pgStatLocalContext;
5267  dbentry->tables = hash_create("Per-database table",
5269  &hash_ctl,
5271
 /* hash_ctl is reused for the function hash without clearing it again */
5272  hash_ctl.keysize = sizeof(Oid);
5273  hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry);
5274  hash_ctl.hcxt = pgStatLocalContext;
5275  dbentry->functions = hash_create("Per-database function",
5277  &hash_ctl,
5279
5280  /*
5281  * If requested, read the data from the database-specific
5282  * file. Otherwise we just leave the hashtables empty.
5283  */
5284  if (deep)
5286  dbentry->tables,
5287  dbentry->functions,
5288  permanent);
5289
5290  break;
5291
 /* 'E' marks the regular end of the file */
5292  case 'E':
5293  goto done;
5294
5295  default:
5297  (errmsg("corrupted statistics file \"%s\"",
5298  statfile)));
5299  goto done;
5300  }
5301  }
5302
5303 done:
5304  FreeFile(fpin);
5305
5306  /* If requested to read the permanent file, also get rid of it. */
5307  if (permanent)
5308  {
5309  elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
5310  unlink(statfile);
5311  }
5312
5313  return dbhash;
5314 }
5315 
5316 
5317 /* ----------
5318  * pgstat_read_db_statsfile() -
5319  *
5320  * Reads in the existing statistics collector file for the given database,
5321  * filling the passed-in tables and functions hash tables.
5322  *
5323  * As in pgstat_read_statsfiles, if the permanent file is requested, it is
5324  * removed after reading.
5325  *
5326  * Note: this code has the ability to skip storing per-table or per-function
5327  * data, if NULL is passed for the corresponding hashtable. That's not used
5328  * at the moment though.
 *
 * The file is expected to start with an int32 equal to
 * PGSTAT_FILE_FORMAT_ID, followed by records tagged 'T' (a
 * PgStat_StatTabEntry) or 'F' (a PgStat_StatFuncEntry), and to end with
 * the tag 'E'. A short read, a duplicate key or any other tag stops the
 * reading; entries stored before that point stay in the hash tables.
 * There is no return value, so callers cannot tell a complete read from
 * a partial one.
 *
 * NOTE(review): the first line of every problem report in this function
 * (original lines 5358-5359, 5371, 5391, 5409, 5425, 5443 and 5459) is
 * missing from this listing -- restore them from the upstream file.
5329  * ----------
5330  */
5331 static void
5332 pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash,
5333  bool permanent)
5334 {
5335  PgStat_StatTabEntry *tabentry;
5336  PgStat_StatTabEntry tabbuf;
5337  PgStat_StatFuncEntry funcbuf;
5338  PgStat_StatFuncEntry *funcentry;
5339  FILE *fpin;
5340  int32 format_id;
5341  bool found;
5342  char statfile[MAXPGPATH];
5343
5344  get_dbstat_filename(permanent, false, databaseid, statfile, MAXPGPATH);
5345
5346  /*
5347  * Try to open the stats file. If it doesn't exist, the backends simply
5348  * return zero for anything and the collector simply starts from scratch
5349  * with empty counters.
5350  *
5351  * ENOENT is a possibility if the stats collector is not running or has
5352  * not yet written the stats file the first time. Any other failure
5353  * condition is suspicious.
5354  */
5355  if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
5356  {
5357  if (errno != ENOENT)
5360  errmsg("could not open statistics file \"%s\": %m",
5361  statfile)));
 /* Returns before the "done" label, so nothing is unlinked here */
5362  return;
5363  }
5364
5365  /*
5366  * Verify it's of the expected format.
5367  */
5368  if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
5369  format_id != PGSTAT_FILE_FORMAT_ID)
5370  {
5372  (errmsg("corrupted statistics file \"%s\"", statfile)));
5373  goto done;
5374  }
5375
5376  /*
5377  * We found an existing collector stats file. Read it and put all the
5378  * hashtable entries into place.
5379  */
5380  for (;;)
5381  {
5382  switch (fgetc(fpin))
5383  {
5384  /*
5385  * 'T' A PgStat_StatTabEntry follows.
5386  */
5387  case 'T':
5388  if (fread(&tabbuf, 1, sizeof(PgStat_StatTabEntry),
5389  fpin) != sizeof(PgStat_StatTabEntry))
5390  {
5392  (errmsg("corrupted statistics file \"%s\"",
5393  statfile)));
5394  goto done;
5395  }
5396
5397  /*
5398  * Skip if table data not wanted.
 * (the record has already been consumed from the file above)
5399  */
5400  if (tabhash == NULL)
5401  break;
5402
5403  tabentry = (PgStat_StatTabEntry *) hash_search(tabhash,
5404  (void *) &tabbuf.tableid,
5405  HASH_ENTER, &found);
5406
 /* The same table ID appearing twice is treated as corruption */
5407  if (found)
5408  {
5410  (errmsg("corrupted statistics file \"%s\"",
5411  statfile)));
5412  goto done;
5413  }
5414
5415  memcpy(tabentry, &tabbuf, sizeof(tabbuf));
5416  break;
5417
5418  /*
5419  * 'F' A PgStat_StatFuncEntry follows.
5420  */
5421  case 'F':
5422  if (fread(&funcbuf, 1, sizeof(PgStat_StatFuncEntry),
5423  fpin) != sizeof(PgStat_StatFuncEntry))
5424  {
5426  (errmsg("corrupted statistics file \"%s\"",
5427  statfile)));
5428  goto done;
5429  }
5430
5431  /*
5432  * Skip if function data not wanted.
5433  */
5434  if (funchash == NULL)
5435  break;
5436
5437  funcentry = (PgStat_StatFuncEntry *) hash_search(funchash,
5438  (void *) &funcbuf.functionid,
5439  HASH_ENTER, &found);
5440
 /* The same function ID appearing twice is treated as corruption */
5441  if (found)
5442  {
5444  (errmsg("corrupted statistics file \"%s\"",
5445  statfile)));
5446  goto done;
5447  }
5448
5449  memcpy(funcentry, &funcbuf, sizeof(funcbuf));
5450  break;
5451
5452  /*
5453  * 'E' The EOF marker of a complete stats file.
5454  */
5455  case 'E':
5456  goto done;
5457
 /* Any other tag, including the EOF value from fgetc, ends up here */
5458  default:
5460  (errmsg("corrupted statistics file \"%s\"",
5461  statfile)));
5462  goto done;
5463  }
5464  }
5465
5466 done:
5467  FreeFile(fpin);
5468
 /* The permanent file is removed even if it turned out to be corrupted */
5469  if (permanent)
5470  {
5471  elog(DEBUG2, "removing permanent stats file \"%s\"", statfile);
5472  unlink(statfile);
5473  }
5474 }
5475 
5476 /* ----------
5477  * pgstat_read_db_statsfile_timestamp() -
5478  *
5479  * Attempt to determine the timestamp of the last db statfile write.
5480  * Returns true if successful; the timestamp is stored in *ts.
5481  *
5482  * This needs to be careful about handling databases for which no stats file
5483  * exists, such as databases without a stat entry or those not yet written:
5484  *
5485  * - if there's a database entry in the global file, return the corresponding
5486  * stats_timestamp value.
5487  *
5488  * - if there's no db stat entry (e.g. for a new or inactive database),
5489  * there's no stats_timestamp value, but also nothing to write so we return
5490  * the timestamp of the global statfile.
 *
 * Returns false, without touching *ts, when the file cannot be opened or
 * when the format ID, the global stats struct or the archiver stats
 * struct cannot be read. The global and archiver structs are read into
 * local variables only.
 *
 * NOTE(review): once the header has been read, a truncated 'D' record or
 * an unknown tag is reported but still ends with "return true", leaving
 * *ts at the global file timestamp -- confirm that treating a damaged
 * file as success is intended.
 *
 * NOTE(review): the first line of every problem report in this function
 * (original lines 5511-5512, 5524, 5536, 5548, 5573 and 5595) is missing
 * from this listing -- restore them from the upstream file.
5491  * ----------
5492  */
5493 static bool
5494 pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent,
5495  TimestampTz *ts)
5496 {
5497  PgStat_StatDBEntry dbentry;
5498  PgStat_GlobalStats myGlobalStats;
5499  PgStat_ArchiverStats myArchiverStats;
5500  FILE *fpin;
5501  int32 format_id;
5502  const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
5503
5504  /*
5505  * Try to open the stats file. As above, anything but ENOENT is worthy of
5506  * complaining about.
5507  */
5508  if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL)
5509  {
5510  if (errno != ENOENT)
5513  errmsg("could not open statistics file \"%s\": %m",
5514  statfile)));
5515  return false;
5516  }
5517
5518  /*
5519  * Verify it's of the expected format.
5520  */
5521  if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) ||
5522  format_id != PGSTAT_FILE_FORMAT_ID)
5523  {
5525  (errmsg("corrupted statistics file \"%s\"", statfile)));
5526  FreeFile(fpin);
5527  return false;
5528  }
5529
5530  /*
5531  * Read global stats struct
5532  */
5533  if (fread(&myGlobalStats, 1, sizeof(myGlobalStats),
5534  fpin) != sizeof(myGlobalStats))
5535  {
5537  (errmsg("corrupted statistics file \"%s\"", statfile)));
5538  FreeFile(fpin);
5539  return false;
5540  }
5541
5542  /*
5543  * Read archiver stats struct
 * (its contents are not used; reading it moves past it to the records)
5544  */
5545  if (fread(&myArchiverStats, 1, sizeof(myArchiverStats),
5546  fpin) != sizeof(myArchiverStats))
5547  {
5549  (errmsg("corrupted statistics file \"%s\"", statfile)));
5550  FreeFile(fpin);
5551  return false;
5552  }
5553
5554  /* By default, we're going to return the timestamp of the global file. */
5555  *ts = myGlobalStats.stats_timestamp;
5556
5557  /*
5558  * We found an existing collector stats file. Read it and look for a
5559  * record for the requested database. If found, use its timestamp.
5560  */
5561  for (;;)
5562  {
5563  switch (fgetc(fpin))
5564  {
5565  /*
5566  * 'D' A PgStat_StatDBEntry struct describing a database
5567  * follows.
5568  */
5569  case 'D':
5570  if (fread(&dbentry, 1, offsetof(PgStat_StatDBEntry, tables),
5571  fpin) != offsetof(PgStat_StatDBEntry, tables))
5572  {
5574  (errmsg("corrupted statistics file \"%s\"",
5575  statfile)));
5576  goto done;
5577  }
5578
5579  /*
5580  * If this is the DB we're looking for, save its timestamp and
5581  * we're done.
5582  */
5583  if (dbentry.databaseid == databaseid)
5584  {
5585  *ts = dbentry.stats_timestamp;
5586  goto done;
5587  }
5588
5589  break;
5590
 /* End marker reached without a match: *ts keeps the global timestamp */
5591  case 'E':
5592  goto done;
5593
5594  default:
5596  (errmsg("corrupted statistics file \"%s\"",
5597  statfile)));
5598  goto done;
5599  }
5600  }
5601
5602 done:
5603  FreeFile(fpin);
5604  return true;
5605 }
5606 
5607 /*
5608  * If not already done, read the statistics collector stats file into
5609  * some hash tables. The results will be kept until pgstat_clear_snapshot()
5610  * is called (typically, at end of transaction).
 *
 * Outline of what the body does:
 *   1. Return at once if pgStatDBHash is already set.
 *   2. Poll up to PGSTAT_POLL_LOOP_COUNT times: read the timestamp stored
 *      in the stats file, accept the file if it is recent enough,
 *      otherwise send an inquiry on every PGSTAT_INQ_LOOP_COUNT-th
 *      iteration and sleep.
 *   3. If the loop ran out without accepting a file, log that stale
 *      statistics are being used.
 *   4. Load the file with pgstat_read_statsfiles() into pgStatDBHash.
 *
 * NOTE(review): the line carrying the function name and parameter list
 * (original line 5613) is absent from this listing, as are lines 5623,
 * 5631, 5647, 5672, 5674, 5677 and 5733, which hold conditions and call
 * arguments. Restore them from the upstream file before relying on the
 * exact branch conditions.
5611  */
5612 static void
5614 {
5615  TimestampTz min_ts = 0;
5616  TimestampTz ref_ts = 0;
5617  Oid inquiry_db;
5618  int count;
5619
5620  /* already read it? */
5621  if (pgStatDBHash)
5622  return;
5624
5625  /*
5626  * In a normal backend, we check staleness of the data for our own DB, and
5627  * so we send MyDatabaseId in inquiry messages. In the autovac launcher,
5628  * check staleness of the shared-catalog data, and send InvalidOid in
5629  * inquiry messages so as not to force writing unnecessary data.
5630  */
5632  inquiry_db = InvalidOid;
5633  else
5634  inquiry_db = MyDatabaseId;
5635
5636  /*
5637  * Loop until fresh enough stats file is available or we ran out of time.
5638  * The stats inquiry message is sent repeatedly in case collector drops
5639  * it; but not every single time, as that just swamps the collector.
5640  */
5641  for (count = 0; count < PGSTAT_POLL_LOOP_COUNT; count++)
5642  {
5643  bool ok;
5644  TimestampTz file_ts = 0;
5645  TimestampTz cur_ts;
5646
5648
 /* ok is false when the file could not be opened or its header read */
5649  ok = pgstat_read_db_statsfile_timestamp(inquiry_db, false, &file_ts);
5650
5651  cur_ts = GetCurrentTimestamp();
5652  /* Calculate min acceptable timestamp, if we didn't already */
5653  if (count == 0 || cur_ts < ref_ts)
5654  {
5655  /*
5656  * We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL
5657  * msec before now. This indirectly ensures that the collector
5658  * needn't write the file more often than PGSTAT_STAT_INTERVAL. In
5659  * an autovacuum worker, however, we want a lower delay to avoid
5660  * using stale data, so we use PGSTAT_RETRY_DELAY (since the
5661  * number of workers is low, this shouldn't be a problem).
5662  *
5663  * We don't recompute min_ts after sleeping, except in the
5664  * unlikely case that cur_ts went backwards. So we might end up
5665  * accepting a file a bit older than PGSTAT_STAT_INTERVAL. In
5666  * practice that shouldn't happen, though, as long as the sleep
5667  * time is less than PGSTAT_STAT_INTERVAL; and we don't want to
5668  * tell the collector that our cutoff time is less than what we'd
5669  * actually accept.
5670  */
5671  ref_ts = cur_ts;
5673  min_ts = TimestampTzPlusMilliseconds(ref_ts,
5675  else
5676  min_ts = TimestampTzPlusMilliseconds(ref_ts,
5678  }
5679
5680  /*
5681  * If the file timestamp is actually newer than cur_ts, we must have
5682  * had a clock glitch (system time went backwards) or there is clock
5683  * skew between our processor and the stats collector's processor.
5684  * Accept the file, but send an inquiry message anyway to make
5685  * pgstat_recv_inquiry do a sanity check on the collector's time.
5686  */
5687  if (ok && file_ts > cur_ts)
5688  {
5689  /*
5690  * A small amount of clock skew between processors isn't terribly
5691  * surprising, but a large difference is worth logging. We
5692  * arbitrarily define "large" as 1000 msec.
5693  */
5694  if (file_ts >= TimestampTzPlusMilliseconds(cur_ts, 1000))
5695  {
5696  char *filetime;
5697  char *mytime;
5698
5699  /* Copy because timestamptz_to_str returns a static buffer */
5700  filetime = pstrdup(timestamptz_to_str(file_ts));
5701  mytime = pstrdup(timestamptz_to_str(cur_ts));
5702  elog(LOG, "stats collector's time %s is later than backend local time %s",
5703  filetime, mytime);
5704  pfree(filetime);
5705  pfree(mytime);
5706  }
5707
5708  pgstat_send_inquiry(cur_ts, min_ts, inquiry_db);
5709  break;
5710  }
5711
5712  /* Normal acceptance case: file is not older than cutoff time */
5713  if (ok && file_ts >= min_ts)
5714  break;
5715
5716  /* Not there or too old, so kick the collector and wait a bit */
5717  if ((count % PGSTAT_INQ_LOOP_COUNT) == 0)
5718  pgstat_send_inquiry(cur_ts, min_ts, inquiry_db);
5719
 /* presumably PGSTAT_RETRY_DELAY is in msec and pg_usleep takes usec -- TODO confirm */
5720  pg_usleep(PGSTAT_RETRY_DELAY * 1000L);
5721  }
5722
 /* Both acceptance paths leave via break, so this is true only on timeout */
5723  if (count >= PGSTAT_POLL_LOOP_COUNT)
5724  ereport(LOG,
5725  (errmsg("using stale statistics instead of current ones "
5726  "because stats collector is not responding")));
5727
5728  /*
5729  * Autovacuum launcher wants stats about all databases, but a shallow read
5730  * is sufficient. Regular backends want a deep read for just the tables
5731  * they can see (MyDatabaseId + shared catalogs).
5732  */
5734  pgStatDBHash = pgstat_read_statsfiles(InvalidOid, false, false);
5735  else
5736  pgStatDBHash = pgstat_read_statsfiles(MyDatabaseId, false, true);
5737 }
5738 
5739 
5740 /* ----------
5741  * pgstat_setup_memcxt() -
5742  *
5743  * Create pgStatLocalContext, if not already done.
 *
 * When pgStatLocalContext is still unset, it is created by
 * AllocSetContextCreate() with TopMemoryContext as first argument and
 * the name "Statistics snapshot". Calling this again while the context
 * exists does nothing.
 *
 * NOTE(review): the line carrying the function name and parameter list
 * (original line 5747) and the final argument line of the
 * AllocSetContextCreate call (5752) are absent from this listing --
 * restore them from the upstream file.
5744  * ----------
5745  */
5746 static void
5748 {
5749  if (!pgStatLocalContext)
5750  pgStatLocalContext = AllocSetContextCreate(TopMemoryContext,
5751  "Statistics snapshot",
5753 }
5754 
5755 
5756 /* ----------
5757  * pgstat_clear_snapshot() -
5758  *
5759  * Discard any data collected in the current transaction. Any subsequent
5760  * request will cause new snapshots to be read.
5761  *
5762  * This is also invoked during transaction commit or abort to discard
5763  * the no-longer-wanted snapshot.
 *
 * Concretely: pgStatLocalContext is deleted if it exists, and then
 * pgStatLocalContext, pgStatDBHash, localBackendStatusTable and
 * localNumBackends are reset. Calling this when no snapshot exists is
 * harmless.
 *
 * NOTE(review): the line carrying the function name and parameter list
 * (original line 5767) is absent from this listing -- restore it from
 * the upstream file.
5764  * ----------
5765  */
5766 void
5768 {
5769  /* Release memory, if any was allocated */
5770  if (pgStatLocalContext)
5771  MemoryContextDelete(pgStatLocalContext);
5772
5773  /* Reset variables */
 /*
 * The pointers are only cleared here, not freed one by one; presumably
 * what they pointed to lived in the context deleted above -- TODO confirm
 * for localBackendStatusTable.
 */
5774  pgStatLocalContext = NULL;
5775  pgStatDBHash = NULL;
5776  localBackendStatusTable = NULL;
5777  localNumBackends = 0;
5778 }
5779 
5780 
5781 /* ----------
5782  * pgstat_recv_inquiry() -
5783  *
5784  * Process stat inquiry requests.
 *
 * The outcome is either "do nothing" or "append msg->databaseid to
 * pending_write_requests". The request is appended when:
 *   - no request for that database is pending yet, and
 *   - either we have no entry for the database at all, or the entry's
 *     stats_timestamp is older than msg->cutoff_time, or the collector's
 *     own clock is found to be behind stats_timestamp (which is also
 *     logged).
 * In every other case the message is treated as stale and ignored.
 *
 * NOTE(review): the line carrying the function name and parameter list
 * (original line 5788) is absent from this listing; the body uses a
 * parameter named 'msg' -- restore the line from the upstream file.
5785  * ----------
5786  */
5787 static void
5789 {
5790  PgStat_StatDBEntry *dbentry;
5791
5792  elog(DEBUG2, "received inquiry for database %u", msg->databaseid);
5793
5794  /*
5795  * If there's already a write request for this DB, there's nothing to do.
5796  *
5797  * Note that if a request is found, we return early and skip the below
5798  * check for clock skew. This is okay, since the only way for a DB
5799  * request to be present in the list is that we have been here since the
5800  * last write round. It seems sufficient to check for clock skew once per
5801  * write round.
5802  */
5803  if (list_member_oid(pending_write_requests, msg->databaseid))
5804  return;
5805
5806  /*
5807  * Check to see if we last wrote this database at a time >= the requested
5808  * cutoff time. If so, this is a stale request that was generated before
5809  * we updated the DB file, and we don't need to do so again.
5810  *
5811  * If the requestor's local clock time is older than stats_timestamp, we
5812  * should suspect a clock glitch, ie system time going backwards; though
5813  * the more likely explanation is just delayed message receipt. It is
5814  * worth expending a GetCurrentTimestamp call to be sure, since a large
5815  * retreat in the system clock reading could otherwise cause us to neglect
5816  * to update the stats file for a long time.
5817  */
 /* create = false: a lookup only, no entry is made for an unknown DB */
5818  dbentry = pgstat_get_db_entry(msg->databaseid, false);
5819  if (dbentry == NULL)
5820  {
5821  /*
5822  * We have no data for this DB. Enter a write request anyway so that
5823  * the global stats will get updated. This is needed to prevent
5824  * backend_read_statsfile from waiting for data that we cannot supply,
5825  * in the case of a new DB that nobody has yet reported any stats for.
5826  * See the behavior of pgstat_read_db_statsfile_timestamp.
 *
 * The branch is intentionally empty: control falls through to the
 * lappend_oid() call at the bottom.
5827  */
5828  }
5829  else if (msg->clock_time < dbentry->stats_timestamp)
5830  {
5831  TimestampTz cur_ts = GetCurrentTimestamp();
5832
5833  if (cur_ts < dbentry->stats_timestamp)
5834  {
5835  /*
5836  * Sure enough, time went backwards. Force a new stats file write
5837  * to get back in sync; but first, log a complaint.
5838  */
5839  char *writetime;
5840  char *mytime;
5841
5842  /* Copy because timestamptz_to_str returns a static buffer */
5843  writetime = pstrdup(timestamptz_to_str(dbentry->stats_timestamp));
5844  mytime = pstrdup(timestamptz_to_str(cur_ts));
5845  elog(LOG,
5846  "stats_timestamp %s is later than collector's time %s for database %u",
5847  writetime, mytime, dbentry->databaseid);
5848  pfree(writetime);
5849  pfree(mytime);
5850  }
5851  else
5852  {
5853  /*
5854  * Nope, it's just an old request. Assuming msg's clock_time is
5855  * >= its cutoff_time, it must be stale, so we can ignore it.
5856  */
5857  return;
5858  }
5859  }
5860  else if (msg->cutoff_time <= dbentry->stats_timestamp)
5861  {
5862  /* Stale request, ignore it */
5863  return;
5864  }
5865
5866  /*
5867  * We need to write this DB, so create a request.
5868  */
5869  pending_write_requests = lappend_oid(pending_write_requests,
5870  msg->databaseid);
5871 }
5872 
5873 
5874 /* ----------
5875  * pgstat_recv_tabstat() -
5876  *
5877  * Count what the backend has done.
5878  * ----------
5879  */
5880 static void
5882 {
5883  PgStat_StatDBEntry *dbentry;
5884  PgStat_StatTabEntry *tabentry;
5885  int i;
5886  bool found;
5887 
5888  dbentry = pgstat_get_db_entry(msg->m_databaseid, true);
5889 
5890  /*
5891  * Update database-wide stats.
5892  */
5893  dbentry->n_xact_commit += (PgStat_Counter) (msg->m_xact_commit);
5894  dbentry->n_xact_rollback += (PgStat_Counter) (msg->m_xact_rollback);
5895  dbentry->n_block_read_time += msg->m_block_read_time;
5896  dbentry->n_block_write_time += msg->m_block_write_time;
5897 
5898  /*
5899  * Process all table entries in the message.
5900  */
5901  for (i = 0; i < msg->m_nentries; i++)
5902  {
5903  PgStat_TableEntry *tabmsg = &(msg->m_entry[i]);
5904 
5905  tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables,
5906  (void *) &(tabmsg->t_id),
5907  HASH_ENTER, &found);
5908 
5909  if (!found)
5910  {
5911  /*
5912  * If it's a new table entry, initialize counters to the values we
5913  * just got.
5914  */
5915  tabentry->numscans = tabmsg->t_counts.t_numscans;
5916  tabentry->tuples_returned = tabmsg->t_counts.t_tuples_returned;
5917  tabentry->tuples_fetched = tabmsg->t_counts.t_tuples_fetched;
5918  tabentry->tuples_inserted = tabmsg->t_counts.t_tuples_inserted;
5919  tabentry->tuples_updated = tabmsg->t_counts.t_tuples_updated;
5920  tabentry->tuples_deleted = tabmsg->t_counts.t_tuples_deleted;
5921  tabentry->tuples_hot_updated = tabmsg->t_counts.t_tuples_hot_updated;
5922  tabentry->n_live_tuples = tabmsg->t_counts.t_delta_live_tuples;
5923  tabentry->n_dead_tuples = tabmsg->t_counts.t_delta_dead_tuples;
5924  tabentry->