costsize.c
1 /*-------------------------------------------------------------------------
2  *
3  * costsize.c
4  * Routines to compute (and set) relation sizes and path costs
5  *
6  * Path costs are measured in arbitrary units established by these basic
7  * parameters:
8  *
9  * seq_page_cost Cost of a sequential page fetch
10  * random_page_cost Cost of a non-sequential page fetch
11  * cpu_tuple_cost Cost of typical CPU time to process a tuple
12  * cpu_index_tuple_cost Cost of typical CPU time to process an index tuple
13  * cpu_operator_cost Cost of CPU time to execute an operator or function
14  * parallel_tuple_cost Cost of CPU time to pass a tuple from worker to leader backend
15  * parallel_setup_cost Cost of setting up shared memory for parallelism
16  *
17  * We expect that the kernel will typically do some amount of read-ahead
18  * optimization; this in conjunction with seek costs means that seq_page_cost
19  * is normally considerably less than random_page_cost. (However, if the
20  * database is fully cached in RAM, it is reasonable to set them equal.)
21  *
22  * We also use a rough estimate "effective_cache_size" of the number of
23  * disk pages in Postgres + OS-level disk cache. (We can't simply use
24  * NBuffers for this purpose because that would ignore the effects of
25  * the kernel's disk cache.)
26  *
27  * Obviously, taking constants for these values is an oversimplification,
28  * but it's tough enough to get any useful estimates even at this level of
29  * detail. Note that all of these parameters are user-settable, in case
30  * the default values are drastically off for a particular platform.
31  *
32  * seq_page_cost and random_page_cost can also be overridden for an individual
33  * tablespace, in case some data is on a fast disk and other data is on a slow
34  * disk. Per-tablespace overrides never apply to temporary work files such as
35  * an external sort or a materialize node that overflows work_mem.
36  *
37  * We compute two separate costs for each path:
38  * total_cost: total estimated cost to fetch all tuples
39  * startup_cost: cost that is expended before first tuple is fetched
40  * In some scenarios, such as when there is a LIMIT or we are implementing
41  * an EXISTS(...) sub-select, it is not necessary to fetch all tuples of the
42  * path's result. A caller can estimate the cost of fetching a partial
43  * result by interpolating between startup_cost and total_cost. In detail:
44  * actual_cost = startup_cost +
45  * (total_cost - startup_cost) * tuples_to_fetch / path->rows;
46  * Note that a base relation's rows count (and, by extension, plan_rows for
47  * plan nodes below the LIMIT node) are set without regard to any LIMIT, so
48  * that this equation works properly. (Note: while path->rows is never zero
49  * for ordinary relations, it is zero for paths for provably-empty relations,
50  * so beware of division-by-zero.) The LIMIT is applied as a top-level
51  * plan node.
52  *
53  * For largely historical reasons, most of the routines in this module use
54  * the passed result Path only to store their results (rows, startup_cost and
55  * total_cost) into. All the input data they need is passed as separate
56  * parameters, even though much of it could be extracted from the Path.
57  * An exception is made for the cost_XXXjoin() routines, which expect all
58  * the other fields of the passed XXXPath to be filled in, and similarly
59  * cost_index() assumes the passed IndexPath is valid except for its output
60  * values.
61  *
62  *
63  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
64  * Portions Copyright (c) 1994, Regents of the University of California
65  *
66  * IDENTIFICATION
67  * src/backend/optimizer/path/costsize.c
68  *
69  *-------------------------------------------------------------------------
70  */
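/*
 * [Editorial illustration, not part of the original file] A worked example of
 * the partial-fetch interpolation described above, using hypothetical numbers:
 * suppose a path has startup_cost = 0, total_cost = 1000 and rows = 10000,
 * and a LIMIT needs only 100 tuples.  Then
 *     actual_cost = 0 + (1000 - 0) * 100 / 10000 = 10
 * i.e. the path is credited with only 1% of its run cost.  This is why a plan
 * with a large total_cost but near-zero startup_cost (for instance an index
 * scan that delivers rows in the requested order) can beat a plan that is
 * cheaper in total when only a few rows will actually be fetched.
 */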
71 
72 #include "postgres.h"
73 
74 #include <limits.h>
75 #include <math.h>
76 
77 #include "access/amapi.h"
78 #include "access/htup_details.h"
79 #include "access/tsmapi.h"
80 #include "executor/executor.h"
81 #include "executor/nodeAgg.h"
82 #include "executor/nodeHash.h"
83 #include "executor/nodeMemoize.h"
84 #include "miscadmin.h"
85 #include "nodes/makefuncs.h"
86 #include "nodes/nodeFuncs.h"
87 #include "optimizer/clauses.h"
88 #include "optimizer/cost.h"
89 #include "optimizer/optimizer.h"
90 #include "optimizer/pathnode.h"
91 #include "optimizer/paths.h"
92 #include "optimizer/placeholder.h"
93 #include "optimizer/plancat.h"
94 #include "optimizer/planmain.h"
95 #include "optimizer/restrictinfo.h"
96 #include "parser/parsetree.h"
97 #include "utils/lsyscache.h"
98 #include "utils/selfuncs.h"
99 #include "utils/spccache.h"
100 #include "utils/tuplesort.h"
101 
102 
103 #define LOG2(x) (log(x) / 0.693147180559945)
104 
105 /*
106  * Append and MergeAppend nodes are less expensive than some other operations
107  * which use cpu_tuple_cost; instead of adding a separate GUC, estimate the
108  * per-tuple cost as cpu_tuple_cost multiplied by this value.
109  */
110 #define APPEND_CPU_COST_MULTIPLIER 0.5
111 
112 /*
113  * Maximum value for row estimates. We cap row estimates to this to help
114  * ensure that costs based on these estimates remain within the range of what
115  * double can represent. add_path() wouldn't act sanely given infinite or NaN
116  * cost values.
117  */
118 #define MAXIMUM_ROWCOUNT 1e100
119 
120 double seq_page_cost = DEFAULT_SEQ_PAGE_COST;
121 double random_page_cost = DEFAULT_RANDOM_PAGE_COST;
122 double cpu_tuple_cost = DEFAULT_CPU_TUPLE_COST;
123 double cpu_index_tuple_cost = DEFAULT_CPU_INDEX_TUPLE_COST;
124 double cpu_operator_cost = DEFAULT_CPU_OPERATOR_COST;
125 double parallel_tuple_cost = DEFAULT_PARALLEL_TUPLE_COST;
126 double parallel_setup_cost = DEFAULT_PARALLEL_SETUP_COST;
127 double recursive_worktable_factor = DEFAULT_RECURSIVE_WORKTABLE_FACTOR;
128 
129 int effective_cache_size = DEFAULT_EFFECTIVE_CACHE_SIZE;
130 
131 Cost disable_cost = 1.0e10;
132 
133 int max_parallel_workers_per_gather = 2;
134 
135 bool enable_seqscan = true;
136 bool enable_indexscan = true;
137 bool enable_indexonlyscan = true;
138 bool enable_bitmapscan = true;
139 bool enable_tidscan = true;
140 bool enable_sort = true;
141 bool enable_incremental_sort = true;
142 bool enable_hashagg = true;
143 bool enable_nestloop = true;
144 bool enable_material = true;
145 bool enable_memoize = true;
146 bool enable_mergejoin = true;
147 bool enable_hashjoin = true;
148 bool enable_gathermerge = true;
149 bool enable_partitionwise_join = false;
150 bool enable_partitionwise_aggregate = false;
151 bool enable_parallel_append = true;
152 bool enable_parallel_hash = true;
153 bool enable_partition_pruning = true;
154 bool enable_presorted_aggregate = true;
155 bool enable_async_append = true;
156 
157 typedef struct
158 {
159  PlannerInfo *root;
160  QualCost total;
161 } cost_qual_eval_context;
162 
163 static List *extract_nonindex_conditions(List *qual_clauses, List *indexclauses);
164 static MergeScanSelCache *cached_scansel(PlannerInfo *root,
165  RestrictInfo *rinfo,
166  PathKey *pathkey);
167 static void cost_rescan(PlannerInfo *root, Path *path,
168  Cost *rescan_startup_cost, Cost *rescan_total_cost);
169 static bool cost_qual_eval_walker(Node *node, cost_qual_eval_context *context);
170 static void get_restriction_qual_cost(PlannerInfo *root, RelOptInfo *baserel,
171  ParamPathInfo *param_info,
172  QualCost *qpqual_cost);
173 static bool has_indexed_join_quals(NestPath *path);
174 static double approx_tuple_count(PlannerInfo *root, JoinPath *path,
175  List *quals);
176 static double calc_joinrel_size_estimate(PlannerInfo *root,
177  RelOptInfo *joinrel,
178  RelOptInfo *outer_rel,
179  RelOptInfo *inner_rel,
180  double outer_rows,
181  double inner_rows,
182  SpecialJoinInfo *sjinfo,
183  List *restrictlist);
184 static Selectivity get_foreign_key_join_selectivity(PlannerInfo *root,
185  Relids outer_relids,
186  Relids inner_relids,
187  SpecialJoinInfo *sjinfo,
188  List **restrictlist);
189 static Cost append_nonpartial_cost(List *subpaths, int numpaths,
190  int parallel_workers);
191 static void set_rel_width(PlannerInfo *root, RelOptInfo *rel);
192 static int32 get_expr_width(PlannerInfo *root, const Node *expr);
193 static double relation_byte_size(double tuples, int width);
194 static double page_size(double tuples, int width);
195 static double get_parallel_divisor(Path *path);
196 
197 
198 /*
199  * clamp_row_est
200  * Force a row-count estimate to a sane value.
201  */
202 double
203 clamp_row_est(double nrows)
204 {
205  /*
206  * Avoid infinite and NaN row estimates. Costs derived from such values
207  * are going to be useless. Also force the estimate to be at least one
208  * row, to make explain output look better and to avoid possible
209  * divide-by-zero when interpolating costs. Make it an integer, too.
210  */
211  if (nrows > MAXIMUM_ROWCOUNT || isnan(nrows))
212  nrows = MAXIMUM_ROWCOUNT;
213  else if (nrows <= 1.0)
214  nrows = 1.0;
215  else
216  nrows = rint(nrows);
217 
218  return nrows;
219 }
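/*
 * [Editorial illustration, not part of the original file] Example behavior of
 * clamp_row_est, following the code above: clamp_row_est(0.3) and
 * clamp_row_est(-5) both return 1.0, clamp_row_est(123.6) returns 124, and
 * clamp_row_est(NAN) or anything above 1e100 returns MAXIMUM_ROWCOUNT (1e100).
 */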
220 
221 /*
222  * clamp_width_est
223  * Force a tuple-width estimate to a sane value.
224  *
225  * The planner represents datatype width and tuple width estimates as int32.
226  * When summing column width estimates to create a tuple width estimate,
227  * it's possible to reach integer overflow in edge cases. To ensure sane
228  * behavior, we form such sums in int64 arithmetic and then apply this routine
229  * to clamp to int32 range.
230  */
231 int32
232 clamp_width_est(int64 tuple_width)
233 {
234  /*
235  * Anything more than MaxAllocSize is clearly bogus, since we could not
236  * create a tuple that large.
237  */
238  if (tuple_width > MaxAllocSize)
239  return (int32) MaxAllocSize;
240 
241  /*
242  * Unlike clamp_row_est, we just Assert that the value isn't negative,
243  * rather than masking such errors.
244  */
245  Assert(tuple_width >= 0);
246 
247  return (int32) tuple_width;
248 }
249 
250 /*
251  * clamp_cardinality_to_long
252  * Cast a Cardinality value to a sane long value.
253  */
254 long
255 clamp_cardinality_to_long(Cardinality x)
256 {
257  /*
258  * Just for paranoia's sake, ensure we do something sane with negative or
259  * NaN values.
260  */
261  if (isnan(x))
262  return LONG_MAX;
263  if (x <= 0)
264  return 0;
265 
266  /*
267  * If "long" is 64 bits, then LONG_MAX cannot be represented exactly as a
268  * double. Casting it to double and back may well result in overflow due
269  * to rounding, so avoid doing that. We trust that any double value that
270  * compares strictly less than "(double) LONG_MAX" will cast to a
271  * representable "long" value.
272  */
273  return (x < (double) LONG_MAX) ? (long) x : LONG_MAX;
274 }
275 
276 
277 /*
278  * cost_seqscan
279  * Determines and returns the cost of scanning a relation sequentially.
280  *
281  * 'baserel' is the relation to be scanned
282  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
283  */
284 void
285 cost_seqscan(Path *path, PlannerInfo *root,
286  RelOptInfo *baserel, ParamPathInfo *param_info)
287 {
288  Cost startup_cost = 0;
289  Cost cpu_run_cost;
290  Cost disk_run_cost;
291  double spc_seq_page_cost;
292  QualCost qpqual_cost;
293  Cost cpu_per_tuple;
294 
295  /* Should only be applied to base relations */
296  Assert(baserel->relid > 0);
297  Assert(baserel->rtekind == RTE_RELATION);
298 
299  /* Mark the path with the correct row estimate */
300  if (param_info)
301  path->rows = param_info->ppi_rows;
302  else
303  path->rows = baserel->rows;
304 
305  if (!enable_seqscan)
306  startup_cost += disable_cost;
307 
308  /* fetch estimated page cost for tablespace containing table */
309  get_tablespace_page_costs(baserel->reltablespace,
310  NULL,
311  &spc_seq_page_cost);
312 
313  /*
314  * disk costs
315  */
316  disk_run_cost = spc_seq_page_cost * baserel->pages;
317 
318  /* CPU costs */
319  get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
320 
321  startup_cost += qpqual_cost.startup;
322  cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
323  cpu_run_cost = cpu_per_tuple * baserel->tuples;
324  /* tlist eval costs are paid per output row, not per tuple scanned */
325  startup_cost += path->pathtarget->cost.startup;
326  cpu_run_cost += path->pathtarget->cost.per_tuple * path->rows;
327 
328  /* Adjust costing for parallelism, if used. */
329  if (path->parallel_workers > 0)
330  {
331  double parallel_divisor = get_parallel_divisor(path);
332 
333  /* The CPU cost is divided among all the workers. */
334  cpu_run_cost /= parallel_divisor;
335 
336  /*
337  * It may be possible to amortize some of the I/O cost, but probably
338  * not very much, because most operating systems already do aggressive
339  * prefetching. For now, we assume that the disk run cost can't be
340  * amortized at all.
341  */
342 
343  /*
344  * In the case of a parallel plan, the row count needs to represent
345  * the number of tuples processed per worker.
346  */
347  path->rows = clamp_row_est(path->rows / parallel_divisor);
348  }
349 
350  path->startup_cost = startup_cost;
351  path->total_cost = startup_cost + cpu_run_cost + disk_run_cost;
352 }
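/*
 * [Editorial illustration, not part of the original file] With the default
 * settings (seq_page_cost = 1.0, cpu_tuple_cost = 0.01), a hypothetical table
 * of 10000 pages and 1,000,000 tuples with no quals and a trivial targetlist
 * costs roughly:
 *     disk_run_cost = 1.0 * 10000     = 10000
 *     cpu_run_cost  = 0.01 * 1000000  = 10000
 *     total_cost    = 0 + 10000 + 10000 = 20000
 * which matches the familiar "cost=0.00..20000.00" shape that EXPLAIN would
 * show for an unqualified sequential scan of such a table.
 */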
353 
354 /*
355  * cost_samplescan
356  * Determines and returns the cost of scanning a relation using sampling.
357  *
358  * 'baserel' is the relation to be scanned
359  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
360  */
361 void
362 cost_samplescan(Path *path, PlannerInfo *root,
363  RelOptInfo *baserel, ParamPathInfo *param_info)
364 {
365  Cost startup_cost = 0;
366  Cost run_cost = 0;
367  RangeTblEntry *rte;
368  TableSampleClause *tsc;
369  TsmRoutine *tsm;
370  double spc_seq_page_cost,
371  spc_random_page_cost,
372  spc_page_cost;
373  QualCost qpqual_cost;
374  Cost cpu_per_tuple;
375 
376  /* Should only be applied to base relations with tablesample clauses */
377  Assert(baserel->relid > 0);
378  rte = planner_rt_fetch(baserel->relid, root);
379  Assert(rte->rtekind == RTE_RELATION);
380  tsc = rte->tablesample;
381  Assert(tsc != NULL);
382  tsm = GetTsmRoutine(tsc->tsmhandler);
383 
384  /* Mark the path with the correct row estimate */
385  if (param_info)
386  path->rows = param_info->ppi_rows;
387  else
388  path->rows = baserel->rows;
389 
390  /* fetch estimated page cost for tablespace containing table */
391  get_tablespace_page_costs(baserel->reltablespace,
392  &spc_random_page_cost,
393  &spc_seq_page_cost);
394 
395  /* if NextSampleBlock is used, assume random access, else sequential */
396  spc_page_cost = (tsm->NextSampleBlock != NULL) ?
397  spc_random_page_cost : spc_seq_page_cost;
398 
399  /*
400  * disk costs (recall that baserel->pages has already been set to the
401  * number of pages the sampling method will visit)
402  */
403  run_cost += spc_page_cost * baserel->pages;
404 
405  /*
406  * CPU costs (recall that baserel->tuples has already been set to the
407  * number of tuples the sampling method will select). Note that we ignore
408  * execution cost of the TABLESAMPLE parameter expressions; they will be
409  * evaluated only once per scan, and in most usages they'll likely be
410  * simple constants anyway. We also don't charge anything for the
411  * calculations the sampling method might do internally.
412  */
413  get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
414 
415  startup_cost += qpqual_cost.startup;
416  cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
417  run_cost += cpu_per_tuple * baserel->tuples;
418  /* tlist eval costs are paid per output row, not per tuple scanned */
419  startup_cost += path->pathtarget->cost.startup;
420  run_cost += path->pathtarget->cost.per_tuple * path->rows;
421 
422  path->startup_cost = startup_cost;
423  path->total_cost = startup_cost + run_cost;
424 }
425 
426 /*
427  * cost_gather
428  * Determines and returns the cost of gather path.
429  *
430  * 'rel' is the relation to be operated upon
431  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
432  * 'rows' may be used to point to a row estimate; if non-NULL, it overrides
433  * both 'rel' and 'param_info'. This is useful when the path doesn't exactly
434  * correspond to any particular RelOptInfo.
435  */
436 void
437 cost_gather(GatherPath *path, PlannerInfo *root,
438  RelOptInfo *rel, ParamPathInfo *param_info,
439  double *rows)
440 {
441  Cost startup_cost = 0;
442  Cost run_cost = 0;
443 
444  /* Mark the path with the correct row estimate */
445  if (rows)
446  path->path.rows = *rows;
447  else if (param_info)
448  path->path.rows = param_info->ppi_rows;
449  else
450  path->path.rows = rel->rows;
451 
452  startup_cost = path->subpath->startup_cost;
453 
454  run_cost = path->subpath->total_cost - path->subpath->startup_cost;
455 
456  /* Parallel setup and communication cost. */
457  startup_cost += parallel_setup_cost;
458  run_cost += parallel_tuple_cost * path->path.rows;
459 
460  path->path.startup_cost = startup_cost;
461  path->path.total_cost = (startup_cost + run_cost);
462 }
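/*
 * [Editorial illustration, not part of the original file] With the default
 * parallel_setup_cost = 1000 and parallel_tuple_cost = 0.1, a Gather that is
 * expected to pass 10000 rows up from its workers adds
 *     1000 (startup) + 0.1 * 10000 = 1000 (run)
 * on top of the subpath's own costs.  This overhead is why small queries
 * rarely get parallel plans: the setup and per-tuple transfer charges exceed
 * any CPU savings from dividing the scan among workers.
 */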
463 
464 /*
465  * cost_gather_merge
466  * Determines and returns the cost of gather merge path.
467  *
468  * GatherMerge merges several pre-sorted input streams, using a heap that at
469  * any given instant holds the next tuple from each stream. If there are N
470  * streams, we need about N*log2(N) tuple comparisons to construct the heap at
471  * startup, and then for each output tuple, about log2(N) comparisons to
472  * replace the top heap entry with the next tuple from the same stream.
473  */
474 void
475 cost_gather_merge(GatherMergePath *path, PlannerInfo *root,
476  RelOptInfo *rel, ParamPathInfo *param_info,
477  Cost input_startup_cost, Cost input_total_cost,
478  double *rows)
479 {
480  Cost startup_cost = 0;
481  Cost run_cost = 0;
482  Cost comparison_cost;
483  double N;
484  double logN;
485 
486  /* Mark the path with the correct row estimate */
487  if (rows)
488  path->path.rows = *rows;
489  else if (param_info)
490  path->path.rows = param_info->ppi_rows;
491  else
492  path->path.rows = rel->rows;
493 
494  if (!enable_gathermerge)
495  startup_cost += disable_cost;
496 
497  /*
498  * Add one to the number of workers to account for the leader. This might
499  * be overgenerous since the leader will do less work than other workers
500  * in typical cases, but we'll go with it for now.
501  */
502  Assert(path->num_workers > 0);
503  N = (double) path->num_workers + 1;
504  logN = LOG2(N);
505 
506  /* Assumed cost per tuple comparison */
507  comparison_cost = 2.0 * cpu_operator_cost;
508 
509  /* Heap creation cost */
510  startup_cost += comparison_cost * N * logN;
511 
512  /* Per-tuple heap maintenance cost */
513  run_cost += path->path.rows * comparison_cost * logN;
514 
515  /* small cost for heap management, like cost_merge_append */
516  run_cost += cpu_operator_cost * path->path.rows;
517 
518  /*
519  * Parallel setup and communication cost. Since Gather Merge, unlike
520  * Gather, requires us to block until a tuple is available from every
521  * worker, we bump the IPC cost up a little bit as compared with Gather.
522  * For lack of a better idea, charge an extra 5%.
523  */
524  startup_cost += parallel_setup_cost;
525  run_cost += parallel_tuple_cost * path->path.rows * 1.05;
526 
527  path->path.startup_cost = startup_cost + input_startup_cost;
528  path->path.total_cost = (startup_cost + run_cost + input_total_cost);
529 }
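/*
 * [Editorial illustration, not part of the original file] Example of the heap
 * costing above, assuming 2 workers and the default cpu_operator_cost =
 * 0.0025: N = 3, logN = log2(3) ~= 1.585, comparison_cost = 0.005, so heap
 * creation adds 0.005 * 3 * 1.585 ~= 0.024 to startup, and each output row
 * adds about 0.005 * 1.585 + 0.0025 ~= 0.0104 of heap maintenance, plus the
 * 5%-inflated parallel_tuple_cost of 0.105 per row for worker-to-leader
 * transfer.
 */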
530 
531 /*
532  * cost_index
533  * Determines and returns the cost of scanning a relation using an index.
534  *
535  * 'path' describes the indexscan under consideration, and is complete
536  * except for the fields to be set by this routine
537  * 'loop_count' is the number of repetitions of the indexscan to factor into
538  * estimates of caching behavior
539  *
540  * In addition to rows, startup_cost and total_cost, cost_index() sets the
541  * path's indextotalcost and indexselectivity fields. These values will be
542  * needed if the IndexPath is used in a BitmapIndexScan.
543  *
544  * NOTE: path->indexquals must contain only clauses usable as index
545  * restrictions. Any additional quals evaluated as qpquals may reduce the
546  * number of returned tuples, but they won't reduce the number of tuples
547  * we have to fetch from the table, so they don't reduce the scan cost.
548  */
549 void
550 cost_index(IndexPath *path, PlannerInfo *root, double loop_count,
551  bool partial_path)
552 {
553  IndexOptInfo *index = path->indexinfo;
554  RelOptInfo *baserel = index->rel;
555  bool indexonly = (path->path.pathtype == T_IndexOnlyScan);
556  amcostestimate_function amcostestimate;
557  List *qpquals;
558  Cost startup_cost = 0;
559  Cost run_cost = 0;
560  Cost cpu_run_cost = 0;
561  Cost indexStartupCost;
562  Cost indexTotalCost;
563  Selectivity indexSelectivity;
564  double indexCorrelation,
565  csquared;
566  double spc_seq_page_cost,
567  spc_random_page_cost;
568  Cost min_IO_cost,
569  max_IO_cost;
570  QualCost qpqual_cost;
571  Cost cpu_per_tuple;
572  double tuples_fetched;
573  double pages_fetched;
574  double rand_heap_pages;
575  double index_pages;
576 
577  /* Should only be applied to base relations */
578  Assert(IsA(baserel, RelOptInfo) &&
579  IsA(index, IndexOptInfo));
580  Assert(baserel->relid > 0);
581  Assert(baserel->rtekind == RTE_RELATION);
582 
583  /*
584  * Mark the path with the correct row estimate, and identify which quals
585  * will need to be enforced as qpquals. We need not check any quals that
586  * are implied by the index's predicate, so we can use indrestrictinfo not
587  * baserestrictinfo as the list of relevant restriction clauses for the
588  * rel.
589  */
590  if (path->path.param_info)
591  {
592  path->path.rows = path->path.param_info->ppi_rows;
593  /* qpquals come from the rel's restriction clauses and ppi_clauses */
594  qpquals = list_concat(extract_nonindex_conditions(baserel->baserestrictinfo,
595  path->indexclauses),
596  extract_nonindex_conditions(path->path.param_info->ppi_clauses,
597  path->indexclauses));
598  }
599  else
600  {
601  path->path.rows = baserel->rows;
602  /* qpquals come from just the rel's restriction clauses */
603  qpquals = extract_nonindex_conditions(baserel->baserestrictinfo,
604  path->indexclauses);
605  }
606 
607  if (!enable_indexscan)
608  startup_cost += disable_cost;
609  /* we don't need to check enable_indexonlyscan; indxpath.c does that */
610 
611  /*
612  * Call index-access-method-specific code to estimate the processing cost
613  * for scanning the index, as well as the selectivity of the index (ie,
614  * the fraction of main-table tuples we will have to retrieve) and its
615  * correlation to the main-table tuple order. We need a cast here because
616  * pathnodes.h uses a weak function type to avoid including amapi.h.
617  */
618  amcostestimate = (amcostestimate_function) index->amcostestimate;
619  amcostestimate(root, path, loop_count,
620  &indexStartupCost, &indexTotalCost,
621  &indexSelectivity, &indexCorrelation,
622  &index_pages);
623 
624  /*
625  * Save amcostestimate's results for possible use in bitmap scan planning.
626  * We don't bother to save indexStartupCost or indexCorrelation, because a
627  * bitmap scan doesn't care about either.
628  */
629  path->indextotalcost = indexTotalCost;
630  path->indexselectivity = indexSelectivity;
631 
632  /* all costs for touching index itself included here */
633  startup_cost += indexStartupCost;
634  run_cost += indexTotalCost - indexStartupCost;
635 
636  /* estimate number of main-table tuples fetched */
637  tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);
638 
639  /* fetch estimated page costs for tablespace containing table */
640  get_tablespace_page_costs(baserel->reltablespace,
641  &spc_random_page_cost,
642  &spc_seq_page_cost);
643 
644  /*----------
645  * Estimate number of main-table pages fetched, and compute I/O cost.
646  *
647  * When the index ordering is uncorrelated with the table ordering,
648  * we use an approximation proposed by Mackert and Lohman (see
649  * index_pages_fetched() for details) to compute the number of pages
650  * fetched, and then charge spc_random_page_cost per page fetched.
651  *
652  * When the index ordering is exactly correlated with the table ordering
653  * (just after a CLUSTER, for example), the number of pages fetched should
654  * be exactly selectivity * table_size. What's more, all but the first
655  * will be sequential fetches, not the random fetches that occur in the
656  * uncorrelated case. So if the number of pages is more than 1, we
657  * ought to charge
658  * spc_random_page_cost + (pages_fetched - 1) * spc_seq_page_cost
659  * For partially-correlated indexes, we ought to charge somewhere between
660  * these two estimates. We currently interpolate linearly between the
661  * estimates based on the correlation squared (XXX is that appropriate?).
662  *
663  * If it's an index-only scan, then we will not need to fetch any heap
664  * pages for which the visibility map shows all tuples are visible.
665  * Hence, reduce the estimated number of heap fetches accordingly.
666  * We use the measured fraction of the entire heap that is all-visible,
667  * which might not be particularly relevant to the subset of the heap
668  * that this query will fetch; but it's not clear how to do better.
669  *----------
670  */
671  if (loop_count > 1)
672  {
673  /*
674  * For repeated indexscans, the appropriate estimate for the
675  * uncorrelated case is to scale up the number of tuples fetched in
676  * the Mackert and Lohman formula by the number of scans, so that we
677  * estimate the number of pages fetched by all the scans; then
678  * pro-rate the costs for one scan. In this case we assume all the
679  * fetches are random accesses.
680  */
681  pages_fetched = index_pages_fetched(tuples_fetched * loop_count,
682  baserel->pages,
683  (double) index->pages,
684  root);
685 
686  if (indexonly)
687  pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
688 
689  rand_heap_pages = pages_fetched;
690 
691  max_IO_cost = (pages_fetched * spc_random_page_cost) / loop_count;
692 
693  /*
694  * In the perfectly correlated case, the number of pages touched by
695  * each scan is selectivity * table_size, and we can use the Mackert
696  * and Lohman formula at the page level to estimate how much work is
697  * saved by caching across scans. We still assume all the fetches are
698  * random, though, which is an overestimate that's hard to correct for
699  * without double-counting the cache effects. (But in most cases
700  * where such a plan is actually interesting, only one page would get
701  * fetched per scan anyway, so it shouldn't matter much.)
702  */
703  pages_fetched = ceil(indexSelectivity * (double) baserel->pages);
704 
705  pages_fetched = index_pages_fetched(pages_fetched * loop_count,
706  baserel->pages,
707  (double) index->pages,
708  root);
709 
710  if (indexonly)
711  pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
712 
713  min_IO_cost = (pages_fetched * spc_random_page_cost) / loop_count;
714  }
715  else
716  {
717  /*
718  * Normal case: apply the Mackert and Lohman formula, and then
719  * interpolate between that and the correlation-derived result.
720  */
721  pages_fetched = index_pages_fetched(tuples_fetched,
722  baserel->pages,
723  (double) index->pages,
724  root);
725 
726  if (indexonly)
727  pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
728 
729  rand_heap_pages = pages_fetched;
730 
731  /* max_IO_cost is for the perfectly uncorrelated case (csquared=0) */
732  max_IO_cost = pages_fetched * spc_random_page_cost;
733 
734  /* min_IO_cost is for the perfectly correlated case (csquared=1) */
735  pages_fetched = ceil(indexSelectivity * (double) baserel->pages);
736 
737  if (indexonly)
738  pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
739 
740  if (pages_fetched > 0)
741  {
742  min_IO_cost = spc_random_page_cost;
743  if (pages_fetched > 1)
744  min_IO_cost += (pages_fetched - 1) * spc_seq_page_cost;
745  }
746  else
747  min_IO_cost = 0;
748  }
749 
750  if (partial_path)
751  {
752  /*
753  * For index only scans compute workers based on number of index pages
754  * fetched; the number of heap pages we fetch might be so small as to
755  * effectively rule out parallelism, which we don't want to do.
756  */
757  if (indexonly)
758  rand_heap_pages = -1;
759 
760  /*
761  * Estimate the number of parallel workers required to scan index. Use
762  * the number of heap pages computed considering heap fetches won't be
763  * sequential as for parallel scans the pages are accessed in random
764  * order.
765  */
766  path->path.parallel_workers = compute_parallel_worker(baserel,
767  rand_heap_pages,
768  index_pages,
769  max_parallel_workers_per_gather);
770 
771  /*
772  * Fall out if workers can't be assigned for parallel scan, because in
773  * such a case this path will be rejected. So there is no benefit in
774  * doing extra computation.
775  */
776  if (path->path.parallel_workers <= 0)
777  return;
778 
779  path->path.parallel_aware = true;
780  }
781 
782  /*
783  * Now interpolate based on estimated index order correlation to get total
784  * disk I/O cost for main table accesses.
785  */
786  csquared = indexCorrelation * indexCorrelation;
787 
788  run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost);
789 
790  /*
791  * Estimate CPU costs per tuple.
792  *
793  * What we want here is cpu_tuple_cost plus the evaluation costs of any
794  * qual clauses that we have to evaluate as qpquals.
795  */
796  cost_qual_eval(&qpqual_cost, qpquals, root);
797 
798  startup_cost += qpqual_cost.startup;
799  cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
800 
801  cpu_run_cost += cpu_per_tuple * tuples_fetched;
802 
803  /* tlist eval costs are paid per output row, not per tuple scanned */
804  startup_cost += path->path.pathtarget->cost.startup;
805  cpu_run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;
806 
807  /* Adjust costing for parallelism, if used. */
808  if (path->path.parallel_workers > 0)
809  {
810  double parallel_divisor = get_parallel_divisor(&path->path);
811 
812  path->path.rows = clamp_row_est(path->path.rows / parallel_divisor);
813 
814  /* The CPU cost is divided among all the workers. */
815  cpu_run_cost /= parallel_divisor;
816  }
817 
818  run_cost += cpu_run_cost;
819 
820  path->path.startup_cost = startup_cost;
821  path->path.total_cost = startup_cost + run_cost;
822 }
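/*
 * [Editorial illustration, not part of the original file] Example of the
 * correlation interpolation above, with hypothetical values max_IO_cost = 400
 * (fully random) and min_IO_cost = 103 (fully sequential): an index with
 * indexCorrelation = 0.5 gives csquared = 0.25, so the charged I/O cost is
 *     400 + 0.25 * (103 - 400) = 325.75
 * still much closer to the uncorrelated estimate; only strongly correlated
 * indexes (|correlation| near 1) get credited with near-sequential I/O cost.
 */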
823 
824 /*
825  * extract_nonindex_conditions
826  *
827  * Given a list of quals to be enforced in an indexscan, extract the ones that
828  * will have to be applied as qpquals (ie, the index machinery won't handle
829  * them). Here we detect only whether a qual clause is directly redundant
830  * with some indexclause. If the index path is chosen for use, createplan.c
831  * will try a bit harder to get rid of redundant qual conditions; specifically
832  * it will see if quals can be proven to be implied by the indexquals. But
833  * it does not seem worth the cycles to try to factor that in at this stage,
834  * since we're only trying to estimate qual eval costs. Otherwise this must
835  * match the logic in create_indexscan_plan().
836  *
837  * qual_clauses, and the result, are lists of RestrictInfos.
838  * indexclauses is a list of IndexClauses.
839  */
840 static List *
841 extract_nonindex_conditions(List *qual_clauses, List *indexclauses)
842 {
843  List *result = NIL;
844  ListCell *lc;
845 
846  foreach(lc, qual_clauses)
847  {
848  RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc);
849 
850  if (rinfo->pseudoconstant)
851  continue; /* we may drop pseudoconstants here */
852  if (is_redundant_with_indexclauses(rinfo, indexclauses))
853  continue; /* dup or derived from same EquivalenceClass */
854  /* ... skip the predicate proof attempt createplan.c will try ... */
855  result = lappend(result, rinfo);
856  }
857  return result;
858 }
859 
860 /*
861  * index_pages_fetched
862  * Estimate the number of pages actually fetched after accounting for
863  * cache effects.
864  *
865  * We use an approximation proposed by Mackert and Lohman, "Index Scans
866  * Using a Finite LRU Buffer: A Validated I/O Model", ACM Transactions
867  * on Database Systems, Vol. 14, No. 3, September 1989, Pages 401-424.
868  * The Mackert and Lohman approximation is that the number of pages
869  * fetched is
870  * PF =
871  * min(2TNs/(2T+Ns), T) when T <= b
872  * 2TNs/(2T+Ns) when T > b and Ns <= 2Tb/(2T-b)
873  * b + (Ns - 2Tb/(2T-b))*(T-b)/T when T > b and Ns > 2Tb/(2T-b)
874  * where
875  * T = # pages in table
876  * N = # tuples in table
877  * s = selectivity = fraction of table to be scanned
878  * b = # buffer pages available (we include kernel space here)
879  *
880  * We assume that effective_cache_size is the total number of buffer pages
881  * available for the whole query, and pro-rate that space across all the
882  * tables in the query and the index currently under consideration. (This
883  * ignores space needed for other indexes used by the query, but since we
884  * don't know which indexes will get used, we can't estimate that very well;
885  * and in any case counting all the tables may well be an overestimate, since
886  * depending on the join plan not all the tables may be scanned concurrently.)
887  *
888  * The product Ns is the number of tuples fetched; we pass in that
889  * product rather than calculating it here. "pages" is the number of pages
890  * in the object under consideration (either an index or a table).
891  * "index_pages" is the amount to add to the total table space, which was
892  * computed for us by make_one_rel.
893  *
894  * Caller is expected to have ensured that tuples_fetched is greater than zero
895  * and rounded to integer (see clamp_row_est). The result will likewise be
896  * greater than zero and integral.
897  */
898 double
899 index_pages_fetched(double tuples_fetched, BlockNumber pages,
900  double index_pages, PlannerInfo *root)
901 {
902  double pages_fetched;
903  double total_pages;
904  double T,
905  b;
906 
907  /* T is # pages in table, but don't allow it to be zero */
908  T = (pages > 1) ? (double) pages : 1.0;
909 
910  /* Compute number of pages assumed to be competing for cache space */
911  total_pages = root->total_table_pages + index_pages;
912  total_pages = Max(total_pages, 1.0);
913  Assert(T <= total_pages);
914 
915  /* b is pro-rated share of effective_cache_size */
916  b = (double) effective_cache_size * T / total_pages;
917 
918  /* force it positive and integral */
919  if (b <= 1.0)
920  b = 1.0;
921  else
922  b = ceil(b);
923 
924  /* This part is the Mackert and Lohman formula */
925  if (T <= b)
926  {
927  pages_fetched =
928  (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
929  if (pages_fetched >= T)
930  pages_fetched = T;
931  else
932  pages_fetched = ceil(pages_fetched);
933  }
934  else
935  {
936  double lim;
937 
938  lim = (2.0 * T * b) / (2.0 * T - b);
939  if (tuples_fetched <= lim)
940  {
941  pages_fetched =
942  (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
943  }
944  else
945  {
946  pages_fetched =
947  b + (tuples_fetched - lim) * (T - b) / T;
948  }
949  pages_fetched = ceil(pages_fetched);
950  }
951  return pages_fetched;
952 }
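/*
 * [Editorial illustration, not part of the original file] Worked example of
 * the Mackert and Lohman formula above, using hypothetical values T = 1000
 * pages, b = 250 buffer pages, and Ns = 500 tuples fetched.  Since T > b,
 *     lim = 2Tb/(2T-b) = 500000/1750 ~= 285.7
 * and because Ns = 500 > lim,
 *     PF = b + (Ns - lim)*(T-b)/T = 250 + 214.3 * 0.75 ~= 410.7 -> 411 pages
 * considerably fewer than the 500 page fetches that would be charged without
 * any cache-effect correction.
 */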
953 
954 /*
955  * get_indexpath_pages
956  * Determine the total size of the indexes used in a bitmap index path.
957  *
958  * Note: if the same index is used more than once in a bitmap tree, we will
959  * count it multiple times, which perhaps is the wrong thing ... but it's
960  * not completely clear, and detecting duplicates is difficult, so ignore it
961  * for now.
962  */
963 static double
964 get_indexpath_pages(Path *bitmapqual)
965 {
966  double result = 0;
967  ListCell *l;
968 
969  if (IsA(bitmapqual, BitmapAndPath))
970  {
971  BitmapAndPath *apath = (BitmapAndPath *) bitmapqual;
972 
973  foreach(l, apath->bitmapquals)
974  {
975  result += get_indexpath_pages((Path *) lfirst(l));
976  }
977  }
978  else if (IsA(bitmapqual, BitmapOrPath))
979  {
980  BitmapOrPath *opath = (BitmapOrPath *) bitmapqual;
981 
982  foreach(l, opath->bitmapquals)
983  {
984  result += get_indexpath_pages((Path *) lfirst(l));
985  }
986  }
987  else if (IsA(bitmapqual, IndexPath))
988  {
989  IndexPath *ipath = (IndexPath *) bitmapqual;
990 
991  result = (double) ipath->indexinfo->pages;
992  }
993  else
994  elog(ERROR, "unrecognized node type: %d", nodeTag(bitmapqual));
995 
996  return result;
997 }
998 
999 /*
1000  * cost_bitmap_heap_scan
1001  * Determines and returns the cost of scanning a relation using a bitmap
1002  * index-then-heap plan.
1003  *
1004  * 'baserel' is the relation to be scanned
1005  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1006  * 'bitmapqual' is a tree of IndexPaths, BitmapAndPaths, and BitmapOrPaths
1007  * 'loop_count' is the number of repetitions of the indexscan to factor into
1008  * estimates of caching behavior
1009  *
1010  * Note: the component IndexPaths in bitmapqual should have been costed
1011  * using the same loop_count.
1012  */
1013 void
1014 cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel,
1015  ParamPathInfo *param_info,
1016  Path *bitmapqual, double loop_count)
1017 {
1018  Cost startup_cost = 0;
1019  Cost run_cost = 0;
1020  Cost indexTotalCost;
1021  QualCost qpqual_cost;
1022  Cost cpu_per_tuple;
1023  Cost cost_per_page;
1024  Cost cpu_run_cost;
1025  double tuples_fetched;
1026  double pages_fetched;
1027  double spc_seq_page_cost,
1028  spc_random_page_cost;
1029  double T;
1030 
1031  /* Should only be applied to base relations */
1032  Assert(IsA(baserel, RelOptInfo));
1033  Assert(baserel->relid > 0);
1034  Assert(baserel->rtekind == RTE_RELATION);
1035 
1036  /* Mark the path with the correct row estimate */
1037  if (param_info)
1038  path->rows = param_info->ppi_rows;
1039  else
1040  path->rows = baserel->rows;
1041 
1042  if (!enable_bitmapscan)
1043  startup_cost += disable_cost;
1044 
1045  pages_fetched = compute_bitmap_pages(root, baserel, bitmapqual,
1046  loop_count, &indexTotalCost,
1047  &tuples_fetched);
1048 
1049  startup_cost += indexTotalCost;
1050  T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
1051 
1052  /* Fetch estimated page costs for tablespace containing table. */
1053  get_tablespace_page_costs(baserel->reltablespace,
1054  &spc_random_page_cost,
1055  &spc_seq_page_cost);
1056 
1057  /*
1058  * For small numbers of pages we should charge spc_random_page_cost
1059  * apiece, while if nearly all the table's pages are being read, it's more
1060  * appropriate to charge spc_seq_page_cost apiece. The effect is
1061  * nonlinear, too. For lack of a better idea, interpolate like this to
1062  * determine the cost per page.
1063  */
1064  if (pages_fetched >= 2.0)
1065  cost_per_page = spc_random_page_cost -
1066  (spc_random_page_cost - spc_seq_page_cost)
1067  * sqrt(pages_fetched / T);
1068  else
1069  cost_per_page = spc_random_page_cost;
1070 
1071  run_cost += pages_fetched * cost_per_page;
1072 
1073  /*
1074  * Estimate CPU costs per tuple.
1075  *
1076  * Often the indexquals don't need to be rechecked at each tuple ... but
1077  * not always, especially not if there are enough tuples involved that the
1078  * bitmaps become lossy. For the moment, just assume they will be
1079  * rechecked always. This means we charge the full freight for all the
1080  * scan clauses.
1081  */
1082  get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1083 
1084  startup_cost += qpqual_cost.startup;
1085  cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1086  cpu_run_cost = cpu_per_tuple * tuples_fetched;
1087 
1088  /* Adjust costing for parallelism, if used. */
1089  if (path->parallel_workers > 0)
1090  {
1091  double parallel_divisor = get_parallel_divisor(path);
1092 
1093  /* The CPU cost is divided among all the workers. */
1094  cpu_run_cost /= parallel_divisor;
1095 
1096  path->rows = clamp_row_est(path->rows / parallel_divisor);
1097  }
1098 
1099 
1100  run_cost += cpu_run_cost;
1101 
1102  /* tlist eval costs are paid per output row, not per tuple scanned */
1103  startup_cost += path->pathtarget->cost.startup;
1104  run_cost += path->pathtarget->cost.per_tuple * path->rows;
1105 
1106  path->startup_cost = startup_cost;
1107  path->total_cost = startup_cost + run_cost;
1108 }
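/*
 * [Editorial illustration, not part of the original file] Example of the
 * per-page interpolation above, with the default spc_random_page_cost = 4.0
 * and spc_seq_page_cost = 1.0: fetching 100 pages of a hypothetical
 * 10000-page table gives
 *     cost_per_page = 4.0 - (4.0 - 1.0) * sqrt(100/10000) = 4.0 - 0.3 = 3.7
 * while fetching 9000 of the 10000 pages gives 4.0 - 3.0 * sqrt(0.9) ~= 1.15,
 * approaching the sequential rate as the scan covers most of the table.
 */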
1109 
1110 /*
1111  * cost_bitmap_tree_node
1112  * Extract cost and selectivity from a bitmap tree node (index/and/or)
1113  */
1114 void
1115 cost_bitmap_tree_node(Path *path, Cost *cost, Selectivity *selec)
1116 {
1117  if (IsA(path, IndexPath))
1118  {
1119  *cost = ((IndexPath *) path)->indextotalcost;
1120  *selec = ((IndexPath *) path)->indexselectivity;
1121 
1122  /*
1123  * Charge a small amount per retrieved tuple to reflect the costs of
1124  * manipulating the bitmap. This is mostly to make sure that a bitmap
1125  * scan doesn't look to be the same cost as an indexscan to retrieve a
1126  * single tuple.
1127  */
1128  *cost += 0.1 * cpu_operator_cost * path->rows;
1129  }
1130  else if (IsA(path, BitmapAndPath))
1131  {
1132  *cost = path->total_cost;
1133  *selec = ((BitmapAndPath *) path)->bitmapselectivity;
1134  }
1135  else if (IsA(path, BitmapOrPath))
1136  {
1137  *cost = path->total_cost;
1138  *selec = ((BitmapOrPath *) path)->bitmapselectivity;
1139  }
1140  else
1141  {
1142  elog(ERROR, "unrecognized node type: %d", nodeTag(path));
1143  *cost = *selec = 0; /* keep compiler quiet */
1144  }
1145 }
1146 
1147 /*
1148  * cost_bitmap_and_node
1149  * Estimate the cost of a BitmapAnd node
1150  *
1151  * Note that this considers only the costs of index scanning and bitmap
1152  * creation, not the eventual heap access. In that sense the object isn't
1153  * truly a Path, but it has enough path-like properties (costs in particular)
1154  * to warrant treating it as one. We don't bother to set the path rows field,
1155  * however.
1156  */
1157 void
1158 cost_bitmap_and_node(BitmapAndPath *path, PlannerInfo *root)
1159 {
1160  Cost totalCost;
1161  Selectivity selec;
1162  ListCell *l;
1163 
1164  /*
1165  * We estimate AND selectivity on the assumption that the inputs are
1166  * independent. This is probably often wrong, but we don't have the info
1167  * to do better.
1168  *
1169  * The runtime cost of the BitmapAnd itself is estimated at 100x
1170  * cpu_operator_cost for each tbm_intersect needed. Probably too small,
1171  * definitely too simplistic?
1172  */
1173  totalCost = 0.0;
1174  selec = 1.0;
1175  foreach(l, path->bitmapquals)
1176  {
1177  Path *subpath = (Path *) lfirst(l);
1178  Cost subCost;
1179  Selectivity subselec;
1180 
1181  cost_bitmap_tree_node(subpath, &subCost, &subselec);
1182 
1183  selec *= subselec;
1184 
1185  totalCost += subCost;
1186  if (l != list_head(path->bitmapquals))
1187  totalCost += 100.0 * cpu_operator_cost;
1188  }
1189  path->bitmapselectivity = selec;
1190  path->path.rows = 0; /* per above, not used */
1191  path->path.startup_cost = totalCost;
1192  path->path.total_cost = totalCost;
1193 }
1194 
1195 /*
1196  * cost_bitmap_or_node
1197  * Estimate the cost of a BitmapOr node
1198  *
1199  * See comments for cost_bitmap_and_node.
1200  */
1201 void
1202 cost_bitmap_or_node(BitmapOrPath *path, PlannerInfo *root)
1203 {
1204  Cost totalCost;
1205  Selectivity selec;
1206  ListCell *l;
1207 
1208  /*
1209  * We estimate OR selectivity on the assumption that the inputs are
1210  * non-overlapping, since that's often the case in "x IN (list)" type
1211  * situations. Of course, we clamp to 1.0 at the end.
1212  *
1213  * The runtime cost of the BitmapOr itself is estimated at 100x
1214  * cpu_operator_cost for each tbm_union needed. Probably too small,
1215  * definitely too simplistic? We are aware that the tbm_unions are
1216  * optimized out when the inputs are BitmapIndexScans.
1217  */
1218  totalCost = 0.0;
1219  selec = 0.0;
1220  foreach(l, path->bitmapquals)
1221  {
1222  Path *subpath = (Path *) lfirst(l);
1223  Cost subCost;
1224  Selectivity subselec;
1225 
1226  cost_bitmap_tree_node(subpath, &subCost, &subselec);
1227 
1228  selec += subselec;
1229 
1230  totalCost += subCost;
1231  if (l != list_head(path->bitmapquals) &&
1232  !IsA(subpath, IndexPath))
1233  totalCost += 100.0 * cpu_operator_cost;
1234  }
1235  path->bitmapselectivity = Min(selec, 1.0);
1236  path->path.rows = 0; /* per above, not used */
1237  path->path.startup_cost = totalCost;
1238  path->path.total_cost = totalCost;
1239 }
1240 
1241 /*
1242  * cost_tidscan
1243  * Determines and returns the cost of scanning a relation using TIDs.
1244  *
1245  * 'baserel' is the relation to be scanned
1246  * 'tidquals' is the list of TID-checkable quals
1247  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1248  */
1249 void
1250 cost_tidscan(Path *path, PlannerInfo *root,
1251  RelOptInfo *baserel, List *tidquals, ParamPathInfo *param_info)
1252 {
1253  Cost startup_cost = 0;
1254  Cost run_cost = 0;
1255  bool isCurrentOf = false;
1256  QualCost qpqual_cost;
1257  Cost cpu_per_tuple;
1258  QualCost tid_qual_cost;
1259  double ntuples;
1260  ListCell *l;
1261  double spc_random_page_cost;
1262 
1263  /* Should only be applied to base relations */
1264  Assert(baserel->relid > 0);
1265  Assert(baserel->rtekind == RTE_RELATION);
1266 
1267  /* Mark the path with the correct row estimate */
1268  if (param_info)
1269  path->rows = param_info->ppi_rows;
1270  else
1271  path->rows = baserel->rows;
1272 
1273  /* Count how many tuples we expect to retrieve */
1274  ntuples = 0;
1275  foreach(l, tidquals)
1276  {
1277  RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
1278  Expr *qual = rinfo->clause;
1279 
1280  if (IsA(qual, ScalarArrayOpExpr))
1281  {
1282  /* Each element of the array yields 1 tuple */
1283  ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) qual;
1284  Node *arraynode = (Node *) lsecond(saop->args);
1285 
1286  ntuples += estimate_array_length(root, arraynode);
1287  }
1288  else if (IsA(qual, CurrentOfExpr))
1289  {
1290  /* CURRENT OF yields 1 tuple */
1291  isCurrentOf = true;
1292  ntuples++;
1293  }
1294  else
1295  {
1296  /* It's just CTID = something, count 1 tuple */
1297  ntuples++;
1298  }
1299  }
1300 
1301  /*
1302  * We must force TID scan for WHERE CURRENT OF, because only nodeTidscan.c
1303  * understands how to do it correctly. Therefore, honor enable_tidscan
1304  * only when CURRENT OF isn't present. Also note that cost_qual_eval
1305  * counts a CurrentOfExpr as having startup cost disable_cost, which we
1306  * subtract off here; that's to prevent other plan types such as seqscan
1307  * from winning.
1308  */
1309  if (isCurrentOf)
1310  {
1311  Assert(baserel->baserestrictcost.startup >= disable_cost);
1312  startup_cost -= disable_cost;
1313  }
1314  else if (!enable_tidscan)
1315  startup_cost += disable_cost;
1316 
1317  /*
1318  * The TID qual expressions will be computed once, any other baserestrict
1319  * quals once per retrieved tuple.
1320  */
1321  cost_qual_eval(&tid_qual_cost, tidquals, root);
1322 
1323  /* fetch estimated page cost for tablespace containing table */
1324  get_tablespace_page_costs(baserel->reltablespace,
1325  &spc_random_page_cost,
1326  NULL);
1327 
1328  /* disk costs --- assume each tuple on a different page */
1329  run_cost += spc_random_page_cost * ntuples;
1330 
1331  /* Add scanning CPU costs */
1332  get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1333 
1334  /* XXX currently we assume TID quals are a subset of qpquals */
1335  startup_cost += qpqual_cost.startup + tid_qual_cost.per_tuple;
1336  cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple -
1337  tid_qual_cost.per_tuple;
1338  run_cost += cpu_per_tuple * ntuples;
1339 
1340  /* tlist eval costs are paid per output row, not per tuple scanned */
1341  startup_cost += path->pathtarget->cost.startup;
1342  run_cost += path->pathtarget->cost.per_tuple * path->rows;
1343 
1344  path->startup_cost = startup_cost;
1345  path->total_cost = startup_cost + run_cost;
1346 }
1347 
1348 /*
1349  * cost_tidrangescan
1350  * Determines and sets the costs of scanning a relation using a range of
1351  * TIDs for 'path'
1352  *
1353  * 'baserel' is the relation to be scanned
1354  * 'tidrangequals' is the list of TID-checkable range quals
1355  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1356  */
1357 void
1358 cost_tidrangescan(Path *path, PlannerInfo *root,
1359  RelOptInfo *baserel, List *tidrangequals,
1360  ParamPathInfo *param_info)
1361 {
1362  Selectivity selectivity;
1363  double pages;
1364  Cost startup_cost = 0;
1365  Cost run_cost = 0;
1366  QualCost qpqual_cost;
1367  Cost cpu_per_tuple;
1368  QualCost tid_qual_cost;
1369  double ntuples;
1370  double nseqpages;
1371  double spc_random_page_cost;
1372  double spc_seq_page_cost;
1373 
1374  /* Should only be applied to base relations */
1375  Assert(baserel->relid > 0);
1376  Assert(baserel->rtekind == RTE_RELATION);
1377 
1378  /* Mark the path with the correct row estimate */
1379  if (param_info)
1380  path->rows = param_info->ppi_rows;
1381  else
1382  path->rows = baserel->rows;
1383 
1384  /* Count how many tuples and pages we expect to scan */
1385  selectivity = clauselist_selectivity(root, tidrangequals, baserel->relid,
1386  JOIN_INNER, NULL);
1387  pages = ceil(selectivity * baserel->pages);
1388 
1389  if (pages <= 0.0)
1390  pages = 1.0;
1391 
1392  /*
1393  * The first page in a range requires a random seek, but each subsequent
1394  * page is just a normal sequential page read. NOTE: it's desirable for
1395  * TID Range Scans to cost more than the equivalent Sequential Scans,
1396  * because Seq Scans have some performance advantages such as scan
1397  * synchronization and parallelizability, and we'd prefer one of them to
1398  * be picked unless a TID Range Scan really is better.
1399  */
1400  ntuples = selectivity * baserel->tuples;
1401  nseqpages = pages - 1.0;
1402 
1403  if (!enable_tidscan)
1404  startup_cost += disable_cost;
1405 
1406  /*
1407  * The TID qual expressions will be computed once, any other baserestrict
1408  * quals once per retrieved tuple.
1409  */
1410  cost_qual_eval(&tid_qual_cost, tidrangequals, root);
1411 
1412  /* fetch estimated page cost for tablespace containing table */
1413  get_tablespace_page_costs(baserel->reltablespace,
1414  &spc_random_page_cost,
1415  &spc_seq_page_cost);
1416 
1417  /* disk costs; 1 random page and the remainder as seq pages */
1418  run_cost += spc_random_page_cost + spc_seq_page_cost * nseqpages;
1419 
1420  /* Add scanning CPU costs */
1421  get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1422 
1423  /*
1424  * XXX currently we assume TID quals are a subset of qpquals at this
1425  * point; they will be removed (if possible) when we create the plan, so
1426  * we subtract their cost from the total qpqual cost. (If the TID quals
1427  * can't be removed, this is a mistake and we're going to underestimate
1428  * the CPU cost a bit.)
1429  */
1430  startup_cost += qpqual_cost.startup + tid_qual_cost.per_tuple;
1431  cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple -
1432  tid_qual_cost.per_tuple;
1433  run_cost += cpu_per_tuple * ntuples;
1434 
1435  /* tlist eval costs are paid per output row, not per tuple scanned */
1436  startup_cost += path->pathtarget->cost.startup;
1437  run_cost += path->pathtarget->cost.per_tuple * path->rows;
1438 
1439  path->startup_cost = startup_cost;
1440  path->total_cost = startup_cost + run_cost;
1441 }
1442 
1443 /*
1444  * cost_subqueryscan
1445  * Determines and returns the cost of scanning a subquery RTE.
1446  *
1447  * 'baserel' is the relation to be scanned
1448  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1449  * 'trivial_pathtarget' is true if the pathtarget is believed to be trivial.
1450  */
1451 void
1452 cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root,
1453  RelOptInfo *baserel, ParamPathInfo *param_info,
1454  bool trivial_pathtarget)
1455 {
1456  Cost startup_cost;
1457  Cost run_cost;
1458  List *qpquals;
1459  QualCost qpqual_cost;
1460  Cost cpu_per_tuple;
1461 
1462  /* Should only be applied to base relations that are subqueries */
1463  Assert(baserel->relid > 0);
1464  Assert(baserel->rtekind == RTE_SUBQUERY);
1465 
1466  /*
1467  * We compute the rowcount estimate as the subplan's estimate times the
1468  * selectivity of relevant restriction clauses. In simple cases this will
1469  * come out the same as baserel->rows; but when dealing with parallelized
1470  * paths we must do it like this to get the right answer.
1471  */
1472  if (param_info)
1473  qpquals = list_concat_copy(param_info->ppi_clauses,
1474  baserel->baserestrictinfo);
1475  else
1476  qpquals = baserel->baserestrictinfo;
1477 
1478  path->path.rows = clamp_row_est(path->subpath->rows *
1479  clauselist_selectivity(root,
1480  qpquals,
1481  0,
1482  JOIN_INNER,
1483  NULL));
1484 
1485  /*
1486  * Cost of path is cost of evaluating the subplan, plus cost of evaluating
1487  * any restriction clauses and tlist that will be attached to the
1488  * SubqueryScan node, plus cpu_tuple_cost to account for selection and
1489  * projection overhead.
1490  */
1491  path->path.startup_cost = path->subpath->startup_cost;
1492  path->path.total_cost = path->subpath->total_cost;
1493 
1494  /*
1495  * However, if there are no relevant restriction clauses and the
1496  * pathtarget is trivial, then we expect that setrefs.c will optimize away
1497  * the SubqueryScan plan node altogether, so we should just make its cost
1498  * and rowcount equal to the input path's.
1499  *
1500  * Note: there are some edge cases where createplan.c will apply a
1501  * different targetlist to the SubqueryScan node, thus falsifying our
1502  * current estimate of whether the target is trivial, and making the cost
1503  * estimate (though not the rowcount) wrong. It does not seem worth the
1504  * extra complication to try to account for that exactly, especially since
1505  * that behavior falsifies other cost estimates as well.
1506  */
1507  if (qpquals == NIL && trivial_pathtarget)
1508  return;
1509 
1510  get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1511 
1512  startup_cost = qpqual_cost.startup;
1513  cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1514  run_cost = cpu_per_tuple * path->subpath->rows;
1515 
1516  /* tlist eval costs are paid per output row, not per tuple scanned */
1517  startup_cost += path->path.pathtarget->cost.startup;
1518  run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;
1519 
1520  path->path.startup_cost += startup_cost;
1521  path->path.total_cost += startup_cost + run_cost;
1522 }
1523 
1524 /*
1525  * cost_functionscan
1526  * Determines and returns the cost of scanning a function RTE.
1527  *
1528  * 'baserel' is the relation to be scanned
1529  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1530  */
1531 void
1532 cost_functionscan(Path *path, PlannerInfo *root,
1533  RelOptInfo *baserel, ParamPathInfo *param_info)
1534 {
1535  Cost startup_cost = 0;
1536  Cost run_cost = 0;
1537  QualCost qpqual_cost;
1538  Cost cpu_per_tuple;
1539  RangeTblEntry *rte;
1540  QualCost exprcost;
1541 
1542  /* Should only be applied to base relations that are functions */
1543  Assert(baserel->relid > 0);
1544  rte = planner_rt_fetch(baserel->relid, root);
1545  Assert(rte->rtekind == RTE_FUNCTION);
1546 
1547  /* Mark the path with the correct row estimate */
1548  if (param_info)
1549  path->rows = param_info->ppi_rows;
1550  else
1551  path->rows = baserel->rows;
1552 
1553  /*
1554  * Estimate costs of executing the function expression(s).
1555  *
1556  * Currently, nodeFunctionscan.c always executes the functions to
1557  * completion before returning any rows, and caches the results in a
1558  * tuplestore. So the function eval cost is all startup cost, and per-row
1559  * costs are minimal.
1560  *
1561  * XXX in principle we ought to charge tuplestore spill costs if the
1562  * number of rows is large. However, given how phony our rowcount
1563  * estimates for functions tend to be, there's not a lot of point in that
1564  * refinement right now.
1565  */
1566  cost_qual_eval_node(&exprcost, (Node *) rte->functions, root);
1567 
1568  startup_cost += exprcost.startup + exprcost.per_tuple;
1569 
1570  /* Add scanning CPU costs */
1571  get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1572 
1573  startup_cost += qpqual_cost.startup;
1574  cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1575  run_cost += cpu_per_tuple * baserel->tuples;
1576 
1577  /* tlist eval costs are paid per output row, not per tuple scanned */
1578  startup_cost += path->pathtarget->cost.startup;
1579  run_cost += path->pathtarget->cost.per_tuple * path->rows;
1580 
1581  path->startup_cost = startup_cost;
1582  path->total_cost = startup_cost + run_cost;
1583 }
1584 
1585 /*
1586  * cost_tablefuncscan
1587  * Determines and returns the cost of scanning a table function.
1588  *
1589  * 'baserel' is the relation to be scanned
1590  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1591  */
1592 void
1593 cost_tablefuncscan(Path *path, PlannerInfo *root,
1594  RelOptInfo *baserel, ParamPathInfo *param_info)
1595 {
1596  Cost startup_cost = 0;
1597  Cost run_cost = 0;
1598  QualCost qpqual_cost;
1599  Cost cpu_per_tuple;
1600  RangeTblEntry *rte;
1601  QualCost exprcost;
1602 
1603  /* Should only be applied to base relations that are functions */
1604  Assert(baserel->relid > 0);
1605  rte = planner_rt_fetch(baserel->relid, root);
1606  Assert(rte->rtekind == RTE_TABLEFUNC);
1607 
1608  /* Mark the path with the correct row estimate */
1609  if (param_info)
1610  path->rows = param_info->ppi_rows;
1611  else
1612  path->rows = baserel->rows;
1613 
1614  /*
1615  * Estimate costs of executing the table func expression(s).
1616  *
1617  * XXX in principle we ought to charge tuplestore spill costs if the
1618  * number of rows is large. However, given how phony our rowcount
1619  * estimates for tablefuncs tend to be, there's not a lot of point in that
1620  * refinement right now.
1621  */
1622  cost_qual_eval_node(&exprcost, (Node *) rte->tablefunc, root);
1623 
1624  startup_cost += exprcost.startup + exprcost.per_tuple;
1625 
1626  /* Add scanning CPU costs */
1627  get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1628 
1629  startup_cost += qpqual_cost.startup;
1630  cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1631  run_cost += cpu_per_tuple * baserel->tuples;
1632 
1633  /* tlist eval costs are paid per output row, not per tuple scanned */
1634  startup_cost += path->pathtarget->cost.startup;
1635  run_cost += path->pathtarget->cost.per_tuple * path->rows;
1636 
1637  path->startup_cost = startup_cost;
1638  path->total_cost = startup_cost + run_cost;
1639 }
1640 
1641 /*
1642  * cost_valuesscan
1643  * Determines and returns the cost of scanning a VALUES RTE.
1644  *
1645  * 'baserel' is the relation to be scanned
1646  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1647  */
1648 void
1649 cost_valuesscan(Path *path, PlannerInfo *root,
1650  RelOptInfo *baserel, ParamPathInfo *param_info)
1651 {
1652  Cost startup_cost = 0;
1653  Cost run_cost = 0;
1654  QualCost qpqual_cost;
1655  Cost cpu_per_tuple;
1656 
1657  /* Should only be applied to base relations that are values lists */
1658  Assert(baserel->relid > 0);
1659  Assert(baserel->rtekind == RTE_VALUES);
1660 
1661  /* Mark the path with the correct row estimate */
1662  if (param_info)
1663  path->rows = param_info->ppi_rows;
1664  else
1665  path->rows = baserel->rows;
1666 
1667  /*
1668  * For now, estimate list evaluation cost at one operator eval per list
1669  * (probably pretty bogus, but is it worth being smarter?)
1670  */
1671  cpu_per_tuple = cpu_operator_cost;
1672 
1673  /* Add scanning CPU costs */
1674  get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1675 
1676  startup_cost += qpqual_cost.startup;
1677  cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1678  run_cost += cpu_per_tuple * baserel->tuples;
1679 
1680  /* tlist eval costs are paid per output row, not per tuple scanned */
1681  startup_cost += path->pathtarget->cost.startup;
1682  run_cost += path->pathtarget->cost.per_tuple * path->rows;
1683 
1684  path->startup_cost = startup_cost;
1685  path->total_cost = startup_cost + run_cost;
1686 }
1687 
1688 /*
1689  * cost_ctescan
1690  * Determines and returns the cost of scanning a CTE RTE.
1691  *
1692  * Note: this is used for both self-reference and regular CTEs; the
1693  * possible cost differences are below the threshold of what we could
1694  * estimate accurately anyway. Note that the costs of evaluating the
1695  * referenced CTE query are added into the final plan as initplan costs,
1696  * and should NOT be counted here.
1697  */
1698 void
1699 cost_ctescan(Path *path, PlannerInfo *root,
1700  RelOptInfo *baserel, ParamPathInfo *param_info)
1701 {
1702  Cost startup_cost = 0;
1703  Cost run_cost = 0;
1704  QualCost qpqual_cost;
1705  Cost cpu_per_tuple;
1706 
1707  /* Should only be applied to base relations that are CTEs */
1708  Assert(baserel->relid > 0);
1709  Assert(baserel->rtekind == RTE_CTE);
1710 
1711  /* Mark the path with the correct row estimate */
1712  if (param_info)
1713  path->rows = param_info->ppi_rows;
1714  else
1715  path->rows = baserel->rows;
1716 
1717  /* Charge one CPU tuple cost per row for tuplestore manipulation */
1718  cpu_per_tuple = cpu_tuple_cost;
1719 
1720  /* Add scanning CPU costs */
1721  get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1722 
1723  startup_cost += qpqual_cost.startup;
1724  cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1725  run_cost += cpu_per_tuple * baserel->tuples;
1726 
1727  /* tlist eval costs are paid per output row, not per tuple scanned */
1728  startup_cost += path->pathtarget->cost.startup;
1729  run_cost += path->pathtarget->cost.per_tuple * path->rows;
1730 
1731  path->startup_cost = startup_cost;
1732  path->total_cost = startup_cost + run_cost;
1733 }
1734 
1735 /*
1736  * cost_namedtuplestorescan
1737  * Determines and returns the cost of scanning a named tuplestore.
1738  */
1739 void
1740 cost_namedtuplestorescan(Path *path, PlannerInfo *root,
1741  RelOptInfo *baserel, ParamPathInfo *param_info)
1742 {
1743  Cost startup_cost = 0;
1744  Cost run_cost = 0;
1745  QualCost qpqual_cost;
1746  Cost cpu_per_tuple;
1747 
1748  /* Should only be applied to base relations that are Tuplestores */
1749  Assert(baserel->relid > 0);
1750  Assert(baserel->rtekind == RTE_NAMEDTUPLESTORE);
1751 
1752  /* Mark the path with the correct row estimate */
1753  if (param_info)
1754  path->rows = param_info->ppi_rows;
1755  else
1756  path->rows = baserel->rows;
1757 
1758  /* Charge one CPU tuple cost per row for tuplestore manipulation */
1759  cpu_per_tuple = cpu_tuple_cost;
1760 
1761  /* Add scanning CPU costs */
1762  get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1763 
1764  startup_cost += qpqual_cost.startup;
1765  cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1766  run_cost += cpu_per_tuple * baserel->tuples;
1767 
1768  path->startup_cost = startup_cost;
1769  path->total_cost = startup_cost + run_cost;
1770 }
1771 
1772 /*
1773  * cost_resultscan
1774  * Determines and returns the cost of scanning an RTE_RESULT relation.
1775  */
1776 void
1777 cost_resultscan(Path *path, PlannerInfo *root,
1778  RelOptInfo *baserel, ParamPathInfo *param_info)
1779 {
1780  Cost startup_cost = 0;
1781  Cost run_cost = 0;
1782  QualCost qpqual_cost;
1783  Cost cpu_per_tuple;
1784 
1785  /* Should only be applied to RTE_RESULT base relations */
1786  Assert(baserel->relid > 0);
1787  Assert(baserel->rtekind == RTE_RESULT);
1788 
1789  /* Mark the path with the correct row estimate */
1790  if (param_info)
1791  path->rows = param_info->ppi_rows;
1792  else
1793  path->rows = baserel->rows;
1794 
1795  /* We charge qual cost plus cpu_tuple_cost */
1796  get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1797 
1798  startup_cost += qpqual_cost.startup;
1799  cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1800  run_cost += cpu_per_tuple * baserel->tuples;
1801 
1802  path->startup_cost = startup_cost;
1803  path->total_cost = startup_cost + run_cost;
1804 }
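/*
 * Rough worked illustration of the common scan-costing pattern above,
 * assuming the default cost parameters (cpu_tuple_cost = 0.01,
 * cpu_operator_cost = 0.0025) and a hypothetical relation of 10000 tuples
 * whose restriction quals cost 0.0025 per tuple and nothing at startup:
 *
 *     cpu_per_tuple = 0.01 + 0.0025   = 0.0125
 *     run_cost      = 0.0125 * 10000  = 125.0
 *
 * tlist evaluation costs (where charged) are then added per output row
 * rather than per tuple scanned.
 */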
1805 
1806 /*
1807  * cost_recursive_union
1808  * Determines and returns the cost of performing a recursive union,
1809  * and also the estimated output size.
1810  *
1811  * We are given Paths for the nonrecursive and recursive terms.
1812  */
1813 void
1814 cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
1815 {
1816  Cost startup_cost;
1817  Cost total_cost;
1818  double total_rows;
1819 
1820  /* We probably have decent estimates for the non-recursive term */
1821  startup_cost = nrterm->startup_cost;
1822  total_cost = nrterm->total_cost;
1823  total_rows = nrterm->rows;
1824 
1825  /*
1826  * We arbitrarily assume that about 10 recursive iterations will be
1827  * needed, and that we've managed to get a good fix on the cost and output
1828  * size of each one of them. These are mighty shaky assumptions but it's
1829  * hard to see how to do better.
1830  */
1831  total_cost += 10 * rterm->total_cost;
1832  total_rows += 10 * rterm->rows;
1833 
1834  /*
1835  * Also charge cpu_tuple_cost per row to account for the costs of
1836  * manipulating the tuplestores. (We don't worry about possible
1837  * spill-to-disk costs.)
1838  */
1839  total_cost += cpu_tuple_cost * total_rows;
1840 
1841  runion->startup_cost = startup_cost;
1842  runion->total_cost = total_cost;
1843  runion->rows = total_rows;
1844  runion->pathtarget->width = Max(nrterm->pathtarget->width,
1845  rterm->pathtarget->width);
1846 }
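/*
 * Rough worked illustration with hypothetical inputs: if the nonrecursive
 * term costs 100 and yields 1000 rows, and the recursive term costs 50 and
 * yields 200 rows per iteration, the 10-iteration assumption gives
 *
 *     total_cost = 100 + 10 * 50           = 600
 *     total_rows = 1000 + 10 * 200         = 3000
 *     total_cost += cpu_tuple_cost * 3000  = 600 + 30 = 630
 *
 * (using the default cpu_tuple_cost = 0.01).
 */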
1847 
1848 /*
1849  * cost_tuplesort
1850  * Determines and returns the cost of sorting a relation using tuplesort,
1851  * not including the cost of reading the input data.
1852  *
1853  * If the total volume of data to sort is less than sort_mem, we will do
1854  * an in-memory sort, which requires no I/O and about t*log2(t) tuple
1855  * comparisons for t tuples.
1856  *
1857  * If the total volume exceeds sort_mem, we switch to a tape-style merge
1858  * algorithm. There will still be about t*log2(t) tuple comparisons in
1859  * total, but we will also need to write and read each tuple once per
1860  * merge pass. We expect about ceil(logM(r)) merge passes where r is the
1861  * number of initial runs formed and M is the merge order used by tuplesort.c.
1862  * Since the average initial run should be about sort_mem, we have
1863  * disk traffic = 2 * relsize * ceil(logM(p / sort_mem))
1864  * cpu = comparison_cost * t * log2(t)
1865  *
1866  * If the sort is bounded (i.e., only the first k result tuples are needed)
1867  * and k tuples can fit into sort_mem, we use a heap method that keeps only
1868  * k tuples in the heap; this will require about t*log2(k) tuple comparisons.
1869  *
1870  * The disk traffic is assumed to be 3/4ths sequential and 1/4th random
1871  * accesses (XXX can't we refine that guess?)
1872  *
1873  * By default, we charge two operator evals per tuple comparison, which should
1874  * be in the right ballpark in most cases. The caller can tweak this by
1875  * specifying nonzero comparison_cost; typically that's used for any extra
1876  * work that has to be done to prepare the inputs to the comparison operators.
1877  *
1878  * 'tuples' is the number of tuples in the relation
1879  * 'width' is the average tuple width in bytes
1880  * 'comparison_cost' is the extra cost per comparison, if any
1881  * 'sort_mem' is the number of kilobytes of work memory allowed for the sort
1882  * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
1883  */
1884 static void
1885 cost_tuplesort(Cost *startup_cost, Cost *run_cost,
1886  double tuples, int width,
1887  Cost comparison_cost, int sort_mem,
1888  double limit_tuples)
1889 {
1890  double input_bytes = relation_byte_size(tuples, width);
1891  double output_bytes;
1892  double output_tuples;
1893  long sort_mem_bytes = sort_mem * 1024L;
1894 
1895  /*
1896  * We want to be sure the cost of a sort is never estimated as zero, even
1897  * if passed-in tuple count is zero. Besides, mustn't do log(0)...
1898  */
1899  if (tuples < 2.0)
1900  tuples = 2.0;
1901 
1902  /* Include the default cost-per-comparison */
1903  comparison_cost += 2.0 * cpu_operator_cost;
1904 
1905  /* Do we have a useful LIMIT? */
1906  if (limit_tuples > 0 && limit_tuples < tuples)
1907  {
1908  output_tuples = limit_tuples;
1909  output_bytes = relation_byte_size(output_tuples, width);
1910  }
1911  else
1912  {
1913  output_tuples = tuples;
1914  output_bytes = input_bytes;
1915  }
1916 
1917  if (output_bytes > sort_mem_bytes)
1918  {
1919  /*
1920  * We'll have to use a disk-based sort of all the tuples
1921  */
1922  double npages = ceil(input_bytes / BLCKSZ);
1923  double nruns = input_bytes / sort_mem_bytes;
1924  double mergeorder = tuplesort_merge_order(sort_mem_bytes);
1925  double log_runs;
1926  double npageaccesses;
1927 
1928  /*
1929  * CPU costs
1930  *
1931  * Assume about N log2 N comparisons
1932  */
1933  *startup_cost = comparison_cost * tuples * LOG2(tuples);
1934 
1935  /* Disk costs */
1936 
1937  /* Compute logM(r) as log(r) / log(M) */
1938  if (nruns > mergeorder)
1939  log_runs = ceil(log(nruns) / log(mergeorder));
1940  else
1941  log_runs = 1.0;
1942  npageaccesses = 2.0 * npages * log_runs;
1943  /* Assume 3/4ths of accesses are sequential, 1/4th are not */
1944  *startup_cost += npageaccesses *
1945  (seq_page_cost * 0.75 + random_page_cost * 0.25);
1946  }
1947  else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes)
1948  {
1949  /*
1950  * We'll use a bounded heap-sort keeping just K tuples in memory, for
1951  * a total number of tuple comparisons of N log2 K; but the constant
1952  * factor is a bit higher than for quicksort. Tweak it so that the
1953  * cost curve is continuous at the crossover point.
1954  */
1955  *startup_cost = comparison_cost * tuples * LOG2(2.0 * output_tuples);
1956  }
1957  else
1958  {
1959  /* We'll use plain quicksort on all the input tuples */
1960  *startup_cost = comparison_cost * tuples * LOG2(tuples);
1961  }
1962 
1963  /*
1964  * Also charge a small amount (arbitrarily set equal to operator cost) per
1965  * extracted tuple. We don't charge cpu_tuple_cost because a Sort node
1966  * doesn't do qual-checking or projection, so it has less overhead than
1967  * most plan nodes. Note it's correct to use tuples not output_tuples
1968  * here --- the upper LIMIT will pro-rate the run cost so we'd be double
1969  * counting the LIMIT otherwise.
1970  */
1971  *run_cost = cpu_operator_cost * tuples;
1972 }
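/*
 * Rough worked illustration of the CPU formulas above, assuming the default
 * cpu_operator_cost = 0.0025 (so comparison_cost = 0.005), no extra
 * per-comparison cost, and a hypothetical input of 1,000,000 tuples:
 *
 *     full sort:           0.005 * 1e6 * log2(1e6) ~ 0.005 * 1e6 * 19.93 ~ 99,660
 *     bounded sort, k=100: 0.005 * 1e6 * log2(200) ~ 0.005 * 1e6 * 7.64  ~ 38,200
 *
 * plus run_cost = cpu_operator_cost * 1e6 = 2500 in either case; the disk
 * terms are added only when output_bytes exceeds sort_mem.
 */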
1973 
1974 /*
1975  * cost_incremental_sort
1976  * Determines and returns the cost of sorting a relation incrementally, when
1977  * the input path is presorted by a prefix of the pathkeys.
1978  *
1979  * 'presorted_keys' is the number of leading pathkeys by which the input path
1980  * is sorted.
1981  *
1982  * We estimate the number of groups into which the relation is divided by the
1983  * leading pathkeys, and then calculate the cost of sorting a single group
1984  * with tuplesort using cost_tuplesort().
1985  */
1986 void
1987 cost_incremental_sort(Path *path,
1988  PlannerInfo *root, List *pathkeys, int presorted_keys,
1989  Cost input_startup_cost, Cost input_total_cost,
1990  double input_tuples, int width, Cost comparison_cost, int sort_mem,
1991  double limit_tuples)
1992 {
1993  Cost startup_cost,
1994  run_cost,
1995  input_run_cost = input_total_cost - input_startup_cost;
1996  double group_tuples,
1997  input_groups;
1998  Cost group_startup_cost,
1999  group_run_cost,
2000  group_input_run_cost;
2001  List *presortedExprs = NIL;
2002  ListCell *l;
2003  bool unknown_varno = false;
2004 
2005  Assert(presorted_keys > 0 && presorted_keys < list_length(pathkeys));
2006 
2007  /*
2008  * We want to be sure the cost of a sort is never estimated as zero, even
2009  * if passed-in tuple count is zero. Besides, mustn't do log(0)...
2010  */
2011  if (input_tuples < 2.0)
2012  input_tuples = 2.0;
2013 
2014  /* Default estimate of number of groups, capped to one group per row. */
2015  input_groups = Min(input_tuples, DEFAULT_NUM_DISTINCT);
2016 
2017  /*
2018  * Extract presorted keys as list of expressions.
2019  *
2020  * We need to be careful about Vars containing "varno 0" which might have
2021  * been introduced by generate_append_tlist, which would confuse
2022  * estimate_num_groups (in fact it'd fail for such expressions). See
2023  * recurse_set_operations which has to deal with the same issue.
2024  *
2025  * Unlike recurse_set_operations we can't access the original target list
2026  * here, and even if we could it's not very clear how useful would that be
2027  * for a set operation combining multiple tables. So we simply detect if
2028  * there are any expressions with "varno 0" and use the default
2029  * DEFAULT_NUM_DISTINCT in that case.
2030  *
2031  * We might also use either 1.0 (a single group) or input_tuples (each row
2032  * being a separate group), pretty much the worst and best case for
2033  * incremental sort. But those are extreme cases and using something in
2034  * between seems reasonable. Furthermore, generate_append_tlist is used
2035  * for set operations, which are likely to produce mostly unique output
2036  * anyway - from that standpoint the DEFAULT_NUM_DISTINCT is defensive
2037  * while maintaining lower startup cost.
2038  */
2039  foreach(l, pathkeys)
2040  {
2041  PathKey *key = (PathKey *) lfirst(l);
2042  EquivalenceMember *member = (EquivalenceMember *)
2043  linitial(key->pk_eclass->ec_members);
2044 
2045  /*
2046  * Check if the expression contains Var with "varno 0" so that we
2047  * don't call estimate_num_groups in that case.
2048  */
2049  if (bms_is_member(0, pull_varnos(root, (Node *) member->em_expr)))
2050  {
2051  unknown_varno = true;
2052  break;
2053  }
2054 
2055  /* expression not containing any Vars with "varno 0" */
2056  presortedExprs = lappend(presortedExprs, member->em_expr);
2057 
2058  if (foreach_current_index(l) + 1 >= presorted_keys)
2059  break;
2060  }
2061 
2062  /* Estimate the number of groups with equal presorted keys. */
2063  if (!unknown_varno)
2064  input_groups = estimate_num_groups(root, presortedExprs, input_tuples,
2065  NULL, NULL);
2066 
2067  group_tuples = input_tuples / input_groups;
2068  group_input_run_cost = input_run_cost / input_groups;
2069 
2070  /*
2071  * Estimate the average cost of sorting of one group where presorted keys
2072  * are equal.
2073  */
2074  cost_tuplesort(&group_startup_cost, &group_run_cost,
2075  group_tuples, width, comparison_cost, sort_mem,
2076  limit_tuples);
2077 
2078  /*
2079  * Startup cost of incremental sort is the startup cost of its first group
2080  * plus the cost of its input.
2081  */
2082  startup_cost = group_startup_cost + input_startup_cost +
2083  group_input_run_cost;
2084 
2085  /*
2086  * After we started producing tuples from the first group, the cost of
2087  * producing all the tuples is given by the cost to finish processing this
2088  * group, plus the total cost to process the remaining groups, plus the
2089  * remaining cost of input.
2090  */
2091  run_cost = group_run_cost + (group_run_cost + group_startup_cost) *
2092  (input_groups - 1) + group_input_run_cost * (input_groups - 1);
2093 
2094  /*
2095  * Incremental sort adds some overhead by itself. Firstly, it has to
2096  * detect the sort groups. This is roughly equal to one extra copy and
2097  * comparison per tuple.
2098  */
2099  run_cost += (cpu_tuple_cost + comparison_cost) * input_tuples;
2100 
2101  /*
2102  * Additionally, we charge double cpu_tuple_cost for each input group to
2103  * account for the tuplesort_reset that's performed after each group.
2104  */
2105  run_cost += 2.0 * cpu_tuple_cost * input_groups;
2106 
2107  path->rows = input_tuples;
2108  path->startup_cost = startup_cost;
2109  path->total_cost = startup_cost + run_cost;
2110 }
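/*
 * The resulting cost decomposition can be summarized as follows (a
 * restatement of the code above, with ngroups = input_groups and
 * ntuples = input_tuples):
 *
 *     startup = sort_startup(first group) + input_startup + input_run / ngroups
 *     run     = sort_run(first group)
 *               + (ngroups - 1) * (sort_startup + sort_run + input_run / ngroups)
 *               + (cpu_tuple_cost + comparison_cost) * ntuples
 *               + 2 * cpu_tuple_cost * ngroups
 */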
2111 
2112 /*
2113  * cost_sort
2114  * Determines and returns the cost of sorting a relation, including
2115  * the cost of reading the input data.
2116  *
2117  * NOTE: some callers currently pass NIL for pathkeys because they
2118  * can't conveniently supply the sort keys. Since this routine doesn't
2119  * currently do anything with pathkeys anyway, that doesn't matter...
2120  * but if it ever does, it should react gracefully to lack of key data.
2121  * (Actually, the thing we'd most likely be interested in is just the number
2122  * of sort keys, which all callers *could* supply.)
2123  */
2124 void
2125 cost_sort(Path *path, PlannerInfo *root,
2126  List *pathkeys, Cost input_cost, double tuples, int width,
2127  Cost comparison_cost, int sort_mem,
2128  double limit_tuples)
2129 
2130 {
2131  Cost startup_cost;
2132  Cost run_cost;
2133 
2134  cost_tuplesort(&startup_cost, &run_cost,
2135  tuples, width,
2136  comparison_cost, sort_mem,
2137  limit_tuples);
2138 
2139  if (!enable_sort)
2140  startup_cost += disable_cost;
2141 
2142  startup_cost += input_cost;
2143 
2144  path->rows = tuples;
2145  path->startup_cost = startup_cost;
2146  path->total_cost = startup_cost + run_cost;
2147 }
2148 
2149 /*
2150  * append_nonpartial_cost
2151  * Estimate the cost of the non-partial paths in a Parallel Append.
2152  * The non-partial paths are assumed to be the first "numpaths" paths
2153  * from the subpaths list, and to be in order of decreasing cost.
2154  */
2155 static Cost
2156 append_nonpartial_cost(List *subpaths, int numpaths, int parallel_workers)
2157 {
2158  Cost *costarr;
2159  int arrlen;
2160  ListCell *l;
2161  ListCell *cell;
2162  int path_index;
2163  int min_index;
2164  int max_index;
2165 
2166  if (numpaths == 0)
2167  return 0;
2168 
2169  /*
2170  * Array length is number of workers or number of relevant paths,
2171  * whichever is less.
2172  */
2173  arrlen = Min(parallel_workers, numpaths);
2174  costarr = (Cost *) palloc(sizeof(Cost) * arrlen);
2175 
2176  /* The first few paths will each be claimed by a different worker. */
2177  path_index = 0;
2178  foreach(cell, subpaths)
2179  {
2180  Path *subpath = (Path *) lfirst(cell);
2181 
2182  if (path_index == arrlen)
2183  break;
2184  costarr[path_index++] = subpath->total_cost;
2185  }
2186 
2187  /*
2188  * Since subpaths are sorted by decreasing cost, the last one will have
2189  * the minimum cost.
2190  */
2191  min_index = arrlen - 1;
2192 
2193  /*
2194  * For each of the remaining subpaths, add its cost to the array element
2195  * with minimum cost.
2196  */
2197  for_each_cell(l, subpaths, cell)
2198  {
2199  Path *subpath = (Path *) lfirst(l);
2200 
2201  /* Consider only the non-partial paths */
2202  if (path_index++ == numpaths)
2203  break;
2204 
2205  costarr[min_index] += subpath->total_cost;
2206 
2207  /* Update the new min cost array index */
2208  min_index = 0;
2209  for (int i = 0; i < arrlen; i++)
2210  {
2211  if (costarr[i] < costarr[min_index])
2212  min_index = i;
2213  }
2214  }
2215 
2216  /* Return the highest cost from the array */
2217  max_index = 0;
2218  for (int i = 0; i < arrlen; i++)
2219  {
2220  if (costarr[i] > costarr[max_index])
2221  max_index = i;
2222  }
2223 
2224  return costarr[max_index];
2225 }
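/*
 * Rough worked trace with hypothetical inputs: non-partial subpath costs
 * {30, 20, 10, 10, 5} (already sorted by decreasing cost) and 2 workers.
 * The first two paths seed the per-worker array as {30, 20}; each remaining
 * cost is then folded into whichever element is currently smallest:
 *
 *     add 10 -> {30, 30},  add 10 -> {40, 30},  add 5 -> {40, 35}
 *
 * and the returned estimate is the maximum element, 40.
 */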
2226 
2227 /*
2228  * cost_append
2229  * Determines and returns the cost of an Append node.
2230  */
2231 void
2232 cost_append(AppendPath *apath)
2233 {
2234  ListCell *l;
2235 
2236  apath->path.startup_cost = 0;
2237  apath->path.total_cost = 0;
2238  apath->path.rows = 0;
2239 
2240  if (apath->subpaths == NIL)
2241  return;
2242 
2243  if (!apath->path.parallel_aware)
2244  {
2245  List *pathkeys = apath->path.pathkeys;
2246 
2247  if (pathkeys == NIL)
2248  {
2249  Path *firstsubpath = (Path *) linitial(apath->subpaths);
2250 
2251  /*
2252  * For an unordered, non-parallel-aware Append we take the startup
2253  * cost as the startup cost of the first subpath.
2254  */
2255  apath->path.startup_cost = firstsubpath->startup_cost;
2256 
2257  /* Compute rows and costs as sums of subplan rows and costs. */
2258  foreach(l, apath->subpaths)
2259  {
2260  Path *subpath = (Path *) lfirst(l);
2261 
2262  apath->path.rows += subpath->rows;
2263  apath->path.total_cost += subpath->total_cost;
2264  }
2265  }
2266  else
2267  {
2268  /*
2269  * For an ordered, non-parallel-aware Append we take the startup
2270  * cost as the sum of the subpath startup costs. This ensures
2271  * that we don't underestimate the startup cost when a query's
2272  * LIMIT is such that several of the children have to be run to
2273  * satisfy it. This might be overkill --- another plausible hack
2274  * would be to take the Append's startup cost as the maximum of
2275  * the child startup costs. But we don't want to risk believing
2276  * that an ORDER BY LIMIT query can be satisfied at small cost
2277  * when the first child has small startup cost but later ones
2278  * don't. (If we had the ability to deal with nonlinear cost
2279  * interpolation for partial retrievals, we would not need to be
2280  * so conservative about this.)
2281  *
2282  * This case is also different from the above in that we have to
2283  * account for possibly injecting sorts into subpaths that aren't
2284  * natively ordered.
2285  */
2286  foreach(l, apath->subpaths)
2287  {
2288  Path *subpath = (Path *) lfirst(l);
2289  Path sort_path; /* dummy for result of cost_sort */
2290 
2291  if (!pathkeys_contained_in(pathkeys, subpath->pathkeys))
2292  {
2293  /*
2294  * We'll need to insert a Sort node, so include costs for
2295  * that. We can use the parent's LIMIT if any, since we
2296  * certainly won't pull more than that many tuples from
2297  * any child.
2298  */
2299  cost_sort(&sort_path,
2300  NULL, /* doesn't currently need root */
2301  pathkeys,
2302  subpath->total_cost,
2303  subpath->rows,
2304  subpath->pathtarget->width,
2305  0.0,
2306  work_mem,
2307  apath->limit_tuples);
2308  subpath = &sort_path;
2309  }
2310 
2311  apath->path.rows += subpath->rows;
2312  apath->path.startup_cost += subpath->startup_cost;
2313  apath->path.total_cost += subpath->total_cost;
2314  }
2315  }
2316  }
2317  else /* parallel-aware */
2318  {
2319  int i = 0;
2320  double parallel_divisor = get_parallel_divisor(&apath->path);
2321 
2322  /* Parallel-aware Append never produces ordered output. */
2323  Assert(apath->path.pathkeys == NIL);
2324 
2325  /* Calculate startup cost. */
2326  foreach(l, apath->subpaths)
2327  {
2328  Path *subpath = (Path *) lfirst(l);
2329 
2330  /*
2331  * Append will start returning tuples when the child node having
2332  * lowest startup cost is done setting up. We consider only the
2333  * first few subplans that immediately get a worker assigned.
2334  */
2335  if (i == 0)
2336  apath->path.startup_cost = subpath->startup_cost;
2337  else if (i < apath->path.parallel_workers)
2338  apath->path.startup_cost = Min(apath->path.startup_cost,
2339  subpath->startup_cost);
2340 
2341  /*
2342  * Apply parallel divisor to subpaths. Scale the number of rows
2343  * for each partial subpath based on the ratio of the parallel
2344  * divisor originally used for the subpath to the one we adopted.
2345  * Also add the cost of partial paths to the total cost, but
2346  * ignore non-partial paths for now.
2347  */
2348  if (i < apath->first_partial_path)
2349  apath->path.rows += subpath->rows / parallel_divisor;
2350  else
2351  {
2352  double subpath_parallel_divisor;
2353 
2354  subpath_parallel_divisor = get_parallel_divisor(subpath);
2355  apath->path.rows += subpath->rows * (subpath_parallel_divisor /
2356  parallel_divisor);
2357  apath->path.total_cost += subpath->total_cost;
2358  }
2359 
2360  apath->path.rows = clamp_row_est(apath->path.rows);
2361 
2362  i++;
2363  }
2364 
2365  /* Add cost for non-partial subpaths. */
2366  apath->path.total_cost +=
2367  append_nonpartial_cost(apath->subpaths,
2368  apath->first_partial_path,
2369  apath->path.parallel_workers);
2370  }
2371 
2372  /*
2373  * Although Append does not do any selection or projection, it's not free;
2374  * add a small per-tuple overhead.
2375  */
2376  apath->path.total_cost +=
2377  cpu_tuple_cost * APPEND_CPU_COST_MULTIPLIER * apath->path.rows;
2378 }
2379 
2380 /*
2381  * cost_merge_append
2382  * Determines and returns the cost of a MergeAppend node.
2383  *
2384  * MergeAppend merges several pre-sorted input streams, using a heap that
2385  * at any given instant holds the next tuple from each stream. If there
2386  * are N streams, we need about N*log2(N) tuple comparisons to construct
2387  * the heap at startup, and then for each output tuple, about log2(N)
2388  * comparisons to replace the top entry.
2389  *
2390  * (The effective value of N will drop once some of the input streams are
2391  * exhausted, but it seems unlikely to be worth trying to account for that.)
2392  *
2393  * The heap is never spilled to disk, since we assume N is not very large.
2394  * So this is much simpler than cost_sort.
2395  *
2396  * As in cost_sort, we charge two operator evals per tuple comparison.
2397  *
2398  * 'pathkeys' is a list of sort keys
2399  * 'n_streams' is the number of input streams
2400  * 'input_startup_cost' is the sum of the input streams' startup costs
2401  * 'input_total_cost' is the sum of the input streams' total costs
2402  * 'tuples' is the number of tuples in all the streams
2403  */
2404 void
2405 cost_merge_append(Path *path, PlannerInfo *root,
2406  List *pathkeys, int n_streams,
2407  Cost input_startup_cost, Cost input_total_cost,
2408  double tuples)
2409 {
2410  Cost startup_cost = 0;
2411  Cost run_cost = 0;
2412  Cost comparison_cost;
2413  double N;
2414  double logN;
2415 
2416  /*
2417  * Avoid log(0)...
2418  */
2419  N = (n_streams < 2) ? 2.0 : (double) n_streams;
2420  logN = LOG2(N);
2421 
2422  /* Assumed cost per tuple comparison */
2423  comparison_cost = 2.0 * cpu_operator_cost;
2424 
2425  /* Heap creation cost */
2426  startup_cost += comparison_cost * N * logN;
2427 
2428  /* Per-tuple heap maintenance cost */
2429  run_cost += tuples * comparison_cost * logN;
2430 
2431  /*
2432  * Although MergeAppend does not do any selection or projection, it's not
2433  * free; add a small per-tuple overhead.
2434  */
2435  run_cost += cpu_tuple_cost * APPEND_CPU_COST_MULTIPLIER * tuples;
2436 
2437  path->startup_cost = startup_cost + input_startup_cost;
2438  path->total_cost = startup_cost + run_cost + input_total_cost;
2439 }
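/*
 * Rough worked illustration, assuming the default cpu_operator_cost = 0.0025
 * (comparison_cost = 0.005) and a hypothetical merge of 8 streams totalling
 * 80,000 tuples (log2(8) = 3):
 *
 *     heap creation:    0.005 * 8 * 3      = 0.12
 *     heap maintenance: 80000 * 0.005 * 3  = 1200.0
 *
 * plus the small APPEND_CPU_COST_MULTIPLIER per-tuple overhead.
 */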
2440 
2441 /*
2442  * cost_material
2443  * Determines and returns the cost of materializing a relation, including
2444  * the cost of reading the input data.
2445  *
2446  * If the total volume of data to materialize exceeds work_mem, we will need
2447  * to write it to disk, so the cost is much higher in that case.
2448  *
2449  * Note that here we are estimating the costs for the first scan of the
2450  * relation, so the materialization is all overhead --- any savings will
2451  * occur only on rescan, which is estimated in cost_rescan.
2452  */
2453 void
2454 cost_material(Path *path,
2455  Cost input_startup_cost, Cost input_total_cost,
2456  double tuples, int width)
2457 {
2458  Cost startup_cost = input_startup_cost;
2459  Cost run_cost = input_total_cost - input_startup_cost;
2460  double nbytes = relation_byte_size(tuples, width);
2461  long work_mem_bytes = work_mem * 1024L;
2462 
2463  path->rows = tuples;
2464 
2465  /*
2466  * Whether spilling or not, charge 2x cpu_operator_cost per tuple to
2467  * reflect bookkeeping overhead. (This rate must be more than what
2468  * cost_rescan charges for materialize, ie, cpu_operator_cost per tuple;
2469  * if it is exactly the same then there will be a cost tie between
2470  * nestloop with A outer, materialized B inner and nestloop with B outer,
2471  * materialized A inner. The extra cost ensures we'll prefer
2472  * materializing the smaller rel.) Note that this is normally a good deal
2473  * less than cpu_tuple_cost; which is OK because a Material plan node
2474  * doesn't do qual-checking or projection, so it's got less overhead than
2475  * most plan nodes.
2476  */
2477  run_cost += 2 * cpu_operator_cost * tuples;
2478 
2479  /*
2480  * If we will spill to disk, charge at the rate of seq_page_cost per page.
2481  * This cost is assumed to be evenly spread through the plan run phase,
2482  * which isn't exactly accurate but our cost model doesn't allow for
2483  * nonuniform costs within the run phase.
2484  */
2485  if (nbytes > work_mem_bytes)
2486  {
2487  double npages = ceil(nbytes / BLCKSZ);
2488 
2489  run_cost += seq_page_cost * npages;
2490  }
2491 
2492  path->startup_cost = startup_cost;
2493  path->total_cost = startup_cost + run_cost;
2494 }
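/*
 * Rough worked illustration with hypothetical sizes: if 100,000 tuples
 * materialize to about 12.8 MB (~1563 pages at BLCKSZ = 8192) and work_mem
 * is the default 4 MB, the node spills, so with the default cost parameters
 * we add
 *
 *     bookkeeping: 2 * 0.0025 * 100000  = 500.0
 *     disk:        seq_page_cost * 1563 = 1563.0
 *
 * on top of the input cost.
 */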
2495 
2496 /*
2497  * cost_memoize_rescan
2498  * Determines the estimated cost of rescanning a Memoize node.
2499  *
2500  * In order to estimate this, we must gain knowledge of how often we expect to
2501  * be called and how many distinct sets of parameters we are likely to be
2502  * called with. If we expect a good cache hit ratio, then we can set our
2503  * costs to account for that hit ratio, plus a little bit of cost for the
2504  * caching itself. Caching will not work out well if we expect to be called
2505  * with too many distinct parameter values. The worst-case here is that we
2506  * never see any parameter value twice, in which case we'd never get a cache
2507  * hit and caching would be a complete waste of effort.
2508  */
2509 static void
2510 cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath,
2511  Cost *rescan_startup_cost, Cost *rescan_total_cost)
2512 {
2513  EstimationInfo estinfo;
2514  ListCell *lc;
2515  Cost input_startup_cost = mpath->subpath->startup_cost;
2516  Cost input_total_cost = mpath->subpath->total_cost;
2517  double tuples = mpath->subpath->rows;
2518  double calls = mpath->calls;
2519  int width = mpath->subpath->pathtarget->width;
2520 
2521  double hash_mem_bytes;
2522  double est_entry_bytes;
2523  double est_cache_entries;
2524  double ndistinct;
2525  double evict_ratio;
2526  double hit_ratio;
2527  Cost startup_cost;
2528  Cost total_cost;
2529 
2530  /* available cache space */
2531  hash_mem_bytes = get_hash_memory_limit();
2532 
2533  /*
2534  * Set the number of bytes each cache entry should consume in the cache.
2535  * To provide us with better estimations on how many cache entries we can
2536  * store at once, we make a call to the executor here to ask it what
2537  * memory overheads there are for a single cache entry.
2538  */
2539  est_entry_bytes = relation_byte_size(tuples, width) +
2540  ExecEstimateCacheEntryOverheadBytes(tuples);
2541 
2542  /* include the estimated width for the cache keys */
2543  foreach(lc, mpath->param_exprs)
2544  est_entry_bytes += get_expr_width(root, (Node *) lfirst(lc));
2545 
2546  /* estimate on the upper limit of cache entries we can hold at once */
2547  est_cache_entries = floor(hash_mem_bytes / est_entry_bytes);
2548 
2549  /* estimate on the distinct number of parameter values */
2550  ndistinct = estimate_num_groups(root, mpath->param_exprs, calls, NULL,
2551  &estinfo);
2552 
2553  /*
2554  * When the estimation fell back on using a default value, it's a bit too
2555  * risky to assume that it's ok to use a Memoize node. The use of a
2556  * default could cause us to use a Memoize node when it's really
2557  * inappropriate to do so. If we see that this has been done, then we'll
2558  * assume that every call will have unique parameters, which will almost
2559  * certainly mean a MemoizePath will never survive add_path().
2560  */
2561  if ((estinfo.flags & SELFLAG_USED_DEFAULT) != 0)
2562  ndistinct = calls;
2563 
2564  /*
2565  * Since we've already estimated the maximum number of entries we can
2566  * store at once and know the estimated number of distinct values we'll be
2567  * called with, we'll take this opportunity to set the path's est_entries.
2568  * This will ultimately determine the hash table size that the executor
2569  * will use. If we leave this at zero, the executor will just choose the
2570  * size itself. Really this is not the right place to do this, but it's
2571  * convenient since everything is already calculated.
2572  */
2573  mpath->est_entries = Min(Min(ndistinct, est_cache_entries),
2574  PG_UINT32_MAX);
2575 
2576  /*
2577  * When the number of distinct parameter values is above the amount we can
2578  * store in the cache, then we'll have to evict some entries from the
2579  * cache. This is not free. Here we estimate how often we'll incur the
2580  * cost of that eviction.
2581  */
2582  evict_ratio = 1.0 - Min(est_cache_entries, ndistinct) / ndistinct;
2583 
2584  /*
2585  * In order to estimate how costly a single scan will be, we need to
2586  * attempt to estimate what the cache hit ratio will be. To do that we
2587  * must look at how many scans are estimated in total for this node and
2588  * how many of those scans we expect to get a cache hit.
2589  */
2590  hit_ratio = ((calls - ndistinct) / calls) *
2591  (est_cache_entries / Max(ndistinct, est_cache_entries));
2592 
2593  Assert(hit_ratio >= 0 && hit_ratio <= 1.0);
2594 
2595  /*
2596  * Set the total_cost accounting for the expected cache hit ratio. We
2597  * also add on a cpu_operator_cost to account for a cache lookup. This
2598  * will happen regardless of whether it's a cache hit or not.
2599  */
2600  total_cost = input_total_cost * (1.0 - hit_ratio) + cpu_operator_cost;
2601 
2602  /* Now adjust the total cost to account for cache evictions */
2603 
2604  /* Charge a cpu_tuple_cost for evicting the actual cache entry */
2605  total_cost += cpu_tuple_cost * evict_ratio;
2606 
2607  /*
2608  * Charge a 10th of cpu_operator_cost to evict every tuple in that entry.
2609  * The per-tuple eviction is really just a pfree, so charging a whole
2610  * cpu_operator_cost seems a little excessive.
2611  */
2612  total_cost += cpu_operator_cost / 10.0 * evict_ratio * tuples;
2613 
2614  /*
2615  * Now adjust for storing things in the cache, since that's not free
2616  * either. Everything must go in the cache. We don't proportion this
2617  * over any ratio, just apply it once for the scan. We charge a
2618  * cpu_tuple_cost for the creation of the cache entry and also a
2619  * cpu_operator_cost for each tuple we expect to cache.
2620  */
2621  total_cost += cpu_tuple_cost + cpu_operator_cost * tuples;
2622 
2623  /*
2624  * Getting the first row must also be proportioned according to the
2625  * expected cache hit ratio.
2626  */
2627  startup_cost = input_startup_cost * (1.0 - hit_ratio);
2628 
2629  /*
2630  * Additionally we charge a cpu_tuple_cost to account for cache lookups,
2631  * which we'll do regardless of whether it was a cache hit or not.
2632  */
2633  startup_cost += cpu_tuple_cost;
2634 
2635  *rescan_startup_cost = startup_cost;
2636  *rescan_total_cost = total_cost;
2637 }
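/*
 * Rough worked illustration with hypothetical estimates: calls = 1000,
 * ndistinct = 100, and room for est_cache_entries = 50 entries gives
 *
 *     evict_ratio = 1.0 - 50/100                       = 0.5
 *     hit_ratio   = ((1000 - 100) / 1000) * (50 / 100) = 0.45
 *
 * so a rescan is charged about 55% of the subpath's total cost, plus the
 * lookup, eviction, and cache-population charges described above.
 */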
2638 
2639 /*
2640  * cost_agg
2641  * Determines and returns the cost of performing an Agg plan node,
2642  * including the cost of its input.
2643  *
2644  * aggcosts can be NULL when there are no actual aggregate functions (i.e.,
2645  * we are using a hashed Agg node just to do grouping).
2646  *
2647  * Note: when aggstrategy == AGG_SORTED, caller must ensure that input costs
2648  * are for appropriately-sorted input.
2649  */
2650 void
2651 cost_agg(Path *path, PlannerInfo *root,
2652  AggStrategy aggstrategy, const AggClauseCosts *aggcosts,
2653  int numGroupCols, double numGroups,
2654  List *quals,
2655  Cost input_startup_cost, Cost input_total_cost,
2656  double input_tuples, double input_width)
2657 {
2658  double output_tuples;
2659  Cost startup_cost;
2660  Cost total_cost;
2661  AggClauseCosts dummy_aggcosts;
2662 
2663  /* Use all-zero per-aggregate costs if NULL is passed */
2664  if (aggcosts == NULL)
2665  {
2666  Assert(aggstrategy == AGG_HASHED);
2667  MemSet(&dummy_aggcosts, 0, sizeof(AggClauseCosts));
2668  aggcosts = &dummy_aggcosts;
2669  }
2670 
2671  /*
2672  * The transCost.per_tuple component of aggcosts should be charged once
2673  * per input tuple, corresponding to the costs of evaluating the aggregate
2674  * transfns and their input expressions. The finalCost.per_tuple component
2675  * is charged once per output tuple, corresponding to the costs of
2676  * evaluating the finalfns. Startup costs are of course charged but once.
2677  *
2678  * If we are grouping, we charge an additional cpu_operator_cost per
2679  * grouping column per input tuple for grouping comparisons.
2680  *
2681  * We will produce a single output tuple if not grouping, and a tuple per
2682  * group otherwise. We charge cpu_tuple_cost for each output tuple.
2683  *
2684  * Note: in this cost model, AGG_SORTED and AGG_HASHED have exactly the
2685  * same total CPU cost, but AGG_SORTED has lower startup cost. If the
2686  * input path is already sorted appropriately, AGG_SORTED should be
2687  * preferred (since it has no risk of memory overflow). This will happen
2688  * as long as the computed total costs are indeed exactly equal --- but if
2689  * there's roundoff error we might do the wrong thing. So be sure that
2690  * the computations below form the same intermediate values in the same
2691  * order.
2692  */
2693  if (aggstrategy == AGG_PLAIN)
2694  {
2695  startup_cost = input_total_cost;
2696  startup_cost += aggcosts->transCost.startup;
2697  startup_cost += aggcosts->transCost.per_tuple * input_tuples;
2698  startup_cost += aggcosts->finalCost.startup;
2699  startup_cost += aggcosts->finalCost.per_tuple;
2700  /* we aren't grouping */
2701  total_cost = startup_cost + cpu_tuple_cost;
2702  output_tuples = 1;
2703  }
2704  else if (aggstrategy == AGG_SORTED || aggstrategy == AGG_MIXED)
2705  {
2706  /* Here we are able to deliver output on-the-fly */
2707  startup_cost = input_startup_cost;
2708  total_cost = input_total_cost;
2709  if (aggstrategy == AGG_MIXED && !enable_hashagg)
2710  {
2711  startup_cost += disable_cost;
2712  total_cost += disable_cost;
2713  }
2714  /* calcs phrased this way to match HASHED case, see note above */
2715  total_cost += aggcosts->transCost.startup;
2716  total_cost += aggcosts->transCost.per_tuple * input_tuples;
2717  total_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
2718  total_cost += aggcosts->finalCost.startup;
2719  total_cost += aggcosts->finalCost.per_tuple * numGroups;
2720  total_cost += cpu_tuple_cost * numGroups;
2721  output_tuples = numGroups;
2722  }
2723  else
2724  {
2725  /* must be AGG_HASHED */
2726  startup_cost = input_total_cost;
2727  if (!enable_hashagg)
2728  startup_cost += disable_cost;
2729  startup_cost += aggcosts->transCost.startup;
2730  startup_cost += aggcosts->transCost.per_tuple * input_tuples;
2731  /* cost of computing hash value */
2732  startup_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
2733  startup_cost += aggcosts->finalCost.startup;
2734 
2735  total_cost = startup_cost;
2736  total_cost += aggcosts->finalCost.per_tuple * numGroups;
2737  /* cost of retrieving from hash table */
2738  total_cost += cpu_tuple_cost * numGroups;
2739  output_tuples = numGroups;
2740  }
2741 
2742  /*
2743  * Add the disk costs of hash aggregation that spills to disk.
2744  *
2745  * Groups that go into the hash table stay in memory until finalized, so
2746  * spilling and reprocessing tuples doesn't incur additional invocations
2747  * of transCost or finalCost. Furthermore, the computed hash value is
2748  * stored with the spilled tuples, so we don't incur extra invocations of
2749  * the hash function.
2750  *
2751  * Hash Agg begins returning tuples after the first batch is complete.
2752  * Accrue writes (spilled tuples) to startup_cost and to total_cost;
2753  * accrue reads only to total_cost.
2754  */
2755  if (aggstrategy == AGG_HASHED || aggstrategy == AGG_MIXED)
2756  {
2757  double pages;
2758  double pages_written = 0.0;
2759  double pages_read = 0.0;
2760  double spill_cost;
2761  double hashentrysize;
2762  double nbatches;
2763  Size mem_limit;
2764  uint64 ngroups_limit;
2765  int num_partitions;
2766  int depth;
2767 
2768  /*
2769  * Estimate number of batches based on the computed limits. If less
2770  * than or equal to one, all groups are expected to fit in memory;
2771  * otherwise we expect to spill.
2772  */
2773  hashentrysize = hash_agg_entry_size(list_length(root->aggtransinfos),
2774  input_width,
2775  aggcosts->transitionSpace);
2776  hash_agg_set_limits(hashentrysize, numGroups, 0, &mem_limit,
2777  &ngroups_limit, &num_partitions);
2778 
2779  nbatches = Max((numGroups * hashentrysize) / mem_limit,
2780  numGroups / ngroups_limit);
2781 
2782  nbatches = Max(ceil(nbatches), 1.0);
2783  num_partitions = Max(num_partitions, 2);
2784 
2785  /*
2786  * The number of partitions can change at different levels of
2787  * recursion; but for the purposes of this calculation assume it stays
2788  * constant.
2789  */
2790  depth = ceil(log(nbatches) / log(num_partitions));
2791 
2792  /*
2793  * Estimate number of pages read and written. For each level of
2794  * recursion, a tuple must be written and then later read.
2795  */
2796  pages = relation_byte_size(input_tuples, input_width) / BLCKSZ;
2797  pages_written = pages_read = pages * depth;
2798 
2799  /*
2800  * HashAgg has somewhat worse IO behavior than Sort on typical
2801  * hardware/OS combinations. Account for this with a generic penalty.
2802  */
2803  pages_read *= 2.0;
2804  pages_written *= 2.0;
2805 
2806  startup_cost += pages_written * random_page_cost;
2807  total_cost += pages_written * random_page_cost;
2808  total_cost += pages_read * seq_page_cost;
2809 
2810  /* account for CPU cost of spilling a tuple and reading it back */
2811  spill_cost = depth * input_tuples * 2.0 * cpu_tuple_cost;
2812  startup_cost += spill_cost;
2813  total_cost += spill_cost;
2814  }
2815 
2816  /*
2817  * If there are quals (HAVING quals), account for their cost and
2818  * selectivity.
2819  */
2820  if (quals)
2821  {
2822  QualCost qual_cost;
2823 
2824  cost_qual_eval(&qual_cost, quals, root);
2825  startup_cost += qual_cost.startup;
2826  total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple;
2827 
2828  output_tuples = clamp_row_est(output_tuples *
2829  clauselist_selectivity(root,
2830  quals,
2831  0,
2832  JOIN_INNER,
2833  NULL));
2834  }
2835 
2836  path->rows = output_tuples;
2837  path->startup_cost = startup_cost;
2838  path->total_cost = total_cost;
2839 }
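/*
 * Rough worked illustration of the spill estimate above, with hypothetical
 * numbers: if the groups need 8x the memory limit (nbatches = 8) and
 * num_partitions = 4, then depth = ceil(log(8)/log(4)) = 2, so each input
 * page is expected to be written and read twice; after the 2x IO penalty
 * that is 4 * pages writes charged at random_page_cost and 4 * pages reads
 * charged at seq_page_cost, plus a CPU charge of
 *
 *     spill_cost = 2 * input_tuples * 2.0 * cpu_tuple_cost
 */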
2840 
2841 /*
2842  * get_windowclause_startup_tuples
2843  * Estimate how many tuples we'll need to fetch from a WindowAgg's
2844  * subnode before we can output the first WindowAgg tuple.
2845  *
2846  * How many tuples need to be read depends on the WindowClause. For example,
2847  * a WindowClause with no PARTITION BY and no ORDER BY requires that all
2848  * subnode tuples are read and aggregated before the WindowAgg can output
2849  * anything. If there's a PARTITION BY, then we only need to look at tuples
2850  * in the first partition. Here we attempt to estimate just how many
2851  * 'input_tuples' the WindowAgg will need to read for the given WindowClause
2852  * before the first tuple can be output.
2853  */
2854 static double
2855 get_windowclause_startup_tuples(PlannerInfo *root, WindowClause *wc,
2856  double input_tuples)
2857 {
2858  int frameOptions = wc->frameOptions;
2859  double partition_tuples;
2860  double return_tuples;
2861  double peer_tuples;
2862 
2863  /*
2864  * First, figure out how many partitions there are likely to be and set
2865  * partition_tuples according to that estimate.
2866  */
2867  if (wc->partitionClause != NIL)
2868  {
2869  double num_partitions;
2870  List *partexprs = get_sortgrouplist_exprs(wc->partitionClause,
2871  root->parse->targetList);
2872 
2873  num_partitions = estimate_num_groups(root, partexprs, input_tuples,
2874  NULL, NULL);
2875  list_free(partexprs);
2876 
2877  partition_tuples = input_tuples / num_partitions;
2878  }
2879  else
2880  {
2881  /* all tuples belong to the same partition */
2882  partition_tuples = input_tuples;
2883  }
2884 
2885  /* estimate the number of tuples in each peer group */
2886  if (wc->orderClause != NIL)
2887  {
2888  double num_groups;
2889  List *orderexprs;
2890 
2891  orderexprs = get_sortgrouplist_exprs(wc->orderClause,
2892  root->parse->targetList);
2893 
2894  /* estimate how many peer groups there are in the partition */
2895  num_groups = estimate_num_groups(root, orderexprs,
2896  partition_tuples, NULL,
2897  NULL);
2898  list_free(orderexprs);
2899  peer_tuples = partition_tuples / num_groups;
2900  }
2901  else
2902  {
2903  /* no ORDER BY so only 1 tuple belongs in each peer group */
2904  peer_tuples = 1.0;
2905  }
2906 
2907  if (frameOptions & FRAMEOPTION_END_UNBOUNDED_FOLLOWING)
2908  {
2909  /* include all partition rows */
2910  return_tuples = partition_tuples;
2911  }
2912  else if (frameOptions & FRAMEOPTION_END_CURRENT_ROW)
2913  {
2914  if (frameOptions & FRAMEOPTION_ROWS)
2915  {
2916  /* just count the current row */
2917  return_tuples = 1.0;
2918  }
2919  else if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS))
2920  {
2921  /*
2922  * When in RANGE/GROUPS mode, it's more complex. If there's no
2923  * ORDER BY, then all rows in the partition are peers, otherwise
2924  * we'll need to read the first group of peers.
2925  */
2926  if (wc->orderClause == NIL)
2927  return_tuples = partition_tuples;
2928  else
2929  return_tuples = peer_tuples;
2930  }
2931  else
2932  {
2933  /*
2934  * Something new we don't support yet? This needs attention.
2935  * We'll just return 1.0 in the meantime.
2936  */
2937  Assert(false);
2938  return_tuples = 1.0;
2939  }
2940  }
2941  else if (frameOptions & FRAMEOPTION_END_OFFSET_PRECEDING)
2942  {
2943  /*
2944  * BETWEEN ... AND N PRECEDING will only need to read the WindowAgg's
2945  * subnode after N ROWS/RANGES/GROUPS. N can be 0, but not negative,
2946  * so we'll just assume only the current row needs to be read to fetch
2947  * the first WindowAgg row.
2948  */
2949  return_tuples = 1.0;
2950  }
2951  else if (frameOptions & FRAMEOPTION_END_OFFSET_FOLLOWING)
2952  {
2953  Const *endOffset = (Const *) wc->endOffset;
2954  double end_offset_value;
2955 
2956  /* try and figure out the value specified in the endOffset. */
2957  if (IsA(endOffset, Const))
2958  {
2959  if (endOffset->constisnull)
2960  {
2961  /*
2962  * NULLs are not allowed, but currently, there's no code to
2963  * error out if there's a NULL Const. We'll only discover
2964  * this during execution. For now, just pretend everything is
2965  * fine and assume that just the first row/range/group will be
2966  * needed.
2967  */
2968  end_offset_value = 1.0;
2969  }
2970  else
2971  {
2972  switch (endOffset->consttype)
2973  {
2974  case INT2OID:
2975  end_offset_value =
2976  (double) DatumGetInt16(endOffset->constvalue);
2977  break;
2978  case INT4OID:
2979  end_offset_value =
2980  (double) DatumGetInt32(endOffset->constvalue);
2981  break;
2982  case INT8OID:
2983  end_offset_value =
2984  (double) DatumGetInt64(endOffset->constvalue);
2985  break;
2986  default:
2987  end_offset_value =
2988  partition_tuples / peer_tuples *
2989  DEFAULT_INEQ_SEL;
2990  break;
2991  }
2992  }
2993  }
2994  else
2995  {
2996  /*
2997  * When the end bound is not a Const, we'll just need to guess. We
2998  * just make use of DEFAULT_INEQ_SEL.
2999  */
3000  end_offset_value =
3001  partition_tuples / peer_tuples * DEFAULT_INEQ_SEL;
3002  }
3003 
3004  if (frameOptions & FRAMEOPTION_ROWS)
3005  {
3006  /* include the N FOLLOWING and the current row */
3007  return_tuples = end_offset_value + 1.0;
3008  }
3009  else if (frameOptions & (FRAMEOPTION_RANGE | FRAMEOPTION_GROUPS))
3010  {
3011  /* include N FOLLOWING ranges/group and the initial range/group */
3012  return_tuples = peer_tuples * (end_offset_value + 1.0);
3013  }
3014  else
3015  {
3016  /*
3017  * Something new we don't support yet? This needs attention.
3018  * We'll just return 1.0 in the meantime.
3019  */
3020  Assert(false);
3021  return_tuples = 1.0;
3022  }
3023  }
3024  else
3025  {
3026  /*
3027  * Something new we don't support yet? This needs attention. We'll
3028  * just return 1.0 in the meantime.
3029  */
3030  Assert(false);
3031  return_tuples = 1.0;
3032  }
3033 
3034  if (wc->partitionClause != NIL || wc->orderClause != NIL)
3035  {
3036  /*
3037  * Cap the return value to the estimated partition tuples and account
3038  * for the extra tuple WindowAgg will need to read to confirm the next
3039  * tuple does not belong to the same partition or peer group.
3040  */
3041  return_tuples = Min(return_tuples + 1.0, partition_tuples);
3042  }
3043  else
3044  {
3045  /*
3046  * Cap the return value so it's never higher than the expected tuples
3047  * in the partition.
3048  */
3049  return_tuples = Min(return_tuples, partition_tuples);
3050  }
3051 
3052  /*
3053  * We needn't worry about any EXCLUDE options as those only exclude rows
3054  * from being aggregated, not from being read from the WindowAgg's
3055  * subnode.
3056  */
3057 
3058  return clamp_row_est(return_tuples);
3059 }
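/*
 * Rough worked illustration with hypothetical estimates: for 1,000,000 input
 * rows split into ~100 partitions (partition_tuples = 10000) with ~50 peer
 * groups per partition (peer_tuples = 200):
 *
 *     ROWS ... AND CURRENT ROW:   return_tuples = 1,   capped to Min(2, 10000)   = 2
 *     RANGE ... AND CURRENT ROW:  return_tuples = 200, capped to Min(201, 10000) = 201
 *
 * (the +1 accounts for reading one extra tuple to detect the partition or
 * peer-group boundary).
 */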
3060 
3061 /*
3062  * cost_windowagg
3063  * Determines and returns the cost of performing a WindowAgg plan node,
3064  * including the cost of its input.
3065  *
3066  * Input is assumed already properly sorted.
3067  */
3068 void
3069 cost_windowagg(Path *path, PlannerInfo *root,
3070  List *windowFuncs, WindowClause *winclause,
3071  Cost input_startup_cost, Cost input_total_cost,
3072  double input_tuples)
3073 {
3074  Cost startup_cost;
3075  Cost total_cost;
3076  double startup_tuples;
3077  int numPartCols;
3078  int numOrderCols;
3079  ListCell *lc;
3080 
3081  numPartCols = list_length(winclause->partitionClause);
3082  numOrderCols = list_length(winclause->orderClause);
3083 
3084  startup_cost = input_startup_cost;
3085  total_cost = input_total_cost;
3086 
3087  /*
3088  * Window functions are assumed to cost their stated execution cost, plus
3089  * the cost of evaluating their input expressions, per tuple. Since they
3090  * may in fact evaluate their inputs at multiple rows during each cycle,
3091  * this could be a drastic underestimate; but without a way to know how
3092  * many rows the window function will fetch, it's hard to do better. In
3093  * any case, it's a good estimate for all the built-in window functions,
3094  * so we'll just do this for now.
3095  */
3096  foreach(lc, windowFuncs)
3097  {
3098  WindowFunc *wfunc = lfirst_node(WindowFunc, lc);
3099  Cost wfunccost;
3100  QualCost argcosts;
3101 
3102  argcosts.startup = argcosts.per_tuple = 0;
3103  add_function_cost(root, wfunc->winfnoid, (Node *) wfunc,
3104  &argcosts);
3105  startup_cost += argcosts.startup;
3106  wfunccost = argcosts.per_tuple;
3107 
3108  /* also add the input expressions' cost to per-input-row costs */
3109  cost_qual_eval_node(&argcosts, (Node *) wfunc->args, root);
3110  startup_cost += argcosts.startup;
3111  wfunccost += argcosts.per_tuple;
3112 
3113  /*
3114  * Add the filter's cost to per-input-row costs. XXX We should reduce
3115  * input expression costs according to filter selectivity.
3116  */
3117  cost_qual_eval_node(&argcosts, (Node *) wfunc->aggfilter, root);
3118  startup_cost += argcosts.startup;
3119  wfunccost += argcosts.per_tuple;
3120 
3121  total_cost += wfunccost * input_tuples;
3122  }
3123 
3124  /*
3125  * We also charge cpu_operator_cost per grouping column per tuple for
3126  * grouping comparisons, plus cpu_tuple_cost per tuple for general
3127  * overhead.
3128  *
3129  * XXX this neglects costs of spooling the data to disk when it overflows
3130  * work_mem. Sooner or later that should get accounted for.
3131  */
3132  total_cost += cpu_operator_cost * (numPartCols + numOrderCols) * input_tuples;
3133  total_cost += cpu_tuple_cost * input_tuples;
3134 
3135  path->rows = input_tuples;
3136  path->startup_cost = startup_cost;
3137  path->total_cost = total_cost;
3138 
3139  /*
3140  * Also, take into account how many tuples we need to read from the
3141  * subnode in order to produce the first tuple from the WindowAgg. To do
3142  * this we proportion the run cost (total cost not including startup cost)
3143  * over the estimated startup tuples. We already included the startup
3144  * cost of the subnode, so we only need to do this when the estimated
3145  * startup tuples is above 1.0.
3146  */
3147  startup_tuples = get_windowclause_startup_tuples(root, winclause,
3148  input_tuples);
3149 
3150  if (startup_tuples > 1.0)
3151  path->startup_cost += (total_cost - startup_cost) / input_tuples *
3152  (startup_tuples - 1.0);
3153 }
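/*
 * Rough worked illustration of the final adjustment above, with hypothetical
 * numbers: if the node's run cost (total_cost - startup_cost) is 10,000 over
 * 1,000,000 input tuples and startup_tuples is estimated at 201, then
 *
 *     startup_cost += 10000 / 1e6 * (201 - 1) = 2.0
 */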
3154 
3155 /*
3156  * cost_group
3157  * Determines and returns the cost of performing a Group plan node,
3158  * including the cost of its input.
3159  *
3160  * Note: caller must ensure that input costs are for appropriately-sorted
3161  * input.
3162  */
3163 void
3164 cost_group(Path *path, PlannerInfo *root,
3165  int numGroupCols, double numGroups,
3166  List *quals,
3167  Cost input_startup_cost, Cost input_total_cost,
3168  double input_tuples)
3169 {
3170  double output_tuples;
3171  Cost startup_cost;
3172  Cost total_cost;
3173 
3174  output_tuples = numGroups;
3175  startup_cost = input_startup_cost;
3176  total_cost = input_total_cost;
3177 
3178  /*
3179  * Charge one cpu_operator_cost per comparison per input tuple. We assume
3180  * all columns get compared for most of the tuples.
3181  */
3182  total_cost += cpu_operator_cost * input_tuples * numGroupCols;
3183 
3184  /*
3185  * If there are quals (HAVING quals), account for their cost and
3186  * selectivity.
3187  */
3188  if (quals)
3189  {
3190  QualCost qual_cost;
3191 
3192  cost_qual_eval(&qual_cost, quals, root);
3193  startup_cost += qual_cost.startup;
3194  total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple;
3195 
3196  output_tuples = clamp_row_est(output_tuples *
3197  clauselist_selectivity(root,
3198  quals,
3199  0,
3200  JOIN_INNER,
3201  NULL));
3202  }
3203 
3204  path->rows = output_tuples;
3205  path->startup_cost = startup_cost;
3206  path->total_cost = total_cost;
3207 }
3208 
3209 /*
3210  * initial_cost_nestloop
3211  * Preliminary estimate of the cost of a nestloop join path.
3212  *
3213  * This must quickly produce lower-bound estimates of the path's startup and
3214  * total costs. If we are unable to eliminate the proposed path from
3215  * consideration using the lower bounds, final_cost_nestloop will be called
3216  * to obtain the final estimates.
3217  *
3218  * The exact division of labor between this function and final_cost_nestloop
3219  * is private to them, and represents a tradeoff between speed of the initial
3220  * estimate and getting a tight lower bound. We choose to not examine the
3221  * join quals here, since that's by far the most expensive part of the
3222  * calculations. The end result is that CPU-cost considerations must be
3223  * left for the second phase; and for SEMI/ANTI joins, we must also postpone
3224  * incorporation of the inner path's run cost.
3225  *
3226  * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
3227  * other data to be used by final_cost_nestloop
3228  * 'jointype' is the type of join to be performed
3229  * 'outer_path' is the outer input to the join
3230  * 'inner_path' is the inner input to the join
3231  * 'extra' contains miscellaneous information about the join
3232  */
3233 void
3234 initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace,
3235  JoinType jointype,
3236  Path *outer_path, Path *inner_path,
3237  JoinPathExtraData *extra)
3238 {
3239  Cost startup_cost = 0;
3240  Cost run_cost = 0;
3241  double outer_path_rows = outer_path->rows;
3242  Cost inner_rescan_start_cost;
3243  Cost inner_rescan_total_cost;
3244  Cost inner_run_cost;
3245  Cost inner_rescan_run_cost;
3246 
3247  /* estimate costs to rescan the inner relation */
3248  cost_rescan(root, inner_path,
3249  &inner_rescan_start_cost,
3250  &inner_rescan_total_cost);
3251 
3252  /* cost of source data */
3253 
3254  /*
3255  * NOTE: clearly, we must pay both outer and inner paths' startup_cost
3256  * before we can start returning tuples, so the join's startup cost is
3257  * their sum. We'll also pay the inner path's rescan startup cost
3258  * multiple times.
3259  */
3260  startup_cost += outer_path->startup_cost + inner_path->startup_cost;
3261  run_cost += outer_path->total_cost - outer_path->startup_cost;
3262  if (outer_path_rows > 1)
3263  run_cost += (outer_path_rows - 1) * inner_rescan_start_cost;
3264 
3265  inner_run_cost = inner_path->total_cost - inner_path->startup_cost;
3266  inner_rescan_run_cost = inner_rescan_total_cost - inner_rescan_start_cost;
3267 
3268  if (jointype == JOIN_SEMI || jointype == JOIN_ANTI ||
3269  extra->inner_unique)
3270  {
3271  /*
3272  * With a SEMI or ANTI join, or if the innerrel is known unique, the
3273  * executor will stop after the first match.
3274  *
3275  * Getting decent estimates requires inspection of the join quals,
3276  * which we choose to postpone to final_cost_nestloop.
3277  */
3278 
3279  /* Save private data for final_cost_nestloop */
3280  workspace->inner_run_cost = inner_run_cost;
3281  workspace->inner_rescan_run_cost = inner_rescan_run_cost;
3282  }
3283  else
3284  {
3285  /* Normal case; we'll scan whole input rel for each outer row */
3286  run_cost += inner_run_cost;
3287  if (outer_path_rows > 1)
3288  run_cost += (outer_path_rows - 1) * inner_rescan_run_cost;
3289  }
3290 
3291  /* CPU costs left for later */
3292 
3293  /* Public result fields */
3294  workspace->startup_cost = startup_cost;
3295  workspace->total_cost = startup_cost + run_cost;
3296  /* Save private data for final_cost_nestloop */
3297  workspace->run_cost = run_cost;
3298 }
3299 
3300 /*
3301  * final_cost_nestloop
3302  * Final estimate of the cost and result size of a nestloop join path.
3303  *
3304  * 'path' is already filled in except for the rows and cost fields
3305  * 'workspace' is the result from initial_cost_nestloop
3306  * 'extra' contains miscellaneous information about the join
3307  */
3308 void
3309 final_cost_nestloop(PlannerInfo *root, NestPath *path,
3310  JoinCostWorkspace *workspace,
3311  JoinPathExtraData *extra)
3312 {
3313  Path *outer_path = path->jpath.outerjoinpath;
3314  Path *inner_path = path->jpath.innerjoinpath;
3315  double outer_path_rows = outer_path->rows;
3316  double inner_path_rows = inner_path->rows;
3317  Cost startup_cost = workspace->startup_cost;
3318  Cost run_cost = workspace->run_cost;
3319  Cost cpu_per_tuple;
3320  QualCost restrict_qual_cost;
3321  double ntuples;
3322 
3323  /* Protect some assumptions below that rowcounts aren't zero */
3324  if (outer_path_rows <= 0)
3325  outer_path_rows = 1;
3326  if (inner_path_rows <= 0)
3327  inner_path_rows = 1;
3328  /* Mark the path with the correct row estimate */
3329  if (path->jpath.path.param_info)
3330  path->jpath.path.rows = path->jpath.path.param_info->ppi_rows;
3331  else
3332  path->jpath.path.rows = path->jpath.path.parent->rows;
3333 
3334  /* For partial paths, scale row estimate. */
3335  if (path->jpath.path.parallel_workers > 0)
3336  {
3337  double parallel_divisor = get_parallel_divisor(&path->jpath.path);
3338 
3339  path->jpath.path.rows =
3340  clamp_row_est(path->jpath.path.rows / parallel_divisor);
3341  }
3342 
3343  /*
3344  * We could include disable_cost in the preliminary estimate, but that
3345  * would amount to optimizing for the case where the join method is
3346  * disabled, which doesn't seem like the way to bet.
3347  */
3348  if (!enable_nestloop)
3349  startup_cost += disable_cost;
3350 
3351  /* cost of inner-relation source data (we already dealt with outer rel) */
3352 
3353  if (path->jpath.jointype == JOIN_SEMI || path->jpath.jointype == JOIN_ANTI ||
3354  extra->inner_unique)
3355  {
3356  /*
3357  * With a SEMI or ANTI join, or if the innerrel is known unique, the
3358  * executor will stop after the first match.
3359  */
3360  Cost inner_run_cost = workspace->inner_run_cost;
3361  Cost inner_rescan_run_cost = workspace->inner_rescan_run_cost;
3362  double outer_matched_rows;
3363  double outer_unmatched_rows;
3364  Selectivity inner_scan_frac;
3365 
3366  /*
3367  * For an outer-rel row that has at least one match, we can expect the
3368  * inner scan to stop after a fraction 1/(match_count+1) of the inner
3369  * rows, if the matches are evenly distributed. Since they probably
3370  * aren't quite evenly distributed, we apply a fuzz factor of 2.0 to
3371  * that fraction. (If we used a larger fuzz factor, we'd have to
3372  * clamp inner_scan_frac to at most 1.0; but since match_count is at
3373  * least 1, no such clamp is needed now.)
3374  */
3375  outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac);
3376  outer_unmatched_rows = outer_path_rows - outer_matched_rows;
3377  inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0);
3378 
3379  /*
3380  * Compute number of tuples processed (not number emitted!). First,
3381  * account for successfully-matched outer rows.
3382  */
3383  ntuples = outer_matched_rows * inner_path_rows * inner_scan_frac;
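 /*
  * Illustrative numbers (not part of costsize.c): with 1000 outer rows,
  * outer_match_frac = 0.2 and match_count = 3, outer_matched_rows = 200 and
  * inner_scan_frac = 2.0/4.0 = 0.5; with 50 inner rows the matched-row work
  * then amounts to 200 * 50 * 0.5 = 5000 tuples processed.
  */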
3384 
3385  /*
3386  * Now we need to estimate the actual costs of scanning the inner
3387  * relation, which may be quite a bit less than N times inner_run_cost
3388  * due to early scan stops. We consider two cases. If the inner path
3389  * is an indexscan using all the joinquals as indexquals, then an
3390  * unmatched outer row results in an indexscan returning no rows,
3391  * which is probably quite cheap. Otherwise, the executor will have
3392  * to scan the whole inner rel for an unmatched row; not so cheap.
3393  */
3394  if (has_indexed_join_quals(path))
3395  {
3396  /*
3397  * Successfully-matched outer rows will only require scanning
3398  * inner_scan_frac of the inner relation. In this case, we don't
3399  * need to charge the full inner_run_cost even when that's more
3400  * than inner_rescan_run_cost, because we can assume that none of
3401  * the inner scans ever scan the whole inner relation. So it's
3402  * okay to assume that all the inner scan executions can be
3403  * fractions of the full cost, even if materialization is reducing
3404  * the rescan cost. At this writing, it's impossible to get here
3405  * for a materialized inner scan, so inner_run_cost and
3406  * inner_rescan_run_cost will be the same anyway; but just in
3407  * case, use inner_run_cost for the first matched tuple and
3408  * inner_rescan_run_cost for additional ones.
3409  */
3410  run_cost += inner_run_cost * inner_scan_frac;
3411  if (outer_matched_rows > 1)
3412  run_cost += (outer_matched_rows - 1) * inner_rescan_run_cost * inner_scan_frac;
3413 
3414  /*
3415  * Add the cost of inner-scan executions for unmatched outer rows.
3416  * We estimate this as the same cost as returning the first tuple
3417  * of a nonempty scan. We consider that these are all rescans,
3418  * since we used inner_run_cost once already.
3419  */
3420  run_cost += outer_unmatched_rows *
3421  inner_rescan_run_cost / inner_path_rows;
3422 
3423  /*
3424  * We won't be evaluating any quals at all for unmatched rows, so
3425  * don't add them to ntuples.
3426  */
3427  }
3428  else
3429  {
3430  /*
3431  * Here, a complicating factor is that rescans may be cheaper than
3432  * first scans. If we never scan all the way to the end of the
3433  * inner rel, it might be (depending on the plan type) that we'd
3434  * never pay the whole inner first-scan run cost. However it is
3435  * difficult to estimate whether that will happen (and it could
3436  * not happen if there are any unmatched outer rows!), so be
3437  * conservative and always charge the whole first-scan cost once.
3438  * We consider this charge to correspond to the first unmatched
3439  * outer row, unless there isn't one in our estimate, in which
3440  * case blame it on the first matched row.
3441  */
3442 
3443  /* First, count all unmatched join tuples as being processed */
3444  ntuples += outer_unmatched_rows * inner_path_rows;
3445 
3446  /* Now add the forced full scan, and decrement appropriate count */
3447  run_cost += inner_run_cost;
3448  if (outer_unmatched_rows >= 1)
3449  outer_unmatched_rows -= 1;
3450  else
3451  outer_matched_rows -= 1;
3452 
3453  /* Add inner run cost for additional outer tuples having matches */
3454  if (outer_matched_rows > 0)
3455  run_cost += outer_matched_rows * inner_rescan_run_cost * inner_scan_frac;
3456 
3457  /* Add inner run cost for additional unmatched outer tuples */
3458  if (outer_unmatched_rows > 0)
3459  run_cost += outer_unmatched_rows * inner_rescan_run_cost;
3460  }
3461  }
3462  else
3463  {
3464  /* Normal-case source costs were included in preliminary estimate */
3465 
3466  /* Compute number of tuples processed (not number emitted!) */
3467  ntuples = outer_path_rows * inner_path_rows;
3468  }
3469 
3470  /* CPU costs */
3471  cost_qual_eval(&restrict_qual_cost, path->jpath.joinrestrictinfo, root);
3472  startup_cost += restrict_qual_cost.startup;
3473  cpu_per_tuple = cpu_tuple_cost + restrict_qual_cost.per_tuple;
3474  run_cost += cpu_per_tuple * ntuples;
3475 
3476  /* tlist eval costs are paid per output row, not per tuple scanned */
3477  startup_cost += path->jpath.path.pathtarget->cost.startup;
3478  run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows;
3479 
3480  path->jpath.path.startup_cost = startup_cost;
3481  path->jpath.path.total_cost = startup_cost + run_cost;
3482 }
3483 
3484 /*
3485  * initial_cost_mergejoin
3486  * Preliminary estimate of the cost of a mergejoin path.
3487  *
3488  * This must quickly produce lower-bound estimates of the path's startup and
3489  * total costs. If we are unable to eliminate the proposed path from
3490  * consideration using the lower bounds, final_cost_mergejoin will be called
3491  * to obtain the final estimates.
3492  *
3493  * The exact division of labor between this function and final_cost_mergejoin
3494  * is private to them, and represents a tradeoff between speed of the initial
3495  * estimate and getting a tight lower bound. We choose to not examine the
3496  * join quals here, except for obtaining the scan selectivity estimate which
3497  * is really essential (but fortunately, use of caching keeps the cost of
3498  * getting that down to something reasonable).
3499  * We also assume that cost_sort is cheap enough to use here.
3500  *
3501  * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
3502  * other data to be used by final_cost_mergejoin
3503  * 'jointype' is the type of join to be performed
3504  * 'mergeclauses' is the list of joinclauses to be used as merge clauses
3505  * 'outer_path' is the outer input to the join
3506  * 'inner_path' is the inner input to the join
3507  * 'outersortkeys' is the list of sort keys for the outer path
3508  * 'innersortkeys' is the list of sort keys for the inner path
3509  * 'extra' contains miscellaneous information about the join
3510  *
3511  * Note: outersortkeys and innersortkeys should be NIL if no explicit
3512  * sort is needed because the respective source path is already ordered.
3513  */
3514 void
3515 initial_cost_mergejoin(PlannerInfo *root, JoinCostWorkspace *workspace,
3516  JoinType jointype,
3517  List *mergeclauses,
3518  Path *outer_path, Path *inner_path,
3519  List *outersortkeys, List *innersortkeys,
3520  JoinPathExtraData *extra)
3521 {
3522  Cost startup_cost = 0;
3523  Cost run_cost = 0;
3524  double outer_path_rows = outer_path->rows;
3525  double inner_path_rows = inner_path->rows;
3526  Cost inner_run_cost;
3527  double outer_rows,
3528  inner_rows,
3529  outer_skip_rows,
3530  inner_skip_rows;
3531  Selectivity outerstartsel,
3532  outerendsel,
3533  innerstartsel,
3534  innerendsel;
3535  Path sort_path; /* dummy for result of cost_sort */
3536 
3537  /* Protect some assumptions below that rowcounts aren't zero */
3538  if (outer_path_rows <= 0)
3539  outer_path_rows = 1;
3540  if (inner_path_rows <= 0)
3541  inner_path_rows = 1;
3542 
3543  /*
3544  * A merge join will stop as soon as it exhausts either input stream
3545  * (unless it's an outer join, in which case the outer side has to be
3546  * scanned all the way anyway). Estimate fraction of the left and right
3547  * inputs that will actually need to be scanned. Likewise, we can
3548  * estimate the number of rows that will be skipped before the first join
3549  * pair is found, which should be factored into startup cost. We use only
3550  * the first (most significant) merge clause for this purpose. Since
3551  * mergejoinscansel() is a fairly expensive computation, we cache the
3552  * results in the merge clause RestrictInfo.
3553  */
3554  if (mergeclauses && jointype != JOIN_FULL)
3555  {
3556  RestrictInfo *firstclause = (RestrictInfo *) linitial(mergeclauses);
3557  List *opathkeys;
3558  List *ipathkeys;
3559  PathKey *opathkey;
3560  PathKey *ipathkey;
3561  MergeScanSelCache *cache;
3562 
3563  /* Get the input pathkeys to determine the sort-order details */
3564  opathkeys = outersortkeys ? outersortkeys : outer_path->pathkeys;
3565  ipathkeys = innersortkeys ? innersortkeys : inner_path->pathkeys;
3566  Assert(opathkeys);
3567  Assert(ipathkeys);
3568  opathkey = (PathKey *) linitial(opathkeys);
3569  ipathkey = (PathKey *) linitial(ipathkeys);
3570  /* debugging check */
3571  if (opathkey->pk_opfamily != ipathkey->pk_opfamily ||
3572  opathkey->pk_eclass->ec_collation != ipathkey->pk_eclass->ec_collation ||
3573  opathkey->pk_strategy != ipathkey->pk_strategy ||
3574  opathkey->pk_nulls_first != ipathkey->pk_nulls_first)
3575  elog(ERROR, "left and right pathkeys do not match in mergejoin");
3576 
3577  /* Get the selectivity with caching */
3578  cache = cached_scansel(root, firstclause, opathkey);
3579 
3580  if (bms_is_subset(firstclause->left_relids,
3581  outer_path->parent->relids))
3582  {
3583  /* left side of clause is outer */
3584  outerstartsel = cache->leftstartsel;
3585  outerendsel = cache->leftendsel;
3586  innerstartsel = cache->rightstartsel;
3587  innerendsel = cache->rightendsel;
3588  }
3589  else
3590  {
3591  /* left side of clause is inner */
3592  outerstartsel = cache->rightstartsel;
3593  outerendsel = cache->rightendsel;
3594  innerstartsel = cache->leftstartsel;
3595  innerendsel = cache->leftendsel;
3596  }
3597  if (jointype == JOIN_LEFT ||
3598  jointype == JOIN_ANTI)
3599  {
3600  outerstartsel = 0.0;
3601  outerendsel = 1.0;
3602  }
3603  else if (jointype == JOIN_RIGHT ||
3604  jointype == JOIN_RIGHT_ANTI)
3605  {
3606  innerstartsel = 0.0;
3607  innerendsel = 1.0;
3608  }
3609  }
3610  else
3611  {
3612  /* cope with clauseless or full mergejoin */
3613  outerstartsel = innerstartsel = 0.0;
3614  outerendsel = innerendsel = 1.0;
3615  }
3616 
3617  /*
3618  * Convert selectivities to row counts. We force outer_rows and
3619  * inner_rows to be at least 1, but the skip_rows estimates can be zero.
3620  */
3621  outer_skip_rows = rint(outer_path_rows * outerstartsel);
3622  inner_skip_rows = rint(inner_path_rows * innerstartsel);
3623  outer_rows = clamp_row_est(outer_path_rows * outerendsel);
3624  inner_rows = clamp_row_est(inner_path_rows * innerendsel);
3625 
3626  Assert(outer_skip_rows <= outer_rows);
3627  Assert(inner_skip_rows <= inner_rows);
3628 
3629  /*
3630  * Readjust scan selectivities to account for above rounding. This is
3631  * normally an insignificant effect, but when there are only a few rows in
3632  * the inputs, failing to do this makes for a large percentage error.
3633  */
3634  outerstartsel = outer_skip_rows / outer_path_rows;
3635  innerstartsel = inner_skip_rows / inner_path_rows;
3636  outerendsel = outer_rows / outer_path_rows;
3637  innerendsel = inner_rows / inner_path_rows;
3638 
3639  Assert(outerstartsel <= outerendsel);
3640  Assert(innerstartsel <= innerendsel);
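 /*
  * Illustrative numbers (not part of costsize.c): with 10 outer rows and an
  * outerstartsel of 0.33, outer_skip_rows = rint(3.3) = 3 and the readjusted
  * outerstartsel becomes 3/10 = 0.30, keeping the row counts and
  * selectivities consistent with each other.
  */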
3641 
3642  /* cost of source data */
3643 
3644  if (outersortkeys) /* do we need to sort outer? */
3645  {
3646  cost_sort(&sort_path,
3647  root,
3648  outersortkeys,
3649  outer_path->total_cost,
3650  outer_path_rows,
3651  outer_path->pathtarget->width,
3652  0.0,
3653  work_mem,
3654  -1.0);
3655  startup_cost += sort_path.startup_cost;
3656  startup_cost += (sort_path.total_cost - sort_path.startup_cost)
3657  * outerstartsel;
3658  run_cost += (sort_path.total_cost - sort_path.startup_cost)
3659  * (outerendsel - outerstartsel);
3660  }
3661  else
3662  {
3663  startup_cost += outer_path->startup_cost;
3664  startup_cost += (outer_path->total_cost - outer_path->startup_cost)
3665  * outerstartsel;
3666  run_cost += (outer_path->total_cost - outer_path->startup_cost)
3667  * (outerendsel - outerstartsel);
3668  }
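 /*
  * Illustrative numbers (not part of costsize.c): if the outer path has
  * startup 10 and total 110 with outerstartsel = 0.1 and outerendsel = 0.6,
  * the charges above are startup_cost += 10 + (110-10)*0.1 = 20 and
  * run_cost += (110-10)*(0.6-0.1) = 50; the last 40% of the outer input is
  * expected never to be scanned, so it is never costed.
  */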
3669 
3670  if (innersortkeys) /* do we need to sort inner? */
3671  {
3672  cost_sort(&sort_path,
3673  root,
3674  innersortkeys,
3675  inner_path->total_cost,
3676  inner_path_rows,
3677  inner_path->pathtarget->width,
3678  0.0,
3679  work_mem,
3680  -1.0);
3681  startup_cost += sort_path.startup_cost;
3682  startup_cost += (sort_path.total_cost - sort_path.startup_cost)
3683  * innerstartsel;
3684  inner_run_cost = (sort_path.total_cost - sort_path.startup_cost)
3685  * (innerendsel - innerstartsel);
3686  }
3687  else
3688  {
3689  startup_cost += inner_path->startup_cost;
3690  startup_cost += (inner_path->total_cost - inner_path->startup_cost)
3691  * innerstartsel;
3692  inner_run_cost = (inner_path->total_cost - inner_path->startup_cost)
3693  * (innerendsel - innerstartsel);
3694  }
3695 
3696  /*
3697  * We can't yet determine whether rescanning occurs, or whether
3698  * materialization of the inner input should be done. The minimum
3699  * possible inner input cost, regardless of rescan and materialization
3700  * considerations, is inner_run_cost. We include that in
3701  * workspace->total_cost, but not yet in run_cost.
3702  */
3703 
3704  /* CPU costs left for later */
3705 
3706  /* Public result fields */
3707  workspace->startup_cost = startup_cost;
3708  workspace->total_cost = startup_cost + run_cost + inner_run_cost;
3709  /* Save private data for final_cost_mergejoin */
3710  workspace->run_cost = run_cost;
3711  workspace->inner_run_cost = inner_run_cost;
3712  workspace->outer_rows = outer_rows;
3713  workspace->inner_rows = inner_rows;
3714  workspace->outer_skip_rows = outer_skip_rows;
3715  workspace->inner_skip_rows = inner_skip_rows;
3716 }
3717 
3718 /*
3719  * final_cost_mergejoin
3720  * Final estimate of the cost and result size of a mergejoin path.
3721  *
3722  * Unlike other costsize functions, this routine makes two actual decisions:
3723  * whether the executor will need to do mark/restore, and whether we should
3724  * materialize the inner path. It would be logically cleaner to build
3725  * separate paths testing these alternatives, but that would require repeating
3726  * most of the cost calculations, which are not all that cheap. Since the
3727  * choice will not affect output pathkeys or startup cost, only total cost,
3728  * there is no possibility of wanting to keep more than one path. So it seems
3729  * best to make the decisions here and record them in the path's
3730  * skip_mark_restore and materialize_inner fields.
3731  *
3732  * Mark/restore overhead is usually required, but can be skipped if we know
3733  * that the executor need find only one match per outer tuple, and that the
3734  * mergeclauses are sufficient to identify a match.
3735  *
3736  * We materialize the inner path if we need mark/restore and either the inner
3737  * path can't support mark/restore, or it's cheaper to use an interposed
3738  * Material node to handle mark/restore.
3739  *
3740  * 'path' is already filled in except for the rows and cost fields and
3741  * skip_mark_restore and materialize_inner
3742  * 'workspace' is the result from initial_cost_mergejoin
3743  * 'extra' contains miscellaneous information about the join
3744  */
3745 void
3746 final_cost_mergejoin(PlannerInfo *root, MergePath *path,
3747  JoinCostWorkspace *workspace,
3748  JoinPathExtraData *extra)
3749 {
3750  Path *outer_path = path->jpath.outerjoinpath;
3751  Path *inner_path = path->jpath.innerjoinpath;
3752  double inner_path_rows = inner_path->rows;
3753  List *mergeclauses = path->path_mergeclauses;
3754  List *innersortkeys = path->innersortkeys;
3755  Cost startup_cost = workspace->startup_cost;
3756  Cost run_cost = workspace->run_cost;
3757  Cost inner_run_cost = workspace->inner_run_cost;
3758  double outer_rows = workspace->outer_rows;
3759  double inner_rows = workspace->inner_rows;
3760  double outer_skip_rows = workspace->outer_skip_rows;
3761  double inner_skip_rows = workspace->inner_skip_rows;
3762  Cost cpu_per_tuple,
3763  bare_inner_cost,
3764  mat_inner_cost;
3765  QualCost merge_qual_cost;
3766  QualCost qp_qual_cost;
3767  double mergejointuples,
3768  rescannedtuples;
3769  double rescanratio;
3770 
3771  /* Protect some assumptions below that rowcounts aren't zero */
3772  if (inner_path_rows <= 0)
3773  inner_path_rows = 1;
3774 
3775  /* Mark the path with the correct row estimate */
3776  if (path->jpath.path.param_info)
3777  path->jpath.path.rows = path->jpath.path.param_info->ppi_rows;
3778  else
3779  path->jpath.path.rows = path->jpath.path.parent->rows;
3780 
3781  /* For partial paths, scale row estimate. */
3782  if (path->jpath.path.parallel_workers > 0)
3783  {
3784  double parallel_divisor = get_parallel_divisor(&path->jpath.path);
3785 
3786  path->jpath.path.rows =
3787  clamp_row_est(path->jpath.path.rows / parallel_divisor);
3788  }
3789 
3790  /*
3791  * We could include disable_cost in the preliminary estimate, but that
3792  * would amount to optimizing for the case where the join method is
3793  * disabled, which doesn't seem like the way to bet.
3794  */
3795  if (!enable_mergejoin)
3796  startup_cost += disable_cost;
3797 
3798  /*
3799  * Compute cost of the mergequals and qpquals (other restriction clauses)
3800  * separately.
3801  */
3802  cost_qual_eval(&merge_qual_cost, mergeclauses, root);
3803  cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root);
3804  qp_qual_cost.startup -= merge_qual_cost.startup;
3805  qp_qual_cost.per_tuple -= merge_qual_cost.per_tuple;
3806 
3807  /*
3808  * With a SEMI or ANTI join, or if the innerrel is known unique, the
3809  * executor will stop scanning for matches after the first match. When
3810  * all the joinclauses are merge clauses, this means we don't ever need to
3811  * back up the merge, and so we can skip mark/restore overhead.
3812  */
3813  if ((path->jpath.jointype == JOIN_SEMI ||
3814  path->jpath.jointype == JOIN_ANTI ||
3815  extra->inner_unique) &&
3816  (list_length(path->jpath.joinrestrictinfo) ==
3817  list_length(path->path_mergeclauses)))
3818  path->skip_mark_restore = true;
3819  else
3820  path->skip_mark_restore = false;
3821 
3822  /*
3823  * Get approx # tuples passing the mergequals. We use approx_tuple_count
3824  * here because we need an estimate done with JOIN_INNER semantics.
3825  */
3826  mergejointuples = approx_tuple_count(root, &path->jpath, mergeclauses);
3827 
3828  /*
3829  * When there are equal merge keys in the outer relation, the mergejoin
3830  * must rescan any matching tuples in the inner relation. This means
3831  * re-fetching inner tuples; we have to estimate how often that happens.
3832  *
3833  * For regular inner and outer joins, the number of re-fetches can be
3834  * estimated approximately as size of merge join output minus size of
3835  * inner relation. Assume that the distinct key values are 1, 2, ..., and
3836  * denote the number of values of each key in the outer relation as m1,
3837  * m2, ...; in the inner relation, n1, n2, ... Then we have
3838  *
3839  * size of join = m1 * n1 + m2 * n2 + ...
3840  *
3841  * number of rescanned tuples = (m1 - 1) * n1 + (m2 - 1) * n2 + ... = m1 *
3842  * n1 + m2 * n2 + ... - (n1 + n2 + ...) = size of join - size of inner
3843  * relation
3844  *
3845  * This equation works correctly for outer tuples having no inner match
3846  * (nk = 0), but not for inner tuples having no outer match (mk = 0); we
3847  * are effectively subtracting those from the number of rescanned tuples,
3848  * when we should not. Can we do better without expensive selectivity
3849  * computations?
3850  *
3851  * The whole issue is moot if we are working from a unique-ified outer
3852  * input, or if we know we don't need to mark/restore at all.
3853  */
3854  if (IsA(outer_path, UniquePath) || path->skip_mark_restore)
3855  rescannedtuples = 0;
3856  else
3857  {
3858  rescannedtuples = mergejointuples - inner_path_rows;
3859  /* Must clamp because of possible underestimate */
3860  if (rescannedtuples < 0)
3861  rescannedtuples = 0;
3862  }
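 /*
  * Worked example (not part of costsize.c): for two key values with outer
  * multiplicities m1 = 2, m2 = 3 and inner multiplicities n1 = 4, n2 = 5,
  * size of join = 2*4 + 3*5 = 23 and size of inner = 9, so
  * rescannedtuples = 23 - 9 = 14, matching the direct count
  * (m1-1)*n1 + (m2-1)*n2 = 4 + 10 = 14.
  */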
3863 
3864  /*
3865  * We'll inflate various costs this much to account for rescanning. Note
3866  * that this is to be multiplied by something involving inner_rows, or
3867  * another number related to the portion of the inner rel we'll scan.
3868  */
3869  rescanratio = 1.0 + (rescannedtuples / inner_rows);
3870 
3871  /*
3872  * Decide whether we want to materialize the inner input to shield it from
3873  * mark/restore and performing re-fetches. Our cost model for regular
3874  * re-fetches is that a re-fetch costs the same as an original fetch,
3875  * which is probably an overestimate; but on the other hand we ignore the
3876  * bookkeeping costs of mark/restore. Not clear if it's worth developing
3877  * a more refined model. So we just need to inflate the inner run cost by
3878  * rescanratio.
3879  */
3880  bare_inner_cost = inner_run_cost * rescanratio;
3881 
3882  /*
3883  * When we interpose a Material node the re-fetch cost is assumed to be
3884  * just cpu_operator_cost per tuple, independently of the underlying
3885  * plan's cost; and we charge an extra cpu_operator_cost per original
3886  * fetch as well. Note that we're assuming the materialize node will
3887  * never spill to disk, since it only has to remember tuples back to the
3888  * last mark. (If there are a huge number of duplicates, our other cost
3889  * factors will make the path so expensive that it probably won't get
3890  * chosen anyway.) So we don't use cost_rescan here.
3891  *
3892  * Note: keep this estimate in sync with create_mergejoin_plan's labeling
3893  * of the generated Material node.
3894  */
3895  mat_inner_cost = inner_run_cost +
3896  cpu_operator_cost * inner_rows * rescanratio;
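 /*
  * Illustrative comparison (not part of costsize.c), at the default
  * cpu_operator_cost of 0.0025: with inner_run_cost = 100, inner_rows = 1000
  * and rescannedtuples = 500 (so rescanratio = 1.5), bare_inner_cost = 150
  * while mat_inner_cost = 100 + 0.0025*1000*1.5 = 103.75, so materializing
  * the inner input would look cheaper here.
  */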
3897 
3898  /*
3899  * If we don't need mark/restore at all, we don't need materialization.
3900  */
3901  if (path->skip_mark_restore)
3902  path->materialize_inner = false;
3903 
3904  /*
3905  * Prefer materializing if it looks cheaper, unless the user has asked to
3906  * suppress materialization.
3907  */
3908  else if (enable_material && mat_inner_cost < bare_inner_cost)
3909  path->materialize_inner = true;
3910 
3911  /*
3912  * Even if materializing doesn't look cheaper, we *must* do it if the
3913  * inner path is to be used directly (without sorting) and it doesn't
3914  * support mark/restore.
3915  *
3916  * Since the inner side must be ordered, and only Sorts and IndexScans can
3917  * create order to begin with, and they both support mark/restore, you
3918  * might think there's no problem --- but you'd be wrong. Nestloop and
3919  * merge joins can *preserve* the order of their inputs, so they can be
3920  * selected as the input of a mergejoin, and they don't support
3921  * mark/restore at present.
3922  *
3923  * We don't test the value of enable_material here, because
3924  * materialization is required for correctness in this case, and turning
3925  * it off does not entitle us to deliver an invalid plan.
3926  */
3927  else if (innersortkeys == NIL &&
3928  !ExecSupportsMarkRestore(inner_path))
3929  path->materialize_inner = true;
3930 
3931  /*
3932  * Also, force materializing if the inner path is to be sorted and the
3933  * sort is expected to spill to disk. This is because the final merge
3934  * pass can be done on-the-fly if it doesn't have to support mark/restore.
3935  * We don't try to adjust the cost estimates for this consideration,
3936  * though.
3937  *
3938  * Since materialization is a performance optimization in this case,
3939  * rather than necessary for correctness, we skip it if enable_material is
3940  * off.
3941  */
3942  else if (enable_material && innersortkeys != NIL &&
3943  relation_byte_size(inner_path_rows,
3944  inner_path->pathtarget->width) >
3945  (work_mem * 1024L))
3946  path->materialize_inner = true;
3947  else
3948  path->materialize_inner = false;
3949 
3950  /* Charge the right incremental cost for the chosen case */
3951  if (path->materialize_inner)
3952  run_cost += mat_inner_cost;
3953  else
3954  run_cost += bare_inner_cost;
3955 
3956  /* CPU costs */
3957 
3958  /*
3959  * The number of tuple comparisons needed is approximately number of outer
3960  * rows plus number of inner rows plus number of rescanned tuples (can we
3961  * refine this?). At each one, we need to evaluate the mergejoin quals.
3962  */
3963  startup_cost += merge_qual_cost.startup;
3964  startup_cost += merge_qual_cost.per_tuple *
3965  (outer_skip_rows + inner_skip_rows * rescanratio);
3966  run_cost += merge_qual_cost.per_tuple *
3967  ((outer_rows - outer_skip_rows) +
3968  (inner_rows - inner_skip_rows) * rescanratio);
3969 
3970  /*
3971  * For each tuple that gets through the mergejoin proper, we charge
3972  * cpu_tuple_cost plus the cost of evaluating additional restriction
3973  * clauses that are to be applied at the join. (This is pessimistic since
3974  * not all of the quals may get evaluated at each tuple.)
3975  *
3976  * Note: we could adjust for SEMI/ANTI joins skipping some qual
3977  * evaluations here, but it's probably not worth the trouble.
3978  */
3979  startup_cost += qp_qual_cost.startup;
3980  cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple;
3981  run_cost += cpu_per_tuple * mergejointuples;
3982 
3983  /* tlist eval costs are paid per output row, not per tuple scanned */
3984  startup_cost += path->jpath.path.pathtarget->cost.startup;
3985  run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows;
3986 
3987  path->jpath.path.startup_cost = startup_cost;
3988  path->jpath.path.total_cost = startup_cost + run_cost;
3989 }
3990 
3991 /*
3992  * run mergejoinscansel() with caching
3993  */
3994 static MergeScanSelCache *
3995 cached_scansel(PlannerInfo *root, RestrictInfo *rinfo, PathKey *pathkey)
3996 {
3997  MergeScanSelCache *cache;
3998  ListCell *lc;
3999  Selectivity leftstartsel,
4000  leftendsel,
4001  rightstartsel,
4002  rightendsel;
4003  MemoryContext oldcontext;
4004 
4005  /* Do we have this result already? */
4006  foreach(lc, rinfo->scansel_cache)
4007  {
4008  cache = (MergeScanSelCache *) lfirst(lc);
4009  if (cache->opfamily == pathkey->pk_opfamily &&
4010  cache->collation == pathkey->pk_eclass->ec_collation &&
4011  cache->strategy == pathkey->pk_strategy &&
4012  cache->nulls_first == pathkey->pk_nulls_first)
4013  return cache;
4014  }
4015 
4016  /* Nope, do the computation */
4017  mergejoinscansel(root,
4018  (Node *) rinfo->clause,
4019  pathkey->pk_opfamily,
4020  pathkey->pk_strategy,
4021  pathkey->pk_nulls_first,
4022  &leftstartsel,
4023  &leftendsel,
4024  &rightstartsel,
4025  &rightendsel);
4026 
4027  /* Cache the result in suitably long-lived workspace */
4028  oldcontext = MemoryContextSwitchTo(root->planner_cxt);
4029 
4030  cache = (MergeScanSelCache *) palloc(sizeof(MergeScanSelCache));
4031  cache->opfamily = pathkey->pk_opfamily;
4032  cache->collation = pathkey->pk_eclass->ec_collation;
4033  cache->strategy = pathkey->pk_strategy;
4034  cache->nulls_first = pathkey->pk_nulls_first;
4035  cache->leftstartsel = leftstartsel;
4036  cache->leftendsel = leftendsel;
4037  cache->rightstartsel = rightstartsel;
4038  cache->rightendsel = rightendsel;
4039 
4040  rinfo->scansel_cache = lappend(rinfo->scansel_cache, cache);
4041 
4042  MemoryContextSwitchTo(oldcontext);
4043 
4044  return cache;
4045 }
4046 
4047 /*
4048  * initial_cost_hashjoin
4049  * Preliminary estimate of the cost of a hashjoin path.
4050  *
4051  * This must quickly produce lower-bound estimates of the path's startup and
4052  * total costs. If we are unable to eliminate the proposed path from
4053  * consideration using the lower bounds, final_cost_hashjoin will be called
4054  * to obtain the final estimates.
4055  *
4056  * The exact division of labor between this function and final_cost_hashjoin
4057  * is private to them, and represents a tradeoff between speed of the initial
4058  * estimate and getting a tight lower bound. We choose to not examine the
4059  * join quals here (other than by counting the number of hash clauses),
4060  * so we can't do much with CPU costs. We do assume that
4061  * ExecChooseHashTableSize is cheap enough to use here.
4062  *
4063  * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
4064  * other data to be used by final_cost_hashjoin
4065  * 'jointype' is the type of join to be performed
4066  * 'hashclauses' is the list of joinclauses to be used as hash clauses
4067  * 'outer_path' is the outer input to the join
4068  * 'inner_path' is the inner input to the join
4069  * 'extra' contains miscellaneous information about the join
4070  * 'parallel_hash' indicates that inner_path is partial and that a shared
4071  * hash table will be built in parallel
4072  */
4073 void
4074 initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
4075  JoinType jointype,
4076  List *hashclauses,
4077  Path *outer_path, Path *inner_path,
4078  JoinPathExtraData *extra,
4079  bool parallel_hash)
4080 {
4081  Cost startup_cost = 0;
4082  Cost run_cost = 0;
4083  double outer_path_rows = outer_path->rows;
4084  double inner_path_rows = inner_path->rows;
4085  double inner_path_rows_total = inner_path_rows;
4086  int num_hashclauses = list_length(hashclauses);
4087  int numbuckets;
4088  int numbatches;
4089  int num_skew_mcvs;
4090  size_t space_allowed; /* unused */
4091 
4092  /* cost of source data */
4093  startup_cost += outer_path->startup_cost;
4094  run_cost += outer_path->total_cost - outer_path->startup_cost;
4095  startup_cost += inner_path->total_cost;
4096 
4097  /*
4098  * Cost of computing hash function: must do it once per input tuple. We
4099  * charge one cpu_operator_cost for each column's hash function. Also,
4100  * tack on one cpu_tuple_cost per inner row, to model the costs of
4101  * inserting the row into the hashtable.
4102  *
4103  * XXX when a hashclause is more complex than a single operator, we really
4104  * should charge the extra eval costs of the left or right side, as
4105  * appropriate, here. This seems more work than it's worth at the moment.
4106  */
4107  startup_cost += (cpu_operator_cost * num_hashclauses + cpu_tuple_cost)
4108  * inner_path_rows;
4109  run_cost += cpu_operator_cost * num_hashclauses * outer_path_rows;
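 /*
  * Illustrative numbers (not part of costsize.c), at the default
  * cpu_operator_cost (0.0025) and cpu_tuple_cost (0.01): with 2 hash
  * clauses, 10000 inner rows and 50000 outer rows, the charges above are
  * startup_cost += (0.0025*2 + 0.01) * 10000 = 150 and
  * run_cost += 0.0025 * 2 * 50000 = 250.
  */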
4110 
4111  /*
4112  * If this is a parallel hash build, then the value we have for
4113  * inner_rows_total currently refers only to the rows returned by each
4114  * participant. For shared hash table size estimation, we need the total
4115  * number, so we need to undo the division.
4116  */
4117  if (parallel_hash)
4118  inner_path_rows_total *= get_parallel_divisor(inner_path);
4119 
4120  /*
4121  * Get hash table size that executor would use for inner relation.
4122  *
4123  * XXX for the moment, always assume that skew optimization will be
4124  * performed. As long as SKEW_HASH_MEM_PERCENT is small, it's not worth
4125  * trying to determine that for sure.
4126  *
4127  * XXX at some point it might be interesting to try to account for skew
4128  * optimization in the cost estimate, but for now, we don't.
4129  */
4130  ExecChooseHashTableSize(inner_path_rows_total,
4131  inner_path->pathtarget->width,
4132  true, /* useskew */
4133  parallel_hash, /* try_combined_hash_mem */
4134  outer_path->parallel_workers,
4135  &space_allowed,
4136  &numbuckets,
4137  &numbatches,
4138  &num_skew_mcvs);
4139 
4140  /*
4141  * If inner relation is too big then we will need to "batch" the join,
4142  * which implies writing and reading most of the tuples to disk an extra
4143  * time. Charge seq_page_cost per page, since the I/O should be nice and
4144  * sequential. Writing the inner rel counts as startup cost, all the rest
4145  * as run cost.
4146  */
4147  if (numbatches > 1)
4148  {
4149  double outerpages = page_size(outer_path_rows,
4150  outer_path->pathtarget->width);
4151  double innerpages = page_size(inner_path_rows,
4152  inner_path->pathtarget->width);
4153 
4154  startup_cost += seq_page_cost * innerpages;
4155  run_cost += seq_page_cost * (innerpages + 2 * outerpages);
4156  }
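 /*
  * Illustrative numbers (not part of costsize.c): if batching is needed with
  * innerpages = 100 and outerpages = 500 at the default seq_page_cost of 1.0,
  * the extra I/O charged above adds 100 to startup_cost and
  * 100 + 2*500 = 1100 to run_cost.
  */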
4157 
4158  /* CPU costs left for later */
4159 
4160  /* Public result fields */
4161  workspace->startup_cost = startup_cost;
4162  workspace->total_cost = startup_cost + run_cost;
4163  /* Save private data for final_cost_hashjoin */
4164  workspace->run_cost = run_cost;
4165  workspace->numbuckets = numbuckets;
4166  workspace->numbatches = numbatches;
4167  workspace->inner_rows_total = inner_path_rows_total;
4168 }
4169 
4170 /*
4171  * final_cost_hashjoin
4172  * Final estimate of the cost and result size of a hashjoin path.
4173  *
4174  * Note: the numbatches estimate is also saved into 'path' for use later
4175  *
4176  * 'path' is already filled in except for the rows and cost fields and
4177  * num_batches
4178  * 'workspace' is the result from initial_cost_hashjoin
4179  * 'extra' contains miscellaneous information about the join
4180  */
4181 void
4182 final_cost_hashjoin(PlannerInfo *root, HashPath *path,
4183  JoinCostWorkspace *workspace,
4184  JoinPathExtraData *extra)
4185 {
4186  Path *outer_path = path->jpath.outerjoinpath;
4187  Path *inner_path = path->jpath.innerjoinpath;
4188  double outer_path_rows = outer_path->rows;
4189  double inner_path_rows = inner_path->rows;
4190  double inner_path_rows_total = workspace->inner_rows_total;
4191  List *hashclauses = path->path_hashclauses;
4192  Cost startup_cost = workspace->startup_cost;
4193  Cost run_cost = workspace->run_cost;
4194  int numbuckets = workspace->numbuckets;
4195  int numbatches = workspace->numbatches;
4196  Cost cpu_per_tuple;
4197  QualCost hash_qual_cost;
4198  QualCost qp_qual_cost;
4199  double hashjointuples;
4200  double virtualbuckets;
4201  Selectivity innerbucketsize;
4202  Selectivity innermcvfreq;
4203  ListCell *hcl;
4204 
4205  /* Mark the path with the correct row estimate */
4206  if (path->jpath.path.param_info)
4207  path->jpath.path.rows = path->jpath.path.param_info->ppi_rows;
4208  else
4209  path->jpath.path.rows = path->jpath.path.parent->rows;
4210 
4211  /* For partial paths, scale row estimate. */
4212  if (path->jpath.path.parallel_workers > 0)
4213  {
4214  double parallel_divisor = get_parallel_divisor(&path->jpath.path);
4215 
4216  path->jpath.path.rows =
4217  clamp_row_est(path->jpath.path.rows / parallel_divisor);
4218  }
4219 
4220  /*
4221  * We could include disable_cost in the preliminary estimate, but that
4222  * would amount to optimizing for the case where the join method is
4223  * disabled, which doesn't seem like the way to bet.
4224  */
4225  if (!enable_hashjoin)
4226  startup_cost += disable_cost;
4227 
4228  /* mark the path with estimated # of batches */
4229  path->num_batches = numbatches;
4230 
4231  /* store the total number of tuples (sum of partial row estimates) */
4232  path->inner_rows_total = inner_path_rows_total;
4233 
4234  /* and compute the number of "virtual" buckets in the whole join */
4235  virtualbuckets = (double) numbuckets * (double) numbatches;
4236 
4237  /*
4238  * Determine bucketsize fraction and MCV frequency for the inner relation.
4239  * We use the smallest bucketsize or MCV frequency estimated for any
4240  * individual hashclause; this is undoubtedly conservative.
4241  *
4242  * BUT: if inner relation has been unique-ified, we can assume it's good
4243  * for hashing. This is important both because it's the right answer, and
4244  * because we avoid contaminating the cache with a value that's wrong for
4245  * non-unique-ified paths.
4246  */
4247  if (IsA(inner_path, UniquePath))
4248  {
4249  innerbucketsize = 1.0 / virtualbuckets;
4250  innermcvfreq = 0.0;
4251  }
4252  else
4253  {
4254  innerbucketsize = 1.0;
4255  innermcvfreq = 1.0;
4256  foreach(hcl, hashclauses)
4257  {
4258  RestrictInfo *restrictinfo = lfirst_node(RestrictInfo, hcl);
4259  Selectivity thisbucketsize;
4260  Selectivity thismcvfreq;
4261 
4262  /*
4263  * First we have to figure out which side of the hashjoin clause
4264  * is the inner side.
4265  *
4266  * Since we tend to visit the same clauses over and over when
4267  * planning a large query, we cache the bucket stats estimates in
4268  * the RestrictInfo node to avoid repeated lookups of statistics.
4269  */
4270  if (bms_is_subset(restrictinfo->right_relids,
4271  inner_path->parent->relids))
4272  {
4273  /* righthand side is inner */
4274  thisbucketsize = restrictinfo->right_bucketsize;
4275  if (thisbucketsize < 0)
4276  {
4277  /* not cached yet */
4278  estimate_hash_bucket_stats(root,
4279  get_rightop(restrictinfo->clause),
4280  virtualbuckets,
4281  &restrictinfo->right_mcvfreq,
4282  &restrictinfo->right_bucketsize);
4283  thisbucketsize = restrictinfo->right_bucketsize;
4284  }
4285  thismcvfreq = restrictinfo->right_mcvfreq;
4286  }
4287  else
4288  {
4289  Assert(bms_is_subset(restrictinfo->left_relids,
4290  inner_path->parent->relids));
4291  /* lefthand side is inner */
4292  thisbucketsize = restrictinfo->left_bucketsize;
4293  if (thisbucketsize < 0)
4294  {
4295  /* not cached yet */
4296  estimate_hash_bucket_stats(root,
4297  get_leftop(restrictinfo->clause),
4298  virtualbuckets,
4299  &restrictinfo->left_mcvfreq,
4300  &restrictinfo->left_bucketsize);
4301  thisbucketsize = restrictinfo->left_bucketsize;
4302  }
4303  thismcvfreq = restrictinfo->left_mcvfreq;
4304  }
4305 
4306  if (innerbucketsize > thisbucketsize)
4307  innerbucketsize = thisbucketsize;
4308  if (innermcvfreq > thismcvfreq)
4309  innermcvfreq = thismcvfreq;
4310  }
4311  }
4312 
4313  /*
4314  * If the bucket holding the inner MCV would exceed hash_mem, we don't
4315  * want to hash unless there is really no other alternative, so apply
4316  * disable_cost. (The executor normally copes with excessive memory usage
4317  * by splitting batches, but obviously it cannot separate equal values
4318  * that way, so it will be unable to drive the batch size below hash_mem
4319  * when this is true.)
4320  */
4321  if (relation_byte_size(clamp_row_est(inner_path_rows * innermcvfreq),
4322  inner_path->pathtarget->width) > get_hash_memory_limit())
4323  startup_cost += disable_cost;
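 /*
  * Illustrative sketch (not part of costsize.c): if the most common inner
  * value accounts for 10% of 10 million inner rows, the bucket holding it
  * must keep about a million tuples in memory at once; at any plausible
  * tuple width that far exceeds the default hash memory limit (work_mem
  * times hash_mem_multiplier), so the hashjoin path is penalized with
  * disable_cost here.
  */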
4324 
4325  /*
4326  * Compute cost of the hashquals and qpquals (other restriction clauses)
4327  * separately.
4328  */
4329  cost_qual_eval(&hash_qual_cost, hashclauses, root);
4330  cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root);
4331  qp_qual_cost.startup -= hash_qual_cost.startup;
4332  qp_qual_cost.per_tuple -= hash_qual_cost.per_tuple;
4333 
4334  /* CPU costs */
4335 
4336  if (path->jpath.jointype == JOIN_SEMI ||
4337  path->jpath.jointype == JOIN_ANTI ||
4338  extra->inner_unique)
4339  {
4340  double outer_matched_rows;
4341  Selectivity inner_scan_frac;
4342 
4343  /*
4344  * With a SEMI or ANTI join, or if the innerrel is known unique, the
4345  * executor will stop after the first match.
4346  *
4347  * For an outer-rel row that has at least one match, we can expect the
4348  * bucket scan to stop after a fraction 1/(match_count+1) of the
4349  * bucket's rows, if the matches are evenly distributed. Since they
4350  * probably aren't quite evenly distributed, we apply a fuzz factor of
4351  * 2.0 to that fraction. (If we used a larger fuzz factor, we'd have
4352  * to clamp inner_scan_frac to at most 1.0; but since match_count is
4353  * at least 1, no such clamp is needed now.)
4354  */
4355  outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac);
4356  inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0);
4357 
4358  startup_cost += hash_qual_cost.startup;
4359  run_cost += hash_qual_cost.per_tuple * outer_matched_rows *
4360  clamp_row_est(inner_path_rows * innerbucketsize * inner_scan_frac) * 0.5;
4361 
4362  /*
4363  * For unmatched outer-rel rows, the picture is quite a lot different.
4364  * In the first place, there is no reason to assume that these rows
4365  * preferentially hit heavily-populated buckets; instead assume they
4366  * are uncorrelated with the inner distribution and so they see an
4367  * average bucket size of inner_path_rows / virtualbuckets. In the
4368  * second place, it seems likely that they will have few if any exact
4369  * hash-code matches and so very few of the tuples in the bucket will
4370  * actually require eval of the hash quals. We don't have any good
4371  * way to estimate how many will, but for the moment assume that the
4372  * effective cost per bucket entry is one-tenth what it is for
4373  * matchable tuples.
4374  */
4375  run_cost += hash_qual_cost.per_tuple *
4376  (outer_path_rows - outer_matched_rows) *
4377  clamp_row_est(inner_path_rows / virtualbuckets) * 0.05;
4378 
4379  /* Get # of tuples that will pass the basic join */
4380  if (path->jpath.jointype == JOIN_ANTI)
4381  hashjointuples = outer_path_rows - outer_matched_rows;
4382  else
4383  hashjointuples = outer_matched_rows;
4384  }
4385  else
4386  {
4387  /*
4388  * The number of tuple comparisons needed is the number of outer
4389  * tuples times the typical number of tuples in a hash bucket, which
4390  * is the inner relation size times its bucketsize fraction. At each
4391  * one, we need to evaluate the hashjoin quals. But actually,
4392  * charging the full qual eval cost at each tuple is pessimistic,
4393  * since we don't evaluate the quals unless the hash values match
4394  * exactly. For lack of a better idea, halve the cost estimate to
4395  * allow for that.
4396  */
4397  startup_cost += hash_qual_cost.startup;
4398  run_cost += hash_qual_cost.per_tuple * outer_path_rows *
4399  clamp_row_est(inner_path_rows * innerbucketsize) * 0.5;
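 /*
  * Illustrative numbers (not part of costsize.c): with 50000 outer rows, a
  * 100000-row inner relation and an innerbucketsize of 0.001, each outer
  * tuple is compared against about 100 bucket entries; halving that gives
  * 50000 * 100 * 0.5 = 2.5 million qual evaluations charged at
  * hash_qual_cost.per_tuple each.
  */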
4400 
4401  /*
4402  * Get approx # tuples passing the hashquals. We use
4403  * approx_tuple_count here because we need an estimate done with
4404  * JOIN_INNER semantics.
4405  */
4406  hashjointuples = approx_tuple_count(root, &path->jpath, hashclauses);
4407  }
4408 
4409  /*
4410  * For each tuple that gets through the hashjoin proper, we charge
4411  * cpu_tuple_cost plus the cost of evaluating additional restriction
4412  * clauses that are to be applied at the join. (This is pessimistic since
4413  * not all of the quals may get evaluated at each tuple.)
4414  */
4415  startup_cost += qp_qual_cost.startup;
4416  cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple;
4417  run_cost += cpu_per_tuple * hashjointuples;
4418 
4419  /* tlist eval costs are paid per output row, not per tuple scanned */
4420  startup_cost += path->jpath.path.pathtarget->cost.startup;
4421  run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows;
4422 
4423  path->jpath.path.startup_cost = startup_cost;
4424  path->jpath.path.total_cost = startup_cost + run_cost;
4425 }
4426 
4427 
4428 /*
4429  * cost_subplan
4430  * Figure the costs for a SubPlan (or initplan).
4431  *
4432  * Note: we could dig the subplan's Plan out of the root list, but in practice
4433  * all callers have it handy already, so we make them pass it.
4434  */
4435 void
4436 cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan)
4437 {
4438  QualCost sp_cost;
4439 
4440  /* Figure any cost for evaluating the testexpr */
4441  cost_qual_eval(&sp_cost,
4442  make_ands_implicit((Expr *) subplan->testexpr),
4443  root);
4444 
4445  if (subplan->useHashTable)
4446  {
4447  /*
4448  * If we are using a hash table for the subquery outputs, then the
4449  * cost of evaluating the query is a one-time cost. We charge one
4450  * cpu_operator_cost per tuple for the work of loading the hashtable,
4451  * too.
4452  */
4453  sp_cost.startup += plan->total_cost +
4454  cpu_operator_cost * plan->plan_rows;
4455 
4456  /*
4457  * The per-tuple costs include the cost of evaluating the lefthand
4458  * expressions, plus the cost of probing the hashtable. We already
4459  * accounted for the lefthand expressions as part of the testexpr, and
4460  * will also have counted one cpu_operator_cost for each comparison
4461  * operator. That is probably too low for the probing cost, but it's
4462  * hard to make a better estimate, so live with it for now.
4463  */
4464  }
4465  else
4466  {
4467  /*
4468  * Otherwise we will be rescanning the subplan output on each
4469  * evaluation. We need to estimate how much of the output we will
4470  * actually need to scan. NOTE: this logic should agree with the
4471  * tuple_fraction estimates used by make_subplan() in
4472  * plan/subselect.c.
4473  */
4474  Cost plan_run_cost = plan->total_cost - plan->startup_cost;
4475 
4476  if (subplan->subLinkType == EXISTS_SUBLINK)
4477  {
4478  /* we only need to fetch 1 tuple; clamp to avoid zero divide */
4479  sp_cost.per_tuple += plan_run_cost / clamp_row_est(plan->plan_rows);
4480  }
4481  else if (subplan->subLinkType == ALL_SUBLINK ||
4482  subplan->subLinkType == ANY_SUBLINK)
4483  {
4484  /* assume we need 50% of the tuples */
4485  sp_cost.per_tuple += 0.50 * plan_run_cost;
4486  /* also charge a cpu_operator_cost per row examined */
4487  sp_cost.per_tuple += 0.50 * plan->plan_rows * cpu_operator_cost;
4488  }
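 /*
  * Illustrative numbers (not part of costsize.c): for an ANY sublink whose
  * subplan has plan_run_cost = 100 and plan_rows = 1000, the charges above
  * are 0.50*100 = 50 plus 0.50*1000*0.0025 = 1.25 (at the default
  * cpu_operator_cost), i.e. about 51.25 per outer call.
  */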
4489  else
4490  {
4491  /* assume we need all tuples */
4492  sp_cost.per_tuple += plan_run_cost;
4493  }
4494 
4495  /*
4496  * Also account for subplan's startup cost. If the subplan is
4497  * uncorrelated or undirect correlated, AND its topmost node is one
4498  * that materializes its output, assume that we'll only need to pay
4499  * its startup cost once; otherwise assume we pay the startup cost
4500  * every time.
4501  */
4502  if (subplan->parParam == NIL &&
4503  ExecMaterializesOutput(nodeTag(plan)))
4504  sp_cost.startup += plan->startup_cost;
4505  else
4506  sp_cost.per_tuple += plan->startup_cost;
4507  }
4508 
4509  subplan->startup_cost = sp_cost.startup;
4510  subplan->per_call_cost = sp_cost.per_tuple;
4511 }
4512 
4513 
4514 /*
4515  * cost_rescan
4516  * Given a finished Path, estimate the costs of rescanning it after
4517  * having done so the first time. For some Path types a rescan is
4518  * cheaper than an original scan (if no parameters change), and this
4519  * function embodies knowledge about that. The default is to return
4520  * the same costs stored in the Path. (Note that the cost estimates
4521  * actually stored in Paths are always for first scans.)
4522  *
4523  * This function is not currently intended to model effects such as rescans
4524  * being cheaper due to disk block caching; what we are concerned with is
4525  * plan types wherein the executor caches results explicitly, or doesn't
4526  * redo startup calculations, etc.
4527  */
4528 static void
4529 cost_rescan(PlannerInfo *root, Path *path,
4530  Cost *rescan_startup_cost, /* output parameters */
4531  Cost *rescan_total_cost)
4532 {
4533  switch (path->pathtype)
4534  {
4535  case T_FunctionScan:
4536 
4537  /*
4538  * Currently, nodeFunctionscan.c always executes the function to
4539  * completion before returning any rows, and caches the results in
4540  * a tuplestore. So the function eval cost is all startup cost
4541  * and isn't paid over again on rescans. However, all run costs
4542  * will be paid over again.
4543  */
4544  *rescan_startup_cost = 0;
4545  *rescan_total_cost = path->total_cost - path->startup_cost;
4546  break;
4547  case T_HashJoin:
4548 
4549  /*
4550  * If it's a single-batch join, we don't need to rebuild the hash
4551  * table during a rescan.
4552  */
4553  if (((HashPath *) path)->num_batches == 1)
4554  {
4555  /* Startup cost is exactly the cost of hash table building */
4556  *rescan_startup_cost = 0;
4557  *rescan_total_cost = path->total_cost - path->startup_cost;
4558  }
4559  else
4560  {
4561  /* Otherwise, no special treatment */
4562  *rescan_startup_cost = path->startup_cost;
4563  *rescan_total_cost = path->total_cost;
4564  }
4565  break;
4566  case T_CteScan:
4567  case T_WorkTableScan:
4568  {
4569  /*
4570  * These plan types materialize their final result in a
4571  * tuplestore or tuplesort object. So the rescan cost is only
4572  * cpu_tuple_cost per tuple, unless the result is large enough
4573  * to spill to disk.
4574  */
4575  Cost run_cost = cpu_tuple_cost * path->rows;
4576  double nbytes = relation_byte_size(path->rows,
4577  path->pathtarget->width);
4578  long work_mem_bytes = work_mem * 1024L;
4579 
4580  if (nbytes > work_mem_bytes)
4581  {
4582  /* It will spill, so account for re-read cost */
4583  double npages = ceil(nbytes / BLCKSZ);
4584 
4585  run_cost += seq_page_cost * npages;
4586  }
4587  *rescan_startup_cost = 0;
4588  *rescan_total_cost = run_cost;
4589  }
4590  break;
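 /*
  * Illustrative sketch (not part of costsize.c): a 100000-row CTE result at
  * roughly 100 bytes per row is about 10MB, which exceeds a default 4MB
  * work_mem; the rescan cost then includes re-reading those spilled pages at
  * seq_page_cost on top of the cpu_tuple_cost-per-tuple charge.
  */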
4591  case T_Material:
4592  case T_Sort:
4593  {
4594  /*
4595  * These plan types not only materialize their results, but do
4596  * not implement qual filtering or projection. So they are
4597  * even cheaper to rescan than the ones above. We charge only
4598  * cpu_operator_cost per tuple. (Note: keep that in sync with
4599  * the run_cost charge in cost_sort, and also see comments in
4600  * cost_material before you change it.)
4601  */
4602  Cost run_cost = cpu_operator_cost * path->rows;
4603  double nbytes = relation_byte_size(path->rows,
4604  path->pathtarget->width);
4605  long work_mem_bytes = work_mem * 1024L;
4606 
4607  if (nbytes > work_mem_bytes)
4608  {
4609  /* It will spill, so account for re-read cost */
4610  double npages = ceil(nbytes / BLCKSZ);
4611 
4612  run_cost += seq_page_cost * npages;
4613  }
4614  *rescan_startup_cost = 0;
4615  *rescan_total_cost = run_cost;
4616  }
4617  break;
4618  case T_Memoize:
4619  /* All the hard work is done by cost_memoize_rescan */
4620  cost_memoize_rescan(root, (MemoizePath *) path,
4621  rescan_startup_cost, rescan_total_cost);
4622  break;
4623  default:
4624  *rescan_startup_cost = path->startup_cost;
4625  *rescan_total_cost = path->total_cost;
4626  break;
4627  }
4628 }
4629 
4630 
4631 /*
4632  * cost_qual_eval
4633  * Estimate the CPU costs of evaluating a WHERE clause.
4634  * The input can be either an implicitly-ANDed list of boolean
4635  * expressions, or a list of RestrictInfo nodes. (The latter is
4636  * preferred since it allows caching of the results.)
4637  * The result includes both a one-time (startup) component,
4638  * and a per-evaluation component.
4639  */
4640 void
4641 cost_qual_eval(QualCost *cost, List *quals, PlannerInfo *root)
4642 {
4643  cost_qual_eval_context context;
4644  ListCell *l;
4645 
4646  context.root = root;
4647  context.total.startup = 0;
4648  context.total.per_tuple = 0;
4649 
4650  /* We don't charge any cost for the implicit ANDing at top level ... */
4651 
4652  foreach(l, quals)
4653  {
4654  Node *qual = (Node *) lfirst(l);
4655 
4656  cost_qual_eval_walker(qual, &context);
4657  }
4658 
4659  *cost = context.total;
4660 }
4661 
4662 /*
4663  * cost_qual_eval_node
4664  * As above, for a single RestrictInfo or expression.
4665  */
4666 void
4667 cost_qual_eval_node(QualCost *cost, Node *qual, PlannerInfo *root)
4668 {
4669  cost_qual_eval_context context;
4670 
4671  context.root = root;
4672  context.total.startup = 0;
4673  context.total.per_tuple = 0;
4674 
4675  cost_qual_eval_walker(qual, &context);
4676 
4677  *cost = context.total;
4678 }
4679 
4680 static bool
4681 cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
4682 {
4683  if (node == NULL)
4684  return false;
4685 
4686  /*
4687  * RestrictInfo nodes contain an eval_cost field reserved for this
4688  * routine's use, so that it's not necessary to evaluate the qual clause's
4689  * cost more than once. If the clause's cost hasn't been computed yet,
4690  * the field's startup value will contain -1.
4691  */
4692  if (IsA(node, RestrictInfo))
4693  {
4694  RestrictInfo *rinfo = (RestrictInfo *) node;
4695 
4696  if (rinfo->eval_cost.startup < 0)
4697  {
4698  cost_qual_eval_context locContext;
4699 
4700  locContext.root = context->root;
4701  locContext.total.startup = 0;
4702  locContext.total.per_tuple = 0;
4703 
4704  /*
4705  * For an OR clause, recurse into the marked-up tree so that we
4706  * set the eval_cost for contained RestrictInfos too.
4707  */
4708  if (rinfo->orclause)
4709  cost_qual_eval_walker((Node *) rinfo->orclause, &locContext);
4710  else
4711  cost_qual_eval_walker((Node *) rinfo->clause, &locContext);
4712 
4713  /*
4714  * If the RestrictInfo is marked pseudoconstant, it will be tested
4715  * only once, so treat its cost as all startup cost.
4716  */
4717  if (rinfo->pseudoconstant)
4718  {
4719  /* count one execution during startup */
4720  locContext.total.startup += locContext.total.per_tuple;
4721  locContext.total.per_tuple = 0;
4722  }
4723  rinfo->eval_cost = locContext.total;
4724  }
4725  context->total.startup += rinfo->eval_cost.startup;
4726  context->total.per_tuple += rinfo->eval_cost.per_tuple;
4727  /* do NOT recurse into children */
4728  return false;
4729  }
4730 
4731  /*
4732  * For each operator or function node in the given tree, we charge the
4733  * estimated execution cost given by pg_proc.procost (remember to multiply
4734  * this by cpu_operator_cost).
4735  *
4736  * Vars and Consts are charged zero, and so are boolean operators (AND,
4737  * OR, NOT). Simplistic, but a lot better than no model at all.
4738  *
4739  * Should we try to account for the possibility of short-circuit
4740  * evaluation of AND/OR? Probably *not*, because that would make the
4741  * results depend on the clause ordering, and we are not in any position
4742  * to expect that the current ordering of the clauses is the one that's
4743  * going to end up being used. The above per-RestrictInfo caching would
4744  * not mix well with trying to re-order clauses anyway.
4745  *
4746  * Another issue that is entirely ignored here is that if a set-returning
4747  * function is below top level in the tree, the functions/operators above
4748  * it will need to be evaluated multiple times. In practical use, such
4749  * cases arise so seldom as to not be worth the added complexity needed;
4750  * moreover, since our rowcount estimates for functions tend to be pretty
4751  * phony, the results would also be pretty phony.
4752  */
4753  if (IsA(node, FuncExpr))
4754  {
4755  add_function_cost(context->root, ((FuncExpr *) node)->funcid, node,
4756  &context->total);
4757  }
4758  else if (IsA(node, OpExpr) ||
4759  IsA(node, DistinctExpr) ||
4760  IsA(node, NullIfExpr))
4761  {
4762  /* rely on struct equivalence to treat these all alike */
4763  set_opfuncid((OpExpr *) node);
4764  add_function_cost(context->root, ((OpExpr *) node)->opfuncid, node,
4765  &context->total);
4766  }
4767  else if (IsA(node, ScalarArrayOpExpr))
4768  {
4769  ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) node;
4770  Node *arraynode = (Node *) lsecond(saop->args);
4771  QualCost sacosts;
4772  QualCost hcosts;
4773  double estarraylen = estimate_array_length(context->root, arraynode);
4774 
4775  set_sa_opfuncid(saop);
4776  sacosts.startup = sacosts.per_tuple = 0;
4777  add_function_cost(context->root, saop->opfuncid, NULL,
4778  &sacosts);
4779 
4780  if (OidIsValid(saop->hashfuncid))
4781  {
4782  /* Handle costs for hashed ScalarArrayOpExpr */
4783  hcosts.startup = hcosts.per_tuple = 0;
4784 
4785  add_function_cost(context->root, saop->hashfuncid, NULL, &hcosts);
4786  context->total.startup += sacosts.startup + hcosts.startup;
4787 
4788  /* Estimate the cost of building the hashtable. */
4789  context->total.startup += estarraylen * hcosts.per_tuple;
4790 
4791  /*
4792  * XXX should we charge a little bit for sacosts.per_tuple when
4793  * building the table, or is it ok to assume there will be zero
4794  * hash collisions?
4795  */
4796 
4797  /*
4798  * Charge for hashtable lookups. Charge a single hash and a
4799  * single comparison.
4800  */
4801  context->total.per_tuple += hcosts.per_tuple + sacosts.per_tuple;
4802  }
4803  else
4804  {
4805  /*
4806  * Estimate that the operator will be applied to about half of the
4807  * array elements before the answer is determined.
4808  */
4809  context->total.startup += sacosts.startup;
4810  context->total.per_tuple += sacosts.per_tuple *
4811  estimate_array_length(context->root, arraynode) * 0.5;
4812  }
4813  }
4814  else if (IsA(node, Aggref) ||
4815  IsA(node, WindowFunc))
4816  {
4817  /*
4818  * Aggref and WindowFunc nodes are (and should be) treated like Vars,
4819  * ie, zero execution cost in the current model, because they behave
4820  * essentially like Vars at execution. We disregard the costs of
4821  * their input expressions for the same reason. The actual execution
4822  * costs of the aggregate/window functions and their arguments have to
4823  * be factored into plan-node-specific costing of the Agg or WindowAgg
4824  * plan node.
4825  */
4826  return false; /* don't recurse into children */
4827  }
4828  else if (IsA(node, GroupingFunc))
4829  {
4830  /* Treat this as having cost 1 */
4831  context->total.per_tuple += cpu_operator_cost;
4832  return false; /* don't recurse into children */
4833  }
4834  else if (IsA(node, CoerceViaIO))
4835  {
4836  CoerceViaIO *iocoerce = (CoerceViaIO *) node;
4837  Oid iofunc;
4838  Oid typioparam;
4839  bool typisvarlena;
4840 
4841  /* check the result type's input function */
4842  getTypeInputInfo(iocoerce->resulttype,
4843  &iofunc, &typioparam);
4844  add_function_cost(context->root, iofunc, NULL,
4845  &context->total);
4846  /* check the input type's output function */
4847  getTypeOutputInfo(exprType((Node *) iocoerce->arg),
4848  &iofunc, &typisvarlena);
4849  add_function_cost(context->root, iofunc, NULL,
4850  &context->total);
4851  }
4852  else if (IsA(node, ArrayCoerceExpr))
4853  {
4854  ArrayCoerceExpr *acoerce = (ArrayCoerceExpr *) node;
4855  QualCost perelemcost;
4856 
4857  cost_qual_eval_node(&perelemcost, (Node *) acoerce->elemexpr,
4858  context->root);
4859  context->total.startup += perelemcost.startup;
4860  if (perelemcost.per_tuple > 0)
4861  context->total.per_tuple += perelemcost.per_tuple *
4862  estimate_array_length(context->root, (Node *) acoerce->arg);
4863  }
4864  else if (IsA(node, RowCompareExpr))
4865  {
4866  /* Conservatively assume we will check all the columns */
4867  RowCompareExpr *rcexpr = (RowCompareExpr *) node;
4868  ListCell *lc;
4869 
4870  foreach(lc, rcexpr->opnos)
4871  {
4872  Oid opid = lfirst_oid(lc);
4873 
4874  add_function_cost(context->root, get_opcode(opid), NULL,
4875  &context->total);
4876  }
4877  }
4878  else if (IsA(node, MinMaxExpr) ||
4879  IsA(node, SQLValueFunction) ||
4880  IsA(node, XmlExpr) ||
4881  IsA(node, CoerceToDomain) ||
4882  IsA(node, NextValueExpr))
4883  {
4884  /* Treat all these as having cost 1 */
4885  context->total.per_tuple += cpu_operator_cost;
4886  }
4887  else if (IsA(node, CurrentOfExpr))
4888  {
4889  /* Report high cost to prevent selection of anything but TID scan */
4890  context->total.startup += disable_cost;
4891  }
4892  else if (IsA(node, SubLink))
4893  {
4894  /* This routine should not be applied to un-planned expressions */
4895  elog(ERROR, "cannot handle unplanned sub-select");
4896  }
4897  else if (IsA(node, SubPlan))
4898  {
4899  /*
4900  * A subplan node in an expression typically indicates that the
4901  * subplan will be executed on each evaluation, so charge accordingly.
4902  * (Sub-selects that can be executed as InitPlans have already been
4903  * removed from the expression.)
4904  */
4905  SubPlan *subplan = (SubPlan *) node;
4906 
4907  context->total.startup += subplan->startup_cost;
4908  context->total.per_tuple += subplan->per_call_cost;
4909 
4910  /*
4911  * We don't want to recurse into the testexpr, because it was already
4912  * counted in the SubPlan node's costs. So we're done.
4913  */
4914  return false;
4915  }
4916  else if (IsA(node, AlternativeSubPlan))
4917  {
4918  /*
4919  * Arbitrarily use the first alternative plan for costing. (We should
4920  * certainly only include one alternative, and we don't yet have
4921  * enough information to know which one the executor is most likely to
4922  * use.)
4923  */
4924  AlternativeSubPlan *asplan = (AlternativeSubPlan *) node;
4925 
4926  return cost_qual_eval_walker((Node *) linitial(asplan->subplans),
4927  context);
4928  }
4929  else if (IsA(node, PlaceHolderVar))
4930  {
4931  /*
4932  * A PlaceHolderVar should be given cost zero when considering general
4933  * expression evaluation costs. The expense of doing the contained
4934  * expression is charged as part of the tlist eval costs of the scan
4935  * or join where the PHV is first computed (see set_rel_width and
4936  * add_placeholders_to_joinrel). If we charged it again here, we'd be
4937  * double-counting the cost for each level of plan that the PHV
4938  * bubbles up through. Hence, return without recursing into the
4939  * phexpr.
4940  */
4941  return false;
4942  }
4943 
4944  /* recurse into children */
4945  return expression_tree_walker(node, cost_qual_eval_walker,
4946  (void *) context);
4947 }
4948 
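
To make the ScalarArrayOpExpr branch above concrete, here is a small standalone sketch (illustrative values only, not costsize.c code) contrasting the linear estimate, which charges the comparison operator against about half of the array elements per input row, with the hashed estimate, which pays a one-time table build plus one hash and one comparison per input row:

#include <stdio.h>

int main(void)
{
    double cpu_operator_cost = 0.0025;   /* default GUC value */
    double per_cmp = cpu_operator_cost;  /* assumed comparison-operator cost */
    double per_hash = cpu_operator_cost; /* assumed hash-function cost */
    double array_len = 100.0;            /* assumed array-length estimate */

    /* non-hashed: operator applied to ~half the elements per input row */
    double linear_per_tuple = per_cmp * array_len * 0.5;

    /* hashed: build the table once, then one hash + one comparison per row */
    double hashed_startup = array_len * per_hash;
    double hashed_per_tuple = per_hash + per_cmp;

    printf("linear per-tuple %.4f; hashed startup %.4f per-tuple %.4f\n",
           linear_per_tuple, hashed_startup, hashed_per_tuple);
    return 0;
}
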
4949 /*
4950  * get_restriction_qual_cost
4951  * Compute evaluation costs of a baserel's restriction quals, plus any
4952  * movable join quals that have been pushed down to the scan.
4953  * Results are returned into *qpqual_cost.
4954  *
4955  * This is a convenience subroutine that works for seqscans and other cases
4956  * where all the given quals will be evaluated the hard way. It's not useful
4957  * for cost_index(), for example, where the index machinery takes care of
4958  * some of the quals. We assume baserestrictcost was previously set by
4959  * set_baserel_size_estimates().
4960  */
4961 static void
4962 get_restriction_qual_cost(PlannerInfo *root, RelOptInfo *baserel,
4963  ParamPathInfo *param_info,
4964  QualCost *qpqual_cost)
4965 {
4966  if (param_info)
4967  {
4968  /* Include costs of pushed-down clauses */
4969  cost_qual_eval(qpqual_cost, param_info->ppi_clauses, root);
4970 
4971  qpqual_cost->startup += baserel->baserestrictcost.startup;
4972  qpqual_cost->per_tuple += baserel->baserestrictcost.per_tuple;
4973  }
4974  else
4975  *qpqual_cost = baserel->baserestrictcost;
4976 }
4977 
4978 
4979 /*
4980  * compute_semi_anti_join_factors
4981  * Estimate how much of the inner input a SEMI, ANTI, or inner_unique join
4982  * can be expected to scan.
4983  *
4984  * In a hash or nestloop SEMI/ANTI join, the executor will stop scanning
4985  * inner rows as soon as it finds a match to the current outer row.
4986  * The same happens if we have detected the inner rel is unique.
4987  * We should therefore adjust some of the cost components for this effect.
4988  * This function computes some estimates needed for these adjustments.
4989  * These estimates will be the same regardless of the particular paths used
4990  * for the outer and inner relation, so we compute these once and then pass
4991  * them to all the join cost estimation functions.
4992  *
4993  * Input parameters:
4994  * joinrel: join relation under consideration
4995  * outerrel: outer relation under consideration
4996  * innerrel: inner relation under consideration
4997  * jointype: if not JOIN_SEMI or JOIN_ANTI, we assume it's inner_unique
4998  * sjinfo: SpecialJoinInfo relevant to this join
4999  * restrictlist: join quals
5000  * Output parameters:
5001  * *semifactors is filled in (see pathnodes.h for field definitions)
5002  */
5003 void
5004 compute_semi_anti_join_factors(PlannerInfo *root,
5005  RelOptInfo *joinrel,
5006  RelOptInfo *outerrel,
5007  RelOptInfo *innerrel,
5008  JoinType jointype,
5009  SpecialJoinInfo *sjinfo,
5010  List *restrictlist,
5011  SemiAntiJoinFactors *semifactors)
5012 {
5013  Selectivity jselec;
5014  Selectivity nselec;
5015  Selectivity avgmatch;
5016  SpecialJoinInfo norm_sjinfo;
5017  List *joinquals;
5018  ListCell *l;
5019 
5020  /*
5021  * In an ANTI join, we must ignore clauses that are "pushed down", since
5022  * those won't affect the match logic. In a SEMI join, we do not
5023  * distinguish joinquals from "pushed down" quals, so just use the whole
5024  * restrictinfo list. For other outer join types, we should consider only
5025  * non-pushed-down quals, so that this devolves to an IS_OUTER_JOIN check.
5026  */
5027  if (IS_OUTER_JOIN(jointype))
5028  {
5029  joinquals = NIL;
5030  foreach(l, restrictlist)
5031  {
5032  RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
5033 
5034  if (!RINFO_IS_PUSHED_DOWN(rinfo, joinrel->relids))
5035  joinquals = lappend(joinquals, rinfo);
5036  }
5037  }
5038  else
5039  joinquals = restrictlist;
5040 
5041  /*
5042  * Get the JOIN_SEMI or JOIN_ANTI selectivity of the join clauses.
5043  */
5044  jselec = clauselist_selectivity(root,
5045  joinquals,
5046  0,
5047  (jointype == JOIN_ANTI) ? JOIN_ANTI : JOIN_SEMI,
5048  sjinfo);
5049 
5050  /*
5051  * Also get the normal inner-join selectivity of the join clauses.
5052  */
5053  norm_sjinfo.type = T_SpecialJoinInfo;
5054  norm_sjinfo.min_lefthand = outerrel->relids;
5055  norm_sjinfo.min_righthand = innerrel->relids;
5056  norm_sjinfo.syn_lefthand = outerrel->relids;
5057  norm_sjinfo.syn_righthand = innerrel->relids;
5058  norm_sjinfo.jointype = JOIN_INNER;
5059  norm_sjinfo.ojrelid = 0;
5060  norm_sjinfo.commute_above_l = NULL;
5061  norm_sjinfo.commute_above_r = NULL;
5062  norm_sjinfo.commute_below_l = NULL;
5063  norm_sjinfo.commute_below_r = NULL;
5064  /* we don't bother trying to make the remaining fields valid */
5065  norm_sjinfo.lhs_strict = false;
5066  norm_sjinfo.semi_can_btree = false;
5067  norm_sjinfo.semi_can_hash = false;
5068  norm_sjinfo.semi_operators = NIL;
5069  norm_sjinfo.semi_rhs_exprs = NIL;
5070 
5071  nselec = clauselist_selectivity(root,
5072  joinquals,
5073  0,
5074  JOIN_INNER,
5075  &norm_sjinfo);
5076 
5077  /* Avoid leaking a lot of ListCells */
5078  if (IS_OUTER_JOIN(jointype))
5079  list_free(joinquals);
5080 
5081  /*
5082  * jselec can be interpreted as the fraction of outer-rel rows that have
5083  * any matches (this is true for both SEMI and ANTI cases). And nselec is
5084  * the fraction of the Cartesian product that matches. So, the average
5085  * number of matches for each outer-rel row that has at least one match is
5086  * nselec * inner_rows / jselec.
5087  *
5088  * Note: it is correct to use the inner rel's "rows" count here, even
5089  * though we might later be considering a parameterized inner path with
5090  * fewer rows. This is because we have included all the join clauses in
5091  * the selectivity estimate.
5092  */
5093  if (jselec > 0) /* protect against zero divide */
5094  {
5095  avgmatch = nselec * innerrel->rows / jselec;
5096  /* Clamp to sane range */
5097  avgmatch = Max(1.0, avgmatch);
5098  }
5099  else
5100  avgmatch = 1.0;
5101 
5102  semifactors->outer_match_frac = jselec;
5103  semifactors->match_count = avgmatch;
5104 }
5105 
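
A small standalone sketch of the arithmetic above, using assumed selectivities and row counts rather than real planner estimates: outer_match_frac is just jselec, and match_count is nselec * inner_rows / jselec, clamped to at least 1.

#include <stdio.h>

int main(void)
{
    double jselec = 0.20;       /* assumed SEMI-join selectivity */
    double nselec = 0.002;      /* assumed inner-join selectivity */
    double inner_rows = 1000.0; /* assumed inner rel row count */

    /* fraction of outer rows that have at least one match */
    double outer_match_frac = jselec;

    /* average matches per outer row that has any match */
    double avgmatch = nselec * inner_rows / jselec;
    if (avgmatch < 1.0)
        avgmatch = 1.0;

    printf("outer_match_frac=%.3f match_count=%.3f\n",
           outer_match_frac, avgmatch);
    return 0;
}
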
5106 /*
5107  * has_indexed_join_quals
5108  * Check whether all the joinquals of a nestloop join are used as
5109  * inner index quals.
5110  *
5111  * If the inner path of a SEMI/ANTI join is an indexscan (including bitmap
5112  * indexscan) that uses all the joinquals as indexquals, we can assume that an
5113  * unmatched outer tuple is cheap to process, whereas otherwise it's probably
5114  * expensive.
5115  */
5116 static bool
5117 has_indexed_join_quals(NestPath *path)
5118 {
5119  JoinPath *joinpath = &path->jpath;
5120  Relids joinrelids = joinpath->path.parent->relids;
5121  Path *innerpath = joinpath->innerjoinpath;
5122  List *indexclauses;
5123  bool found_one;
5124  ListCell *lc;
5125 
5126  /* If join still has quals to evaluate, it's not fast */
5127  if (joinpath->joinrestrictinfo != NIL)
5128  return false;
5129  /* Nor if the inner path isn't parameterized at all */
5130  if (innerpath->param_info == NULL)
5131  return false;
5132 
5133  /* Find the indexclauses list for the inner scan */
5134  switch (innerpath->pathtype)
5135  {
5136  case T_IndexScan:
5137  case T_IndexOnlyScan:
5138  indexclauses = ((IndexPath *) innerpath)->indexclauses;
5139  break;
5140  case T_BitmapHeapScan:
5141  {
5142  /* Accept only a simple bitmap scan, not AND/OR cases */
5143  Path *bmqual = ((BitmapHeapPath *) innerpath)->bitmapqual;
5144 
5145  if (IsA(bmqual, IndexPath))
5146  indexclauses = ((IndexPath *) bmqual)->indexclauses;
5147  else
5148  return false;
5149  break;
5150  }
5151  default:
5152 
5153  /*
5154  * If it's not a simple indexscan, it probably doesn't run quickly
5155  * for zero rows out, even if it's a parameterized path using all
5156  * the joinquals.
5157  */
5158  return false;
5159  }
5160 
5161  /*
5162  * Examine the inner path's param clauses. Any that are from the outer
5163  * path must be found in the indexclauses list, either exactly or in an
5164  * equivalent form generated by equivclass.c. Also, we must find at least
5165  * one such clause, else it's a clauseless join which isn't fast.
5166  */
5167  found_one = false;
5168  foreach(lc, innerpath->param_info->ppi_clauses)
5169  {
5170  RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc);
5171 
5172  if (join_clause_is_movable_into(rinfo,
5173  innerpath->parent->relids,
5174  joinrelids))
5175  {
5176  if (!is_redundant_with_indexclauses(rinfo, indexclauses))
5177  return false;
5178  found_one = true;
5179  }
5180  }
5181  return found_one;
5182 }
5183 
5184 
5185 /*
5186  * approx_tuple_count
5187  * Quick-and-dirty estimation of the number of join rows passing
5188  * a set of qual conditions.
5189  *
5190  * The quals can be either an implicitly-ANDed list of boolean expressions,
5191  * or a list of RestrictInfo nodes (typically the latter).
5192  *
5193  * We intentionally compute the selectivity under JOIN_INNER rules, even
5194  * if it's some type of outer join. This is appropriate because we are
5195  * trying to figure out how many tuples pass the initial merge or hash
5196  * join step.
5197  *
5198  * This is quick-and-dirty because we bypass clauselist_selectivity, and
5199  * simply multiply the independent clause selectivities together. Now
5200  * clauselist_selectivity often can't do any better than that anyhow, but
5201  * for some situations (such as range constraints) it is smarter. However,
5202  * we can't effectively cache the results of clauselist_selectivity, whereas
5203  * the individual clause selectivities can be and are cached.
5204  *
5205  * Since we are only using the results to estimate how many potential
5206  * output tuples are generated and passed through qpqual checking, it
5207  * seems OK to live with the approximation.
5208  */
5209 static double
5210 approx_tuple_count(PlannerInfo *root, JoinPath *path, List *quals)
5211 {
5212  double tuples;
5213  double outer_tuples = path->outerjoinpath->rows;
5214  double inner_tuples = path->innerjoinpath->rows;
5215  SpecialJoinInfo sjinfo;
5216  Selectivity selec = 1.0;
5217  ListCell *l;
5218 
5219  /*
5220  * Make up a SpecialJoinInfo for JOIN_INNER semantics.
5221  */
5222  sjinfo.type = T_SpecialJoinInfo;
5223  sjinfo.min_lefthand = path->outerjoinpath->parent->relids;
5224  sjinfo.min_righthand = path->innerjoinpath->parent->relids;
5225  sjinfo.syn_lefthand = path->outerjoinpath->parent->relids;
5226  sjinfo.syn_righthand = path->innerjoinpath->parent->relids;
5227  sjinfo.jointype = JOIN_INNER;
5228  sjinfo.ojrelid = 0;
5229  sjinfo.commute_above_l = NULL;
5230  sjinfo.commute_above_r = NULL;
5231  sjinfo.commute_below_l = NULL;
5232  sjinfo.commute_below_r = NULL;
5233  /* we don't bother trying to make the remaining fields valid */
5234  sjinfo.lhs_strict = false;
5235  sjinfo.semi_can_btree = false;
5236  sjinfo.semi_can_hash = false;
5237  sjinfo.semi_operators = NIL;
5238  sjinfo.semi_rhs_exprs = NIL;
5239 
5240  /* Get the approximate selectivity */
5241  foreach(l, quals)
5242  {
5243  Node *qual = (Node *) lfirst(l);
5244 
5245  /* Note that clause_selectivity will be able to cache its result */
5246  selec *= clause_selectivity(root, qual, 0, JOIN_INNER, &sjinfo);
5247  }
5248 
5249  /* Apply it to the input relation sizes */
5250  tuples = selec * outer_tuples * inner_tuples;
5251 
5252  return clamp_row_est(tuples);
5253 }
5254 
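
The following standalone sketch (assumed selectivities and row counts, not costsize.c code) shows the quick-and-dirty estimate above: multiply the independent clause selectivities together, scale by the size of the Cartesian product, and round/clamp in the style of clamp_row_est.

#include <math.h>
#include <stdio.h>

int main(void)
{
    double outer_tuples = 10000.0;        /* assumed outer path rows */
    double inner_tuples = 5000.0;         /* assumed inner path rows */
    double clause_selec[] = {0.10, 0.05}; /* assumed per-clause selectivities */
    double selec = 1.0;
    double tuples;

    for (int i = 0; i < 2; i++)
        selec *= clause_selec[i]; /* treat the clauses as independent */

    tuples = selec * outer_tuples * inner_tuples;

    /* clamp_row_est-style result: at least one row, integral value */
    tuples = (tuples <= 1.0) ? 1.0 : rint(tuples);

    printf("approx join tuples: %.0f\n", tuples);
    return 0;
}
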
5255 
5256 /*
5257  * set_baserel_size_estimates
5258  * Set the size estimates for the given base relation.
5259  *
5260  * The rel's targetlist and restrictinfo list must have been constructed
5261  * already, and rel->tuples must be set.
5262  *
5263  * We set the following fields of the rel node:
5264  * rows: the estimated number of output tuples (after applying
5265  * restriction clauses).
5266  * width: the estimated average output tuple width in bytes.
5267  * baserestrictcost: estimated cost of evaluating baserestrictinfo clauses.
5268  */
5269 void
5270 set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5271 {
5272  double nrows;
5273 
5274  /* Should only be applied to base relations */
5275  Assert(rel->relid > 0);
5276 
5277  nrows = rel->tuples *
5278  clauselist_selectivity(root,
5279  rel->baserestrictinfo,
5280  0,
5281  JOIN_INNER,
5282  NULL);
5283 
5284  rel->rows = clamp_row_est(nrows);
5285 
5286  cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root);
5287 
5288  set_rel_width(root, rel);
5289 }
5290 
5291 /*
5292  * get_parameterized_baserel_size
5293  * Make a size estimate for a parameterized scan of a base relation.
5294  *
5295  * 'param_clauses' lists the additional join clauses to be used.
5296  *
5297  * set_baserel_size_estimates must have been applied already.
5298  */
5299 double
5300 get_parameterized_baserel_size(PlannerInfo *root, RelOptInfo *rel,
5301  List *param_clauses)
5302 {
5303  List *allclauses;
5304  double nrows;
5305 
5306  /*
5307  * Estimate the number of rows returned by the parameterized scan, knowing
5308  * that it will apply all the extra join clauses as well as the rel's own
5309  * restriction clauses. Note that we force the clauses to be treated as
5310  * non-join clauses during selectivity estimation.
5311  */
5312  allclauses = list_concat_copy(param_clauses, rel->baserestrictinfo);
5313  nrows = rel->tuples *
5314  clauselist_selectivity(root,
5315  allclauses,
5316  rel->relid, /* do not use 0! */
5317  JOIN_INNER,
5318  NULL);
5319  nrows = clamp_row_est(nrows);
5320  /* For safety, make sure result is not more than the base estimate */
5321  if (nrows > rel->rows)
5322  nrows = rel->rows;
5323  return nrows;
5324 }
5325 
5326 /*
5327  * set_joinrel_size_estimates
5328  * Set the size estimates for the given join relation.
5329  *
5330  * The rel's targetlist must have been constructed already, and a
5331  * restriction clause list that matches the given component rels must
5332  * be provided.
5333  *
5334  * Since there is more than one way to make a joinrel for more than two
5335  * base relations, the results we get here could depend on which component
5336  * rel pair is provided. In theory we should get the same answers no matter
5337  * which pair is provided; in practice, since the selectivity estimation
5338  * routines don't handle all cases equally well, we might not. But there's
5339  * not much to be done about it. (Would it make sense to repeat the
5340  * calculations for each pair of input rels that's encountered, and somehow
5341  * average the results? Probably way more trouble than it's worth, and
5342  * anyway we must keep the rowcount estimate the same for all paths for the
5343  * joinrel.)
5344  *
5345  * We set only the rows field here. The reltarget field was already set by
5346  * build_joinrel_tlist, and baserestrictcost is not used for join rels.
5347  */
5348 void
5349 set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel,
5350  RelOptInfo *outer_rel,
5351  RelOptInfo *inner_rel,
5352  SpecialJoinInfo *sjinfo,
5353  List *restrictlist)
5354 {
5355  rel->rows = calc_joinrel_size_estimate(root,
5356  rel,
5357  outer_rel,
5358  inner_rel,
5359  outer_rel->rows,
5360  inner_rel->rows,
5361  sjinfo,
5362  restrictlist);
5363 }
5364 
5365 /*
5366  * get_parameterized_joinrel_size
5367  * Make a size estimate for a parameterized scan of a join relation.
5368  *
5369  * 'rel' is the joinrel under consideration.
5370  * 'outer_path', 'inner_path' are (probably also parameterized) Paths that
5371  * produce the relations being joined.
5372  * 'sjinfo' is any SpecialJoinInfo relevant to this join.
5373  * 'restrict_clauses' lists the join clauses that need to be applied at the
5374  * join node (including any movable clauses that were moved down to this join,
5375  * and not including any movable clauses that were pushed down into the
5376  * child paths).
5377  *
5378  * set_joinrel_size_estimates must have been applied already.
5379  */
5380 double
5381 get_parameterized_joinrel_size(PlannerInfo *root, RelOptInfo *rel,
5382  Path *outer_path,
5383  Path *inner_path,
5384  SpecialJoinInfo *sjinfo,
5385  List *restrict_clauses)
5386 {
5387  double nrows;
5388 
5389  /*
5390  * Estimate the number of rows returned by the parameterized join as the
5391  * sizes of the input paths times the selectivity of the clauses that have
5392  * ended up at this join node.
5393  *
5394  * As with set_joinrel_size_estimates, the rowcount estimate could depend
5395  * on the pair of input paths provided, though ideally we'd get the same
5396  * estimate for any pair with the same parameterization.
5397  */
5398  nrows = calc_joinrel_size_estimate(root,
5399  rel,
5400  outer_path->parent,
5401  inner_path->parent,
5402  outer_path->rows,
5403  inner_path->rows,
5404  sjinfo,
5405  restrict_clauses);
5406  /* For safety, make sure result is not more than the base estimate */
5407  if (nrows > rel->rows)
5408  nrows = rel->rows;
5409  return nrows;
5410 }
5411 
5412 /*
5413  * calc_joinrel_size_estimate
5414  * Workhorse for set_joinrel_size_estimates and
5415  * get_parameterized_joinrel_size.
5416  *
5417  * outer_rel/inner_rel are the relations being joined, but they should be
5418  * assumed to have sizes outer_rows/inner_rows; those numbers might be less
5419  * than what rel->rows says, when we are considering parameterized paths.
5420  */
5421 static double
5422 calc_joinrel_size_estimate(PlannerInfo *root,
5423  RelOptInfo *joinrel,
5424  RelOptInfo *outer_rel,
5425  RelOptInfo *inner_rel,
5426  double outer_rows,
5427  double inner_rows,
5428  SpecialJoinInfo *sjinfo,
5429  List *restrictlist)
5430 {
5431  JoinType jointype = sjinfo->jointype;
5432  Selectivity fkselec;
5433  Selectivity jselec;
5434  Selectivity pselec;
5435  double nrows;
5436 
5437  /*
5438  * Compute joinclause selectivity. Note that we are only considering
5439  * clauses that become restriction clauses at this join level; we are not
5440  * double-counting them because they were not considered in estimating the
5441  * sizes of the component rels.
5442  *
5443  * First, see whether any of the joinclauses can be matched to known FK
5444  * constraints. If so, drop those clauses from the restrictlist, and
5445  * instead estimate their selectivity using FK semantics. (We do this
5446  * without regard to whether said clauses are local or "pushed down".
5447  * Probably, an FK-matching clause could never be seen as pushed down at
5448  * an outer join, since it would be strict and hence would be grounds for
5449  * join strength reduction.) fkselec gets the net selectivity for
5450  * FK-matching clauses, or 1.0 if there are none.
5451  */
5452  fkselec = get_foreign_key_join_selectivity(root,
5453  outer_rel->relids,
5454  inner_rel->relids,
5455  sjinfo,
5456  &restrictlist);
5457 
5458  /*
5459  * For an outer join, we have to distinguish the selectivity of the join's
5460  * own clauses (JOIN/ON conditions) from any clauses that were "pushed
5461  * down". For inner joins we just count them all as joinclauses.
5462  */
5463  if (IS_OUTER_JOIN(jointype))
5464  {
5465  List *joinquals = NIL;
5466  List *pushedquals = NIL;
5467  ListCell *l;
5468 
5469  /* Grovel through the clauses to separate into two lists */
5470  foreach(l, restrictlist)
5471  {
5472  RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
5473 
5474  if (RINFO_IS_PUSHED_DOWN(rinfo, joinrel->relids))
5475  pushedquals = lappend(pushedquals, rinfo);
5476  else
5477  joinquals = lappend(joinquals, rinfo);
5478  }
5479 
5480  /* Get the separate selectivities */
5481  jselec = clauselist_selectivity(root,
5482  joinquals,
5483  0,
5484  jointype,
5485  sjinfo);
5486  pselec = clauselist_selectivity(root,
5487  pushedquals,
5488  0,
5489  jointype,
5490  sjinfo);
5491 
5492  /* Avoid leaking a lot of ListCells */
5493  list_free(joinquals);
5494  list_free(pushedquals);
5495  }
5496  else
5497  {
5498  jselec = clauselist_selectivity(root,
5499  restrictlist,
5500  0,
5501  jointype,
5502  sjinfo);
5503  pselec = 0.0; /* not used, keep compiler quiet */
5504  }
5505 
5506  /*
5507  * Basically, we multiply size of Cartesian product by selectivity.
5508  *
5509  * If we are doing an outer join, take that into account: the joinqual
5510  * selectivity has to be clamped using the knowledge that the output must
5511  * be at least as large as the non-nullable input. However, any
5512  * pushed-down quals are applied after the outer join, so their
5513  * selectivity applies fully.
5514  *
5515  * For JOIN_SEMI and JOIN_ANTI, the selectivity is defined as the fraction
5516  * of LHS rows that have matches, and we apply that straightforwardly.
5517  */
5518  switch (jointype)
5519  {
5520  case JOIN_INNER:
5521  nrows = outer_rows * inner_rows * fkselec * jselec;
5522  /* pselec not used */
5523  break;
5524  case JOIN_LEFT:
5525  nrows = outer_rows * inner_rows * fkselec * jselec;
5526  if (nrows < outer_rows)
5527  nrows = outer_rows;
5528  nrows *= pselec;
5529  break;
5530  case JOIN_FULL:
5531  nrows = outer_rows * inner_rows * fkselec * jselec;
5532  if (nrows < outer_rows)
5533  nrows = outer_rows;
5534  if (nrows < inner_rows)
5535  nrows = inner_rows;
5536  nrows *= pselec;
5537  break;
5538  case JOIN_SEMI:
5539  nrows = outer_rows * fkselec * jselec;
5540  /* pselec not used */
5541  break;
5542  case JOIN_ANTI:
5543  nrows = outer_rows * (1.0 - fkselec * jselec);
5544  nrows *= pselec;
5545  break;
5546  default:
5547  /* other values not expected here */
5548  elog(ERROR, "unrecognized join type: %d", (int) jointype);
5549  nrows = 0; /* keep compiler quiet */
5550  break;
5551  }
5552 
5553  return clamp_row_est(nrows);
5554 }
5555 
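
As a worked example of the clamping logic above, this standalone sketch (assumed selectivities, not costsize.c code) traces the JOIN_LEFT branch: the joinqual estimate is raised to the outer row count before the pushed-down selectivity is applied.

#include <stdio.h>

int main(void)
{
    double outer_rows = 1000.0;
    double inner_rows = 200.0;
    double fkselec = 1.0;   /* no FK-matched clauses */
    double jselec = 0.0005; /* assumed JOIN/ON clause selectivity */
    double pselec = 0.30;   /* assumed pushed-down clause selectivity */

    /* JOIN_LEFT: the joinqual result can't be smaller than the outer input;
     * pushed-down quals then filter the post-join result */
    double nrows = outer_rows * inner_rows * fkselec * jselec; /* 100 */
    if (nrows < outer_rows)
        nrows = outer_rows;                                    /* 1000 */
    nrows *= pselec;                                           /* 300 */

    printf("estimated LEFT join rows: %.0f\n", nrows);
    return 0;
}
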
5556 /*
5557  * get_foreign_key_join_selectivity
5558  * Estimate join selectivity for foreign-key-related clauses.
5559  *
5560  * Remove any clauses that can be matched to FK constraints from *restrictlist,
5561  * and return a substitute estimate of their selectivity. 1.0 is returned
5562  * when there are no such clauses.
5563  *
5564  * The reason for treating such clauses specially is that we can get better
5565  * estimates this way than by relying on clauselist_selectivity(), especially
5566  * for multi-column FKs where that function's assumption that the clauses are
5567  * independent falls down badly. But even with single-column FKs, we may be
5568  * able to get a better answer when the pg_statistic stats are missing or out
5569  * of date.
5570  */
5571 static Selectivity
5572 get_foreign_key_join_selectivity(PlannerInfo *root,
5573  Relids outer_relids,
5574  Relids inner_relids,
5575  SpecialJoinInfo *sjinfo,
5576  List **restrictlist)
5577 {
5578  Selectivity fkselec = 1.0;
5579  JoinType jointype = sjinfo->jointype;
5580  List *worklist = *restrictlist;
5581  ListCell *lc;
5582 
5583  /* Consider each FK constraint that is known to match the query */
5584  foreach(lc, root->fkey_list)
5585  {
5586  ForeignKeyOptInfo *fkinfo = (ForeignKeyOptInfo *) lfirst(lc);
5587  bool ref_is_outer;
5588  List *removedlist;
5589  ListCell *cell;
5590 
5591  /*
5592  * This FK is not relevant unless it connects a baserel on one side of
5593  * this join to a baserel on the other side.
5594  */
5595  if (bms_is_member(fkinfo->con_relid, outer_relids) &&
5596  bms_is_member(fkinfo->ref_relid, inner_relids))
5597  ref_is_outer = false;
5598  else if (bms_is_member(fkinfo->ref_relid, outer_relids) &&
5599  bms_is_member(fkinfo->con_relid, inner_relids))
5600  ref_is_outer = true;
5601  else
5602  continue;
5603 
5604  /*
5605  * If we're dealing with a semi/anti join, and the FK's referenced
5606  * relation is on the outside, then knowledge of the FK doesn't help
5607  * us figure out what we need to know (which is the fraction of outer
5608  * rows that have matches). On the other hand, if the referenced rel
5609  * is on the inside, then all outer rows must have matches in the
5610  * referenced table (ignoring nulls). But any restriction or join
5611  * clauses that filter that table will reduce the fraction of matches.
5612  * We can account for restriction clauses, but it's too hard to guess
5613  * how many table rows would get through a join that's inside the RHS.
5614  * Hence, if either case applies, punt and ignore the FK.
5615  */
5616  if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI) &&
5617  (ref_is_outer || bms_membership(inner_relids) != BMS_SINGLETON))
5618  continue;
5619 
5620  /*
5621  * Modify the restrictlist by removing clauses that match the FK (and
5622  * putting them into removedlist instead). It seems unsafe to modify
5623  * the originally-passed List structure, so we make a shallow copy the
5624  * first time through.
5625  */
5626  if (worklist == *restrictlist)
5627  worklist = list_copy(worklist);
5628 
5629  removedlist = NIL;
5630  foreach(cell, worklist)
5631  {
5632  RestrictInfo *rinfo = (RestrictInfo *) lfirst(cell);
5633  bool remove_it = false;
5634  int i;
5635 
5636  /* Drop this clause if it matches any column of the FK */
5637  for (i = 0; i < fkinfo->nkeys; i++)
5638  {
5639  if (rinfo->parent_ec)
5640  {
5641  /*
5642  * EC-derived clauses can only match by EC. It is okay to
5643  * consider any clause derived from the same EC as
5644  * matching the FK: even if equivclass.c chose to generate
5645  * a clause equating some other pair of Vars, it could
5646  * have generated one equating the FK's Vars. So for
5647  * purposes of estimation, we can act as though it did so.
5648  *
5649  * Note: checking parent_ec is a bit of a cheat because
5650  * there are EC-derived clauses that don't have parent_ec
5651  * set; but such clauses must compare expressions that
5652  * aren't just Vars, so they cannot match the FK anyway.
5653  */
5654  if (fkinfo->eclass[i] == rinfo->parent_ec)
5655  {
5656  remove_it = true;
5657  break;
5658  }
5659  }
5660  else
5661  {
5662  /*
5663  * Otherwise, see if rinfo was previously matched to FK as
5664  * a "loose" clause.
5665  */
5666  if (list_member_ptr(fkinfo->rinfos[i], rinfo))
5667  {
5668  remove_it = true;
5669  break;
5670  }
5671  }
5672  }
5673  if (remove_it)
5674  {
5675  worklist = foreach_delete_current(worklist, cell);
5676  removedlist = lappend(removedlist, rinfo);
5677  }
5678  }
5679 
5680  /*
5681  * If we failed to remove all the matching clauses we expected to
5682  * find, chicken out and ignore this FK; applying its selectivity
5683  * might result in double-counting. Put any clauses we did manage to
5684  * remove back into the worklist.
5685  *
5686  * Since the matching clauses are known not outerjoin-delayed, they
5687  * would normally have appeared in the initial joinclause list. If we
5688  * didn't find them, there are two possibilities:
5689  *
5690  * 1. If the FK match is based on an EC that is ec_has_const, it won't
5691  * have generated any join clauses at all. We discount such ECs while
5692  * checking to see if we have "all" the clauses. (Below, we'll adjust
5693  * the selectivity estimate for this case.)
5694  *
5695  * 2. The clauses were matched to some other FK in a previous
5696  * iteration of this loop, and thus removed from worklist. (A likely
5697  * case is that two FKs are matched to the same EC; there will be only
5698  * one EC-derived clause in the initial list, so the first FK will
5699  * consume it.) Applying both FKs' selectivity independently risks
5700  * underestimating the join size; in particular, this would undo one
5701  * of the main things that ECs were invented for, namely to avoid
5702  * double-counting the selectivity of redundant equality conditions.
5703  * Later we might think of a reasonable way to combine the estimates,
5704  * but for now, just punt, since this is a fairly uncommon situation.
5705  */
5706  if (removedlist == NIL ||
5707  list_length(removedlist) !=
5708  (fkinfo->nmatched_ec - fkinfo->nconst_ec + fkinfo->nmatched_ri))
5709  {
5710  worklist = list_concat(worklist, removedlist);
5711  continue;
5712  }
5713 
5714  /*
5715  * Finally we get to the payoff: estimate selectivity using the
5716  * knowledge that each referencing row will match exactly one row in
5717  * the referenced table.
5718  *
5719  * XXX that's not true in the presence of nulls in the referencing
5720  * column(s), so in principle we should derate the estimate for those.
5721  * However (1) if there are any strict restriction clauses for the
5722  * referencing column(s) elsewhere in the query, derating here would
5723  * be double-counting the null fraction, and (2) it's not very clear
5724  * how to combine null fractions for multiple referencing columns. So
5725  * we do nothing for now about correcting for nulls.
5726  *
5727  * XXX another point here is that if either side of an FK constraint
5728  * is an inheritance parent, we estimate as though the constraint
5729  * covers all its children as well. This is not an unreasonable
5730  * assumption for a referencing table, ie the user probably applied
5731  * identical constraints to all child tables (though perhaps we ought
5732  * to check that). But it's not possible to have done that for a
5733  * referenced table. Fortunately, precisely because that doesn't
5734  * work, it is uncommon in practice to have an FK referencing a parent
5735  * table. So, at least for now, disregard inheritance here.
5736  */
5737  if (jointype == JOIN_SEMI || jointype == JOIN_ANTI)
5738  {
5739  /*
5740  * For JOIN_SEMI and JOIN_ANTI, we only get here when the FK's
5741  * referenced table is exactly the inside of the join. The join
5742  * selectivity is defined as the fraction of LHS rows that have
5743  * matches. The FK implies that every LHS row has a match *in the
5744  * referenced table*; but any restriction clauses on it will
5745  * reduce the number of matches. Hence we take the join
5746  * selectivity as equal to the selectivity of the table's
5747  * restriction clauses, which is rows / tuples; but we must guard
5748  * against tuples == 0.
5749  */
5750  RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid);
5751  double ref_tuples = Max(ref_rel->tuples, 1.0);
5752 
5753  fkselec *= ref_rel->rows / ref_tuples;
5754  }
5755  else
5756  {
5757  /*
5758  * Otherwise, selectivity is exactly 1/referenced-table-size; but
5759  * guard against tuples == 0. Note we should use the raw table
5760  * tuple count, not any estimate of its filtered or joined size.
5761  */
5762  RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid);
5763  double ref_tuples = Max(ref_rel->tuples, 1.0);
5764 
5765  fkselec *= 1.0 / ref_tuples;
5766  }
5767 
5768  /*
5769  * If any of the FK columns participated in ec_has_const ECs, then
5770  * equivclass.c will have generated "var = const" restrictions for
5771  * each side of the join, thus reducing the sizes of both input
5772  * relations. Taking the fkselec at face value would amount to
5773  * double-counting the selectivity of the constant restriction for the
5774  * referencing Var. Hence, look for the restriction clause(s) that
5775  * were applied to the referencing Var(s), and divide out their
5776  * selectivity to correct for this.
5777  */
5778  if (fkinfo->nconst_ec > 0)
5779  {
5780  for (int i = 0; i < fkinfo->nkeys; i++)
5781  {
5782  EquivalenceClass *ec = fkinfo->eclass[i];
5783 
5784  if (ec && ec->ec_has_const)
5785  {
5786  EquivalenceMember *em = fkinfo->fk_eclass_member[i];
5787  RestrictInfo *rinfo = find_derived_clause_for_ec_member(ec,
5788  em);
5789 
5790  if (rinfo)
5791  {
5792  Selectivity s0;
5793 
5794  s0 = clause_selectivity(root,
5795  (Node *) rinfo,
5796  0,
5797  jointype,
5798  sjinfo);
5799  if (s0 > 0)
5800  fkselec /= s0;
5801  }
5802  }
5803  }
5804  }
5805  }
5806 
5807  *restrictlist = worklist;
5808  CLAMP_PROBABILITY(fkselec);
5809  return fkselec;
5810 }
5811 
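
A standalone sketch of the payoff computation above, using assumed table sizes (not costsize.c code): for a plain join each FK-matched clause contributes 1/referenced-table-size, while for a SEMI/ANTI join with the referenced table on the inside it contributes that table's restriction selectivity, rows/tuples.

#include <stdio.h>

int main(void)
{
    double ref_tuples = 100000.0; /* raw tuple count of the referenced table */
    double ref_rows = 2500.0;     /* referenced rel rows after restrictions */

    /* guard against tuples == 0, as the code above does with Max(..., 1.0) */
    double tuples = (ref_tuples > 1.0) ? ref_tuples : 1.0;

    /* plain join: each referencing row matches exactly one referenced row */
    double fkselec_join = 1.0 / tuples;

    /* semi/anti join: fraction of outer rows with a surviving match */
    double fkselec_semi = ref_rows / tuples;

    printf("join fkselec=%.6f  semi/anti fkselec=%.4f\n",
           fkselec_join, fkselec_semi);
    return 0;
}
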
5812 /*
5813  * set_subquery_size_estimates
5814  * Set the size estimates for a base relation that is a subquery.
5815  *
5816  * The rel's targetlist and restrictinfo list must have been constructed
5817  * already, and the Paths for the subquery must have been completed.
5818  * We look at the subquery's PlannerInfo to extract data.
5819  *
5820  * We set the same fields as set_baserel_size_estimates.
5821  */
5822 void
5823 set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5824 {
5825  PlannerInfo *subroot = rel->subroot;
5826  RelOptInfo *sub_final_rel;
5827  ListCell *lc;
5828 
5829  /* Should only be applied to base relations that are subqueries */
5830  Assert(rel->relid > 0);
5831  Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_SUBQUERY);
5832 
5833  /*
5834  * Copy raw number of output rows from subquery. All of its paths should
5835  * have the same output rowcount, so just look at cheapest-total.
5836  */
5837  sub_final_rel = fetch_upper_rel(subroot, UPPERREL_FINAL, NULL);
5838  rel->tuples = sub_final_rel->cheapest_total_path->rows;
5839 
5840  /*
5841  * Compute per-output-column width estimates by examining the subquery's
5842  * targetlist. For any output that is a plain Var, get the width estimate
5843  * that was made while planning the subquery. Otherwise, we leave it to
5844  * set_rel_width to fill in a datatype-based default estimate.
5845  */
5846  foreach(lc, subroot->parse->targetList)
5847  {
5848  TargetEntry *te = lfirst_node(TargetEntry, lc);
5849  Node *texpr = (Node *) te->expr;
5850  int32 item_width = 0;
5851 
5852  /* junk columns aren't visible to upper query */
5853  if (te->resjunk)
5854  continue;
5855 
5856  /*
5857  * The subquery could be an expansion of a view that's had columns
5858  * added to it since the current query was parsed, so that there are
5859  * non-junk tlist columns in it that don't correspond to any column
5860  * visible at our query level. Ignore such columns.
5861  */
5862  if (te->resno < rel->min_attr || te->resno > rel->max_attr)
5863  continue;
5864 
5865  /*
5866  * XXX This currently doesn't work for subqueries containing set
5867  * operations, because the Vars in their tlists are bogus references
5868  * to the first leaf subquery, which wouldn't give the right answer
5869  * even if we could still get to its PlannerInfo.
5870  *
5871  * Also, the subquery could be an appendrel for which all branches are
5872  * known empty due to constraint exclusion, in which case
5873  * set_append_rel_pathlist will have left the attr_widths set to zero.
5874  *
5875  * In either case, we just leave the width estimate zero until
5876  * set_rel_width fixes it.
5877  */
5878  if (IsA(texpr, Var) &&
5879  subroot->parse->setOperations == NULL)
5880  {
5881  Var *var = (Var *) texpr;
5882  RelOptInfo *subrel = find_base_rel(subroot, var->varno);
5883 
5884  item_width = subrel->attr_widths[var->varattno - subrel->min_attr];
5885  }
5886  rel->attr_widths[te->resno - rel->min_attr] = item_width;
5887  }
5888 
5889  /* Now estimate number of output rows, etc */
5890  set_baserel_size_estimates(root, rel);
5891 }
5892 
5893 /*
5894  * set_function_size_estimates
5895  * Set the size estimates for a base relation that is a function call.
5896  *
5897  * The rel's targetlist and restrictinfo list must have been constructed
5898  * already.
5899  *
5900  * We set the same fields as set_baserel_size_estimates.
5901  */
5902 void
5903 set_function_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5904 {
5905  RangeTblEntry *rte;
5906  ListCell *lc;
5907 
5908  /* Should only be applied to base relations that are functions */
5909  Assert(rel->relid > 0);
5910  rte = planner_rt_fetch(rel->relid, root);
5911  Assert(rte->rtekind == RTE_FUNCTION);
5912 
5913  /*
5914  * Estimate number of rows the functions will return. The rowcount of the
5915  * node is that of the largest function result.
5916  */
5917  rel->tuples = 0;
5918  foreach(lc, rte->functions)
5919  {
5920  RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc);
5921  double ntup = expression_returns_set_rows(root, rtfunc->funcexpr);
5922 
5923  if (ntup > rel->tuples)
5924  rel->tuples = ntup;
5925  }
5926 
5927  /* Now estimate number of output rows, etc */
5928  set_baserel_size_estimates(root, rel);
5929 }
5930 
5931 /*
5932  * set_tablefunc_size_estimates
5933  * Set the size estimates for a base relation that is a table function call.
5934  *
5935  * The rel's targetlist and restrictinfo list must have been constructed
5936  * already.
5937  *
5938  * We set the same fields as set_baserel_size_estimates.
5939  */
5940 void
5941 set_tablefunc_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5942 {
5943  /* Should only be applied to base relations that are table functions */
5944  Assert(rel->relid > 0);
5945  Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_TABLEFUNC);
5946 
5947  rel->tuples = 100;
5948 
5949  /* Now estimate number of output rows, etc */
5950  set_baserel_size_estimates(root, rel);
5951 }
5952 
5953 /*
5954  * set_values_size_estimates
5955  * Set the size estimates for a base relation that is a values list.
5956  *
5957  * The rel's targetlist and restrictinfo list must have been constructed
5958  * already.
5959  *
5960  * We set the same fields as set_baserel_size_estimates.
5961  */
5962 void
5963 set_values_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5964 {
5965  RangeTblEntry *rte;
5966 
5967  /* Should only be applied to base relations that are values lists */
5968  Assert(rel->relid > 0);
5969  rte = planner_rt_fetch(rel->relid, root);
5970  Assert(rte->rtekind == RTE_VALUES);
5971 
5972  /*
5973  * Estimate number of rows the values list will return. We know this
5974  * precisely based on the list length (well, barring set-returning
5975  * functions in list items, but that's a refinement not catered for
5976  * anywhere else either).
5977  */
5978  rel->tuples = list_length(rte->values_lists);
5979 
5980  /* Now estimate number of output rows, etc */
5981  set_baserel_size_estimates(root, rel);
5982 }
5983 
5984 /*
5985  * set_cte_size_estimates
5986  * Set the size estimates for a base relation that is a CTE reference.
5987  *
5988  * The rel's targetlist and restrictinfo list must have been constructed
5989  * already, and we need an estimate of the number of rows returned by the CTE
5990  * (if a regular CTE) or the non-recursive term (if a self-reference).
5991  *
5992  * We set the same fields as set_baserel_size_estimates.
5993  */
5994 void
5995 set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel, double cte_rows)
5996 {
5997  RangeTblEntry *rte;
5998 
5999  /* Should only be applied to base relations that are CTE references */
6000  Assert(rel->relid > 0);
6001  rte = planner_rt_fetch(rel->relid, root);
6002  Assert(rte->rtekind == RTE_CTE);
6003 
6004  if (rte->self_reference)
6005  {
6006  /*
6007  * In a self-reference, we assume the average worktable size is a
6008  * multiple of the nonrecursive term's size. The best multiplier will
6009  * vary depending on query "fan-out", so make its value adjustable.
6010  */
6011  rel->tuples = clamp_row_est(recursive_worktable_factor * cte_rows);
6012  }
6013  else
6014  {
6015  /* Otherwise just believe the CTE's rowcount estimate */
6016  rel->tuples = cte_rows;
6017  }
6018 
6019  /* Now estimate number of output rows, etc */
6020  set_baserel_size_estimates(root, rel);
6021 }
6022 
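
A standalone sketch of the self-reference branch above (not costsize.c code), using the default recursive_worktable_factor of 10 and an assumed estimate for the non-recursive term:

#include <stdio.h>

int main(void)
{
    double cte_rows = 50.0;                   /* non-recursive term's estimate */
    double recursive_worktable_factor = 10.0; /* default GUC value */

    /* self-referencing CTE scan: assume the average worktable size is a
     * multiple of the non-recursive term's size */
    double worktable_rows = recursive_worktable_factor * cte_rows;

    printf("assumed worktable rows: %.0f\n", worktable_rows);
    return 0;
}
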
6023 /*
6024  * set_namedtuplestore_size_estimates
6025  * Set the size estimates for a base relation that is a tuplestore reference.
6026  *
6027  * The rel's targetlist and restrictinfo list must have been constructed
6028  * already.
6029  *
6030  * We set the same fields as set_baserel_size_estimates.
6031  */
6032 void
6033 set_namedtuplestore_size_estimates(PlannerInfo *root, RelOptInfo *rel)
6034 {
6035  RangeTblEntry *rte;
6036 
6037  /* Should only be applied to base relations that are tuplestore references */
6038  Assert(rel->relid > 0);
6039  rte = planner_rt_fetch(rel->relid, root);
6040  Assert(rte->rtekind ==