tuplesort.c
1 /*-------------------------------------------------------------------------
2  *
3  * tuplesort.c
4  * Generalized tuple sorting routines.
5  *
6  * This module handles sorting of heap tuples, index tuples, or single
7  * Datums (and could easily support other kinds of sortable objects,
8  * if necessary). It works efficiently for both small and large amounts
9  * of data. Small amounts are sorted in-memory using qsort(). Large
10  * amounts are sorted using temporary files and a standard external sort
11  * algorithm.
12  *
13  * See Knuth, volume 3, for more than you want to know about the external
14  * sorting algorithm. Historically, we divided the input into sorted runs
15  * using replacement selection, in the form of a priority tree implemented
16  * as a heap (essentially his Algorithm 5.2.3H), but now we always use
17  * quicksort for run generation. We merge the runs using polyphase merge,
18  * Knuth's Algorithm 5.4.2D. The logical "tapes" used by Algorithm D are
19  * implemented by logtape.c, which avoids space wastage by recycling disk
20  * space as soon as each block is read from its "tape".
21  *
22  * The approximate amount of memory allowed for any one sort operation
23  * is specified in kilobytes by the caller (most pass work_mem). Initially,
24  * we absorb tuples and simply store them in an unsorted array as long as
25  * we haven't exceeded workMem. If we reach the end of the input without
26  * exceeding workMem, we sort the array using qsort() and subsequently return
27  * tuples just by scanning the tuple array sequentially. If we do exceed
28  * workMem, we begin to emit tuples into sorted runs in temporary tapes.
29  * When tuples are dumped in batch after quicksorting, we begin a new run
30  * with a new output tape (selected per Algorithm D). After the end of the
31  * input is reached, we dump out remaining tuples in memory into a final run,
32  * then merge the runs using Algorithm D.
33  *
34  * When merging runs, we use a heap containing just the frontmost tuple from
35  * each source run; we repeatedly output the smallest tuple and replace it
36  * with the next tuple from its source tape (if any). When the heap empties,
37  * the merge is complete. The basic merge algorithm thus needs very little
38  * memory --- only M tuples for an M-way merge, and M is constrained to a
39  * small number. However, we can still make good use of our full workMem
40  * allocation by pre-reading additional blocks from each source tape. Without
41  * prereading, our access pattern to the temporary file would be very erratic;
42  * on average we'd read one block from each of M source tapes during the same
43  * time that we're writing M blocks to the output tape, so there is no
44  * sequentiality of access at all, defeating the read-ahead methods used by
45  * most Unix kernels. Worse, the output tape gets written into a very random
46  * sequence of blocks of the temp file, ensuring that things will be even
47  * worse when it comes time to read that tape. A straightforward merge pass
48  * thus ends up doing a lot of waiting for disk seeks. We can improve matters
49  * by prereading from each source tape sequentially, loading about workMem/M
50  * bytes from each tape in turn, and making the sequential blocks immediately
51  * available for reuse. This approach helps to localize both read and write
52  * accesses. The pre-reading is handled by logtape.c; we just tell it how
53  * much memory to use for the buffers.
54  *
55  * When the caller requests random access to the sort result, we form
56  * the final sorted run on a logical tape which is then "frozen", so
57  * that we can access it randomly. When the caller does not need random
58  * access, we return from tuplesort_performsort() as soon as we are down
59  * to one run per logical tape. The final merge is then performed
60  * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this
61  * saves one cycle of writing all the data out to disk and reading it in.
62  *
63  * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the
64  * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according
65  * to Knuth's figure 70 (section 5.4.2). However, Knuth is assuming that
66  * tape drives are expensive beasts, and in particular that there will always
67  * be many more runs than tape drives. In our implementation a "tape drive"
68  * doesn't cost much more than a few Kb of memory buffers, so we can afford
69  * to have lots of them. In particular, if we can have as many tape drives
70  * as sorted runs, we can eliminate any repeated I/O at all. In the current
71  * code we determine the number of tapes M on the basis of workMem: we want
72  * workMem/M to be large enough that we read a fair amount of data each time
73  * we preread from a tape, so as to maintain the locality of access described
74  * above. Nonetheless, with large workMem we can have many tapes (but not
75  * too many -- see the comments in tuplesort_merge_order).
76  *
77  * This module supports parallel sorting. Parallel sorts involve coordination
78  * among one or more worker processes, and a leader process, each with its own
79  * tuplesort state. The leader process (or, more accurately, the
80  * Tuplesortstate associated with a leader process) creates a full tapeset
81  * consisting of worker tapes with one run to merge; a run for every
82  * worker process. This is then merged. Worker processes are guaranteed to
83  * produce exactly one output run from their partial input.
84  *
85  *
86  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
87  * Portions Copyright (c) 1994, Regents of the University of California
88  *
89  * IDENTIFICATION
90  * src/backend/utils/sort/tuplesort.c
91  *
92  *-------------------------------------------------------------------------
93  */
94 
95 #include "postgres.h"
96 
97 #include <limits.h>
98 
99 #include "access/hash.h"
100 #include "access/htup_details.h"
101 #include "access/nbtree.h"
102 #include "catalog/index.h"
103 #include "catalog/pg_am.h"
104 #include "commands/tablespace.h"
105 #include "executor/executor.h"
106 #include "miscadmin.h"
107 #include "pg_trace.h"
108 #include "utils/datum.h"
109 #include "utils/logtape.h"
110 #include "utils/lsyscache.h"
111 #include "utils/memutils.h"
112 #include "utils/pg_rusage.h"
113 #include "utils/rel.h"
114 #include "utils/sortsupport.h"
115 #include "utils/tuplesort.h"
116 
117 
118 /* sort-type codes for sort__start probes */
119 #define HEAP_SORT 0
120 #define INDEX_SORT 1
121 #define DATUM_SORT 2
122 #define CLUSTER_SORT 3
123 
124 /* Sort parallel code from state for sort__start probes */
125 #define PARALLEL_SORT(state) ((state)->shared == NULL ? 0 : \
126  (state)->worker >= 0 ? 1 : 2)
127 
128 /*
129  * Initial size of memtuples array. We're trying to select this size so that
130  * the array doesn't exceed ALLOCSET_SEPARATE_THRESHOLD and so that the overhead of
131  * allocation might possibly be lowered. However, we don't consider array sizes
132  * less than 1024.
133  *
134  */
135 #define INITIAL_MEMTUPSIZE Max(1024, \
136  ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1)
137 
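/*
 * Worked example (figures are typical 64-bit values, not guaranteed): with
 * ALLOCSET_SEPARATE_THRESHOLD = 8192 and sizeof(SortTuple) = 24, the second
 * Max() argument is 8192 / 24 + 1 = 342, so INITIAL_MEMTUPSIZE resolves to
 * 1024 slots, i.e. roughly 24 kB for the initial memtuples array.
 */
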
138 /* GUC variables */
139 #ifdef TRACE_SORT
140 bool trace_sort = false;
141 #endif
142 
143 #ifdef DEBUG_BOUNDED_SORT
144 bool optimize_bounded_sort = true;
145 #endif
146 
147 
148 /*
149  * The objects we actually sort are SortTuple structs. These contain
150  * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple),
151  * which is a separate palloc chunk --- we assume it is just one chunk and
152  * can be freed by a simple pfree() (except during merge, when we use a
153  * simple slab allocator). SortTuples also contain the tuple's first key
154  * column in Datum/nullflag format, and a source/input tape number that
155  * tracks which tape each heap element/slot belongs to during merging.
156  *
157  * Storing the first key column lets us save heap_getattr or index_getattr
158  * calls during tuple comparisons. We could extract and save all the key
159  * columns not just the first, but this would increase code complexity and
160  * overhead, and wouldn't actually save any comparison cycles in the common
161  * case where the first key determines the comparison result. Note that
162  * for a pass-by-reference datatype, datum1 points into the "tuple" storage.
163  *
164  * There is one special case: when the sort support infrastructure provides an
165  * "abbreviated key" representation, where the key is (typically) a pass by
166  * value proxy for a pass by reference type. In this case, the abbreviated key
167  * is stored in datum1 in place of the actual first key column.
168  *
169  * When sorting single Datums, the data value is represented directly by
170  * datum1/isnull1 for pass by value types (or null values). If the datatype is
171  * pass-by-reference and isnull1 is false, then "tuple" points to a separately
172  * palloc'd data value, otherwise "tuple" is NULL. The value of datum1 is then
173  * either the same pointer as "tuple", or is an abbreviated key value as
174  * described above. Accordingly, "tuple" is always used in preference to
175  * datum1 as the authoritative value for pass-by-reference cases.
176  */
177 typedef struct
178 {
179  void *tuple; /* the tuple itself */
180  Datum datum1; /* value of first key column */
181  bool isnull1; /* is first key column NULL? */
182  int srctape; /* source tape number */
183 } SortTuple;
184 
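/*
 * Illustration only (a hedged sketch, not used by the real routines in this
 * file): how the fields above might be filled in the single-Datum sort case
 * described later.  "byval_val" and "byref_val" are hypothetical datums the
 * caller has already fetched.
 */
static inline void
illustrate_sorttuple_fill(SortTuple *stup, Datum byval_val, Datum byref_val,
						  bool byval)
{
	if (byval)
	{
		/* pass-by-value (e.g. int4): no separate palloc chunk is needed */
		stup->tuple = NULL;
		stup->datum1 = byval_val;
	}
	else
	{
		/*
		 * pass-by-reference (e.g. text): "tuple" owns a palloc'd copy, and
		 * datum1 is that same pointer (or an abbreviated key, if in use).
		 */
		stup->tuple = DatumGetPointer(datumCopy(byref_val, false, -1));
		stup->datum1 = PointerGetDatum(stup->tuple);
	}
	stup->isnull1 = false;
	stup->srctape = 0;			/* only meaningful while merging */
}
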
185 /*
186  * During merge, we use a pre-allocated set of fixed-size slots to hold
187  * tuples, to avoid palloc/pfree overhead.
188  *
189  * Merge doesn't require a lot of memory, so we can afford to waste some,
190  * by using gratuitously-sized slots. If a tuple is larger than 1 kB, the
191  * palloc() overhead is not significant anymore.
192  *
193  * 'nextfree' is valid when this chunk is in the free list. When in use, the
194  * slot holds a tuple.
195  */
196 #define SLAB_SLOT_SIZE 1024
197 
198 typedef union SlabSlot
199 {
200  union SlabSlot *nextfree;
201  char buffer[SLAB_SLOT_SIZE];
202 } SlabSlot;
203 
204 /*
205  * Possible states of a Tuplesort object. These denote the states that
206  * persist between calls of Tuplesort routines.
207  */
208 typedef enum
209 {
210  TSS_INITIAL, /* Loading tuples; still within memory limit */
211  TSS_BOUNDED, /* Loading tuples into bounded-size heap */
212  TSS_BUILDRUNS, /* Loading tuples; writing to tape */
213  TSS_SORTEDINMEM, /* Sort completed entirely in memory */
214  TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */
215  TSS_FINALMERGE /* Performing final merge on-the-fly */
216 } TupSortStatus;
217 
218 /*
219  * Parameters for calculation of number of tapes to use --- see inittapes()
220  * and tuplesort_merge_order().
221  *
222  * In this calculation we assume that each tape will cost us about one block's
223  * worth of buffer space. This ignores the overhead of all the other data
224  * structures needed for each tape, but it's probably close enough.
225  *
226  * MERGE_BUFFER_SIZE is how much data we'd like to read from each input
227  * tape during a preread cycle (see discussion at top of file).
228  */
229 #define MINORDER 6 /* minimum merge order */
230 #define MAXORDER 500 /* maximum merge order */
231 #define TAPE_BUFFER_OVERHEAD BLCKSZ
232 #define MERGE_BUFFER_SIZE (BLCKSZ * 32)
233 
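/*
 * A hedged sketch of how these constants combine: the merge order is roughly
 * the number of input tapes whose buffer plus preread workspace fits in the
 * caller's memory budget (the authoritative version of this computation is
 * tuplesort_merge_order(), later in this file).
 */
static inline int
illustrate_merge_order(int64 allowedMem)
{
	int			mOrder;

	/* one buffer for the output tape, then buffer + workspace per input */
	mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) /
		(MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD);

	/* never merge fewer than MINORDER or more than MAXORDER runs at once */
	mOrder = Max(mOrder, MINORDER);
	mOrder = Min(mOrder, MAXORDER);

	return mOrder;
}
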
234 typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b,
235  Tuplesortstate *state);
236 
237 /*
238  * Private state of a Tuplesort operation.
239  */
240 struct Tuplesortstate
241 {
242  TupSortStatus status; /* enumerated value as shown above */
243  int nKeys; /* number of columns in sort key */
244  bool randomAccess; /* did caller request random access? */
245  bool bounded; /* did caller specify a maximum number of
246  * tuples to return? */
247  bool boundUsed; /* true if we made use of a bounded heap */
248  int bound; /* if bounded, the maximum number of tuples */
249  bool tuples; /* Can SortTuple.tuple ever be set? */
250  int64 availMem; /* remaining memory available, in bytes */
251  int64 allowedMem; /* total memory allowed, in bytes */
252  int maxTapes; /* number of tapes (Knuth's T) */
253  int tapeRange; /* maxTapes-1 (Knuth's P) */
254  int64 maxSpace; /* maximum amount of space occupied among sort
255  * of groups, either in-memory or on-disk */
256  bool isMaxSpaceDisk; /* true when maxSpace is value for on-disk
257  * space, false when it's value for in-memory
258  * space */
259  TupSortStatus maxSpaceStatus; /* sort status when maxSpace was reached */
260  MemoryContext maincontext; /* memory context for tuple sort metadata that
261  * persists across multiple batches */
262  MemoryContext sortcontext; /* memory context holding most sort data */
263  MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */
264  LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */
265 
266  /*
267  * These function pointers decouple the routines that must know what kind
268  * of tuple we are sorting from the routines that don't need to know it.
269  * They are set up by the tuplesort_begin_xxx routines.
270  *
271  * Function to compare two tuples; result is per qsort() convention, ie:
272  * <0, 0, >0 according as a<b, a=b, a>b. The API must match
273  * qsort_arg_comparator.
274  */
275  SortTupleComparator comparetup;
276 
277  /*
278  * Function to copy a supplied input tuple into palloc'd space and set up
279  * its SortTuple representation (ie, set tuple/datum1/isnull1). Also,
280  * state->availMem must be decreased by the amount of space used for the
281  * tuple copy (note the SortTuple struct itself is not counted).
282  */
283  void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup);
284 
285  /*
286  * Function to write a stored tuple onto tape. The representation of the
287  * tuple on tape need not be the same as it is in memory; requirements on
288  * the tape representation are given below. Unless the slab allocator is
289  * used, after writing the tuple, pfree() the out-of-line data (not the
290  * SortTuple struct!), and increase state->availMem by the amount of
291  * memory space thereby released.
292  */
293  void (*writetup) (Tuplesortstate *state, int tapenum,
294  SortTuple *stup);
295 
296  /*
297  * Function to read a stored tuple from tape back into memory. 'len' is
298  * the already-read length of the stored tuple. The tuple is allocated
299  * from the slab memory arena, or is palloc'd, see readtup_alloc().
300  */
301  void (*readtup) (Tuplesortstate *state, SortTuple *stup,
302  int tapenum, unsigned int len);
303 
304  /*
305  * This array holds the tuples now in sort memory. If we are in state
306  * INITIAL, the tuples are in no particular order; if we are in state
307  * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS
308  * and FINALMERGE, the tuples are organized in "heap" order per Algorithm
309  * H. In state SORTEDONTAPE, the array is not used.
310  */
311  SortTuple *memtuples; /* array of SortTuple structs */
312  int memtupcount; /* number of tuples currently present */
313  int memtupsize; /* allocated length of memtuples array */
314  bool growmemtuples; /* memtuples' growth still underway? */
315 
316  /*
317  * Memory for tuples is sometimes allocated using a simple slab allocator,
318  * rather than with palloc(). Currently, we switch to slab allocation
319  * when we start merging. Merging only needs to keep a small, fixed
320  * number of tuples in memory at any time, so we can avoid the
321  * palloc/pfree overhead by recycling a fixed number of fixed-size slots
322  * to hold the tuples.
323  *
324  * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE
325  * slots. The allocation is sized to have one slot per tape, plus one
326  * additional slot. We need that many slots to hold all the tuples kept
327  * in the heap during merge, plus the one we have last returned from the
328  * sort, with tuplesort_gettuple.
329  *
330  * Initially, all the slots are kept in a linked list of free slots. When
331  * a tuple is read from a tape, it is put to the next available slot, if
332  * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd
333  * instead.
334  *
335  * When we're done processing a tuple, we return the slot back to the free
336  * list, or pfree() if it was palloc'd. We know that a tuple was
337  * allocated from the slab, if its pointer value is between
338  * slabMemoryBegin and -End.
339  *
340  * When the slab allocator is used, the USEMEM/LACKMEM mechanism of
341  * tracking memory usage is not used.
342  */
343  bool slabAllocatorUsed;
344 
345  char *slabMemoryBegin; /* beginning of slab memory arena */
346  char *slabMemoryEnd; /* end of slab memory arena */
347  SlabSlot *slabFreeHead; /* head of free list */
348 
349  /* Buffer size to use for reading input tapes, during merge. */
350  size_t read_buffer_size;
351 
352  /*
353  * When we return a tuple to the caller in tuplesort_gettuple_XXX, that
354  * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE
355  * modes), we remember the tuple in 'lastReturnedTuple', so that we can
356  * recycle the memory on next gettuple call.
357  */
358  void *lastReturnedTuple;
359 
360  /*
361  * While building initial runs, this is the current output run number.
362  * Afterwards, it is the number of initial runs we made.
363  */
364  int currentRun;
365 
366  /*
367  * Unless otherwise noted, all pointer variables below are pointers to
368  * arrays of length maxTapes, holding per-tape data.
369  */
370 
371  /*
372  * This variable is only used during merge passes. mergeactive[i] is true
373  * if we are reading an input run from (actual) tape number i and have not
374  * yet exhausted that run.
375  */
376  bool *mergeactive; /* active input run source? */
377 
378  /*
379  * Variables for Algorithm D. Note that destTape is a "logical" tape
380  * number, ie, an index into the tp_xxx[] arrays. Be careful to keep
381  * "logical" and "actual" tape numbers straight!
382  */
383  int Level; /* Knuth's l */
384  int destTape; /* current output tape (Knuth's j, less 1) */
385  int *tp_fib; /* Target Fibonacci run counts (A[]) */
386  int *tp_runs; /* # of real runs on each tape */
387  int *tp_dummy; /* # of dummy runs for each tape (D[]) */
388  int *tp_tapenum; /* Actual tape numbers (TAPE[]) */
389  int activeTapes; /* # of active input tapes in merge pass */
390 
391  /*
392  * These variables are used after completion of sorting to keep track of
393  * the next tuple to return. (In the tape case, the tape's current read
394  * position is also critical state.)
395  */
396  int result_tape; /* actual tape number of finished output */
397  int current; /* array index (only used if SORTEDINMEM) */
398  bool eof_reached; /* reached EOF (needed for cursors) */
399 
400  /* markpos_xxx holds marked position for mark and restore */
401  long markpos_block; /* tape block# (only used if SORTEDONTAPE) */
402  int markpos_offset; /* saved "current", or offset in tape block */
403  bool markpos_eof; /* saved "eof_reached" */
404 
405  /*
406  * These variables are used during parallel sorting.
407  *
408  * worker is our worker identifier. Follows the general convention that
409  * a value of -1 denotes a leader tuplesort, and values >= 0 denote worker
410  * tuplesorts. (-1 can also be a serial tuplesort.)
411  *
412  * shared is mutable shared memory state, which is used to coordinate
413  * parallel sorts.
414  *
415  * nParticipants is the number of worker Tuplesortstates known by the
416  * leader to have actually been launched, which implies that they must
417  * finish a run that the leader can merge. Typically includes a worker state held
418  * by the leader process itself. Set in the leader Tuplesortstate only.
419  */
420  int worker;
421  Sharedsort *shared;
422  int nParticipants;
423 
424  /*
425  * The sortKeys variable is used by every case other than the hash index
426  * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the
427  * MinimalTuple and CLUSTER routines, though.
428  */
429  TupleDesc tupDesc;
430  SortSupport sortKeys; /* array of length nKeys */
431 
432  /*
433  * This variable is shared by the single-key MinimalTuple case and the
434  * Datum case (which both use qsort_ssup()). Otherwise it's NULL.
435  */
436  SortSupport onlyKey;
437 
438  /*
439  * Additional state for managing "abbreviated key" sortsupport routines
440  * (which currently may be used by all cases except the hash index case).
441  * Tracks the intervals at which the optimization's effectiveness is
442  * tested.
443  */
444  int64 abbrevNext; /* Tuple # at which to next check
445  * applicability */
446 
447  /*
448  * These variables are specific to the CLUSTER case; they are set by
449  * tuplesort_begin_cluster.
450  */
451  IndexInfo *indexInfo; /* info about index being used for reference */
452  EState *estate; /* for evaluating index expressions */
453 
454  /*
455  * These variables are specific to the IndexTuple case; they are set by
456  * tuplesort_begin_index_xxx and used only by the IndexTuple routines.
457  */
458  Relation heapRel; /* table the index is being built on */
459  Relation indexRel; /* index being built */
460 
461  /* These are specific to the index_btree subcase: */
462  bool enforceUnique; /* complain if we find duplicate tuples */
463 
464  /* These are specific to the index_hash subcase: */
465  uint32 high_mask; /* masks for sortable part of hash code */
466  uint32 low_mask;
467  uint32 max_buckets;
468 
469  /*
470  * These variables are specific to the Datum case; they are set by
471  * tuplesort_begin_datum and used only by the DatumTuple routines.
472  */
473  Oid datumType;
474  /* we need typelen in order to know how to copy the Datums. */
475  int datumTypeLen;
476 
477  /*
478  * Resource snapshot for time of sort start.
479  */
480 #ifdef TRACE_SORT
481  PGRUsage ru_start;
482 #endif
483 };
484 
485 /*
486  * Private mutable state of tuplesort-parallel-operation. This is allocated
487  * in shared memory.
488  */
489 struct Sharedsort
490 {
491  /* mutex protects all fields prior to tapes */
492  slock_t mutex;
493 
494  /*
495  * currentWorker generates ordinal identifier numbers for parallel sort
496  * workers. These start from 0, and are always gapless.
497  *
498  * Workers increment workersFinished to indicate having finished. If this
499  * is equal to state.nParticipants within the leader, leader is ready to
500  * merge worker runs.
501  */
502  int currentWorker;
503  int workersFinished;
504 
505  /* Temporary file space */
506  SharedFileSet fileset;
507 
508  /* Size of tapes flexible array */
509  int nTapes;
510 
511  /*
512  * Tapes array used by workers to report back information needed by the
513  * leader to concatenate all worker tapes into one for merging
514  */
516 };
517 
518 /*
519  * Is the given tuple allocated from the slab memory arena?
520  */
521 #define IS_SLAB_SLOT(state, tuple) \
522  ((char *) (tuple) >= (state)->slabMemoryBegin && \
523  (char *) (tuple) < (state)->slabMemoryEnd)
524 
525 /*
526  * Return the given tuple to the slab memory free list, or free it
527  * if it was palloc'd.
528  */
529 #define RELEASE_SLAB_SLOT(state, tuple) \
530  do { \
531  SlabSlot *buf = (SlabSlot *) tuple; \
532  \
533  if (IS_SLAB_SLOT((state), buf)) \
534  { \
535  buf->nextfree = (state)->slabFreeHead; \
536  (state)->slabFreeHead = buf; \
537  } else \
538  pfree(buf); \
539  } while(0)
540 
541 #define COMPARETUP(state,a,b) ((*(state)->comparetup) (a, b, state))
542 #define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup))
543 #define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup))
544 #define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len))
545 #define LACKMEM(state) ((state)->availMem < 0 && !(state)->slabAllocatorUsed)
546 #define USEMEM(state,amt) ((state)->availMem -= (amt))
547 #define FREEMEM(state,amt) ((state)->availMem += (amt))
548 #define SERIAL(state) ((state)->shared == NULL)
549 #define WORKER(state) ((state)->shared && (state)->worker != -1)
550 #define LEADER(state) ((state)->shared && (state)->worker == -1)
551 
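/*
 * A hedged sketch (illustration only) of the accounting contract that the
 * copytup/writetup routines follow using USEMEM/FREEMEM: every palloc'd
 * tuple copy is charged against availMem, and credited back when released.
 */
static void
illustrate_mem_accounting(Tuplesortstate *state, void *tup, Size len)
{
	void	   *copy = MemoryContextAlloc(state->tuplecontext, len);

	memcpy(copy, tup, len);
	USEMEM(state, GetMemoryChunkSpace(copy));	/* charge the copy */

	/* ... later, once the copy has been written out to tape ... */
	FREEMEM(state, GetMemoryChunkSpace(copy));	/* credit it back */
	pfree(copy);
}
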
552 /*
553  * NOTES about on-tape representation of tuples:
554  *
555  * We require the first "unsigned int" of a stored tuple to be the total size
556  * on-tape of the tuple, including itself (so it is never zero; an all-zero
557  * unsigned int is used to delimit runs). The remainder of the stored tuple
558  * may or may not match the in-memory representation of the tuple ---
559  * any conversion needed is the job of the writetup and readtup routines.
560  *
561  * If state->randomAccess is true, then the stored representation of the
562  * tuple must be followed by another "unsigned int" that is a copy of the
563  * length --- so the total tape space used is actually sizeof(unsigned int)
564  * more than the stored length value. This allows read-backwards. When
565  * randomAccess is not true, the write/read routines may omit the extra
566  * length word.
567  *
568  * writetup is expected to write both length words as well as the tuple
569  * data. When readtup is called, the tape is positioned just after the
570  * front length word; readtup must read the tuple data and advance past
571  * the back length word (if present).
572  *
573  * The write/read routines can make use of the tuple description data
574  * stored in the Tuplesortstate record, if needed. They are also expected
575  * to adjust state->availMem by the amount of memory space (not tape space!)
576  * released or consumed. There is no error return from either writetup
577  * or readtup; they should ereport() on failure.
578  *
579  *
580  * NOTES about memory consumption calculations:
581  *
582  * We count space allocated for tuples against the workMem limit, plus
583  * the space used by the variable-size memtuples array. Fixed-size space
584  * is not counted; it's small enough to not be interesting.
585  *
586  * Note that we count actual space used (as shown by GetMemoryChunkSpace)
587  * rather than the originally-requested size. This is important since
588  * palloc can add substantial overhead. It's not a complete answer since
589  * we won't count any wasted space in palloc allocation blocks, but it's
590  * a lot better than what we were doing before 7.3. As of 9.6, a
591  * separate memory context is used for caller passed tuples. Resetting
592  * it at certain key increments significantly ameliorates fragmentation.
593  * Note that this places a responsibility on copytup routines to use the
594  * correct memory context for these tuples (and to not use the reset
595  * context for anything whose lifetime needs to span multiple external
596  * sort runs). readtup routines use the slab allocator (they cannot use
597  * the reset context because it gets deleted at the point that merging
598  * begins).
599  */
600 
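/*
 * A minimal sketch (illustration only) of a writetup-style routine obeying
 * the on-tape layout described above: a leading length word that counts
 * itself, the payload, and a trailing copy of the length word only when
 * random access was requested.  The real routines (writetup_heap and
 * friends) appear later in this file.
 */
static void
illustrate_writetup(Tuplesortstate *state, int tapenum,
					void *data, unsigned int datalen)
{
	unsigned int tuplen = datalen + sizeof(unsigned int);

	LogicalTapeWrite(state->tapeset, tapenum, &tuplen, sizeof(tuplen));
	LogicalTapeWrite(state->tapeset, tapenum, data, datalen);
	if (state->randomAccess)	/* trailing length word allows read-backwards */
		LogicalTapeWrite(state->tapeset, tapenum, &tuplen, sizeof(tuplen));
}
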
601 /* When using this macro, beware of double evaluation of len */
602 #define LogicalTapeReadExact(tapeset, tapenum, ptr, len) \
603  do { \
604  if (LogicalTapeRead(tapeset, tapenum, ptr, len) != (size_t) (len)) \
605  elog(ERROR, "unexpected end of data"); \
606  } while(0)
607 
608 
609 static Tuplesortstate *tuplesort_begin_common(int workMem,
610  SortCoordinate coordinate,
611  bool randomAccess);
612 static void tuplesort_begin_batch(Tuplesortstate *state);
613 static void puttuple_common(Tuplesortstate *state, SortTuple *tuple);
614 static bool consider_abort_common(Tuplesortstate *state);
615 static void inittapes(Tuplesortstate *state, bool mergeruns);
616 static void inittapestate(Tuplesortstate *state, int maxTapes);
617 static void selectnewtape(Tuplesortstate *state);
618 static void init_slab_allocator(Tuplesortstate *state, int numSlots);
619 static void mergeruns(Tuplesortstate *state);
620 static void mergeonerun(Tuplesortstate *state);
621 static void beginmerge(Tuplesortstate *state);
622 static bool mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup);
623 static void dumptuples(Tuplesortstate *state, bool alltuples);
631 static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK);
632 static void markrunend(Tuplesortstate *state, int tapenum);
633 static void *readtup_alloc(Tuplesortstate *state, Size tuplen);
634 static int comparetup_heap(const SortTuple *a, const SortTuple *b,
635  Tuplesortstate *state);
636 static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup);
637 static void writetup_heap(Tuplesortstate *state, int tapenum,
638  SortTuple *stup);
639 static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
640  int tapenum, unsigned int len);
641 static int comparetup_cluster(const SortTuple *a, const SortTuple *b,
642  Tuplesortstate *state);
643 static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup);
644 static void writetup_cluster(Tuplesortstate *state, int tapenum,
645  SortTuple *stup);
646 static void readtup_cluster(Tuplesortstate *state, SortTuple *stup,
647  int tapenum, unsigned int len);
648 static int comparetup_index_btree(const SortTuple *a, const SortTuple *b,
649  Tuplesortstate *state);
650 static int comparetup_index_hash(const SortTuple *a, const SortTuple *b,
651  Tuplesortstate *state);
652 static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup);
653 static void writetup_index(Tuplesortstate *state, int tapenum,
654  SortTuple *stup);
655 static void readtup_index(Tuplesortstate *state, SortTuple *stup,
656  int tapenum, unsigned int len);
657 static int comparetup_datum(const SortTuple *a, const SortTuple *b,
658  Tuplesortstate *state);
659 static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup);
660 static void writetup_datum(Tuplesortstate *state, int tapenum,
661  SortTuple *stup);
662 static void readtup_datum(Tuplesortstate *state, SortTuple *stup,
663  int tapenum, unsigned int len);
664 static int worker_get_identifier(Tuplesortstate *state);
665 static void worker_freeze_result_tape(Tuplesortstate *state);
666 static void worker_nomergeruns(Tuplesortstate *state);
667 static void leader_takeover_tapes(Tuplesortstate *state);
668 static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup);
669 static void tuplesort_free(Tuplesortstate *state);
670 static void tuplesort_updatemax(Tuplesortstate *state);
671 
672 /*
673  * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts
674  * any variant of SortTuples, using the appropriate comparetup function.
675  * qsort_ssup() is specialized for the case where the comparetup function
676  * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts
677  * and Datum sorts.
678  */
679 #include "qsort_tuple.c"
680 
681 
682 /*
683  * tuplesort_begin_xxx
684  *
685  * Initialize for a tuple sort operation.
686  *
687  * After calling tuplesort_begin, the caller should call tuplesort_putXXX
688  * zero or more times, then call tuplesort_performsort when all the tuples
689  * have been supplied. After performsort, retrieve the tuples in sorted
690  * order by calling tuplesort_getXXX until it returns false/NULL. (If random
691  * access was requested, rescan, markpos, and restorepos can also be called.)
692  * Call tuplesort_end to terminate the operation and release memory/disk space.
693  *
694  * Each variant of tuplesort_begin has a workMem parameter specifying the
695  * maximum number of kilobytes of RAM to use before spilling data to disk.
696  * (The normal value of this parameter is work_mem, but some callers use
697  * other values.) Each variant also has a randomAccess parameter specifying
698  * whether the caller needs non-sequential access to the sort result.
699  */
700 
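/*
 * A hedged caller-side sketch of the protocol described above (illustration
 * only; the attribute, operator, collation and nulls-first arrays, as well
 * as the slot, are assumed to be prepared by the caller).
 */
static void
illustrate_sort_usage(TupleDesc tupDesc, TupleTableSlot *slot,
					  AttrNumber *attNums, Oid *sortOperators,
					  Oid *sortCollations, bool *nullsFirst)
{
	Tuplesortstate *sortstate;

	sortstate = tuplesort_begin_heap(tupDesc, 1, attNums, sortOperators,
									 sortCollations, nullsFirst,
									 work_mem, NULL, false);

	/* feed as many tuples as the caller has ... */
	tuplesort_puttupleslot(sortstate, slot);

	tuplesort_performsort(sortstate);

	/* ... then drain them back in sorted order */
	while (tuplesort_gettupleslot(sortstate, true, false, slot, NULL))
	{
		/* process one sorted tuple */
	}

	tuplesort_end(sortstate);
}
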
701 static Tuplesortstate *
702 tuplesort_begin_common(int workMem, SortCoordinate coordinate,
703  bool randomAccess)
704 {
705  Tuplesortstate *state;
706  MemoryContext maincontext;
707  MemoryContext sortcontext;
708  MemoryContext oldcontext;
709 
710  /* See leader_takeover_tapes() remarks on randomAccess support */
711  if (coordinate && randomAccess)
712  elog(ERROR, "random access disallowed under parallel sort");
713 
714  /*
715  * Memory context surviving tuplesort_reset. This memory context holds
716  * data which is useful to keep while sorting multiple similar batches.
717  */
719  "TupleSort main",
721 
722  /*
723  * Create a working memory context for one sort operation. The content of
724  * this context is deleted by tuplesort_reset.
725  */
726  sortcontext = AllocSetContextCreate(maincontext,
727  "TupleSort sort",
728  ALLOCSET_DEFAULT_SIZES);
729 
730  /*
731  * Additionally a working memory context for tuples is set up in
732  * tuplesort_begin_batch.
733  */
734 
735  /*
736  * Make the Tuplesortstate within the per-sortstate context. This way, we
737  * don't need a separate pfree() operation for it at shutdown.
738  */
739  oldcontext = MemoryContextSwitchTo(maincontext);
740 
741  state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate));
742 
743 #ifdef TRACE_SORT
744  if (trace_sort)
745  pg_rusage_init(&state->ru_start);
746 #endif
747 
748  state->randomAccess = randomAccess;
749  state->tuples = true;
750 
751  /*
752  * workMem is forced to be at least 64KB, the current minimum valid value
753  * for the work_mem GUC. This is a defense against parallel sort callers
754  * that divide out memory among many workers in a way that leaves each
755  * with very little memory.
756  */
757  state->allowedMem = Max(workMem, 64) * (int64) 1024;
758  state->sortcontext = sortcontext;
759  state->maincontext = maincontext;
760 
761  /*
762  * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
763  * see comments in grow_memtuples().
764  */
765  state->memtupsize = INITIAL_MEMTUPSIZE;
766  state->memtuples = NULL;
767 
768  /*
769  * After all of the other non-parallel-related state, we set up all of the
770  * state needed for each batch.
771  */
772  tuplesort_begin_batch(state);
773 
774  /*
775  * Initialize parallel-related state based on coordination information
776  * from caller
777  */
778  if (!coordinate)
779  {
780  /* Serial sort */
781  state->shared = NULL;
782  state->worker = -1;
783  state->nParticipants = -1;
784  }
785  else if (coordinate->isWorker)
786  {
787  /* Parallel worker produces exactly one final run from all input */
788  state->shared = coordinate->sharedsort;
789  state->worker = worker_get_identifier(state);
790  state->nParticipants = -1;
791  }
792  else
793  {
794  /* Parallel leader state only used for final merge */
795  state->shared = coordinate->sharedsort;
796  state->worker = -1;
797  state->nParticipants = coordinate->nParticipants;
798  Assert(state->nParticipants >= 1);
799  }
800 
801  MemoryContextSwitchTo(oldcontext);
802 
803  return state;
804 }
805 
806 /*
807  * tuplesort_begin_batch
808  *
809  * Set up, or reset, all state needed for processing a new set of tuples with this
810  * sort state. Called both from tuplesort_begin_common (the first time sorting
811  * with this sort state) and tuplesort_reset (for subsequent usages).
812  */
813 static void
814 tuplesort_begin_batch(Tuplesortstate *state)
815 {
816  MemoryContext oldcontext;
817 
818  oldcontext = MemoryContextSwitchTo(state->maincontext);
819 
820  /*
821  * Caller tuple (e.g. IndexTuple) memory context.
822  *
823  * A dedicated child context used exclusively for caller passed tuples
824  * eases memory management. Resetting at key points reduces
825  * fragmentation. Note that the memtuples array of SortTuples is allocated
826  * in the parent context, not this context, because there is no need to
827  * free memtuples early.
828  */
830  "Caller tuples",
832 
833  state->status = TSS_INITIAL;
834  state->bounded = false;
835  state->boundUsed = false;
836 
837  state->availMem = state->allowedMem;
838 
839  state->tapeset = NULL;
840 
841  state->memtupcount = 0;
842 
843  /*
844  * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
845  * see comments in grow_memtuples().
846  */
847  state->growmemtuples = true;
848  state->slabAllocatorUsed = false;
849  if (state->memtuples != NULL && state->memtupsize != INITIAL_MEMTUPSIZE)
850  {
851  pfree(state->memtuples);
852  state->memtuples = NULL;
853  state->memtupsize = INITIAL_MEMTUPSIZE;
854  }
855  if (state->memtuples == NULL)
856  {
857  state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple));
858  USEMEM(state, GetMemoryChunkSpace(state->memtuples));
859  }
860 
861  /* workMem must be large enough for the minimal memtuples array */
862  if (LACKMEM(state))
863  elog(ERROR, "insufficient memory allowed for sort");
864 
865  state->currentRun = 0;
866 
867  /*
868  * maxTapes, tapeRange, and Algorithm D variables will be initialized by
869  * inittapes(), if needed
870  */
871 
872  state->result_tape = -1; /* flag that result tape has not been formed */
873 
874  MemoryContextSwitchTo(oldcontext);
875 }
876 
877 Tuplesortstate *
878 tuplesort_begin_heap(TupleDesc tupDesc,
879  int nkeys, AttrNumber *attNums,
880  Oid *sortOperators, Oid *sortCollations,
881  bool *nullsFirstFlags,
882  int workMem, SortCoordinate coordinate, bool randomAccess)
883 {
884  Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
885  randomAccess);
886  MemoryContext oldcontext;
887  int i;
888 
889  oldcontext = MemoryContextSwitchTo(state->maincontext);
890 
891  AssertArg(nkeys > 0);
892 
893 #ifdef TRACE_SORT
894  if (trace_sort)
895  elog(LOG,
896  "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
897  nkeys, workMem, randomAccess ? 't' : 'f');
898 #endif
899 
900  state->nKeys = nkeys;
901 
902  TRACE_POSTGRESQL_SORT_START(HEAP_SORT,
903  false, /* no unique check */
904  nkeys,
905  workMem,
906  randomAccess,
907  PARALLEL_SORT(state));
908 
909  state->comparetup = comparetup_heap;
910  state->copytup = copytup_heap;
911  state->writetup = writetup_heap;
912  state->readtup = readtup_heap;
913 
914  state->tupDesc = tupDesc; /* assume we need not copy tupDesc */
915  state->abbrevNext = 10;
916 
917  /* Prepare SortSupport data for each column */
918  state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));
919 
920  for (i = 0; i < nkeys; i++)
921  {
922  SortSupport sortKey = state->sortKeys + i;
923 
924  AssertArg(attNums[i] != 0);
925  AssertArg(sortOperators[i] != 0);
926 
927  sortKey->ssup_cxt = CurrentMemoryContext;
928  sortKey->ssup_collation = sortCollations[i];
929  sortKey->ssup_nulls_first = nullsFirstFlags[i];
930  sortKey->ssup_attno = attNums[i];
931  /* Convey if abbreviation optimization is applicable in principle */
932  sortKey->abbreviate = (i == 0);
933 
934  PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
935  }
936 
937  /*
938  * The "onlyKey" optimization cannot be used with abbreviated keys, since
939  * tie-breaker comparisons may be required. Typically, the optimization
940  * is only of value to pass-by-value types anyway, whereas abbreviated
941  * keys are typically only of value to pass-by-reference types.
942  */
943  if (nkeys == 1 && !state->sortKeys->abbrev_converter)
944  state->onlyKey = state->sortKeys;
945 
946  MemoryContextSwitchTo(oldcontext);
947 
948  return state;
949 }
950 
951 Tuplesortstate *
952 tuplesort_begin_cluster(TupleDesc tupDesc,
953  Relation indexRel,
954  int workMem,
955  SortCoordinate coordinate, bool randomAccess)
956 {
957  Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
958  randomAccess);
959  BTScanInsert indexScanKey;
960  MemoryContext oldcontext;
961  int i;
962 
963  Assert(indexRel->rd_rel->relam == BTREE_AM_OID);
964 
965  oldcontext = MemoryContextSwitchTo(state->maincontext);
966 
967 #ifdef TRACE_SORT
968  if (trace_sort)
969  elog(LOG,
970  "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
971  RelationGetNumberOfAttributes(indexRel),
972  workMem, randomAccess ? 't' : 'f');
973 #endif
974 
975  state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel);
976 
977  TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT,
978  false, /* no unique check */
979  state->nKeys,
980  workMem,
981  randomAccess,
982  PARALLEL_SORT(state));
983 
984  state->comparetup = comparetup_cluster;
985  state->copytup = copytup_cluster;
986  state->writetup = writetup_cluster;
987  state->readtup = readtup_cluster;
988  state->abbrevNext = 10;
989 
990  state->indexInfo = BuildIndexInfo(indexRel);
991 
992  state->tupDesc = tupDesc; /* assume we need not copy tupDesc */
993 
994  indexScanKey = _bt_mkscankey(indexRel, NULL);
995 
996  if (state->indexInfo->ii_Expressions != NULL)
997  {
998  TupleTableSlot *slot;
999  ExprContext *econtext;
1000 
1001  /*
1002  * We will need to use FormIndexDatum to evaluate the index
1003  * expressions. To do that, we need an EState, as well as a
1004  * TupleTableSlot to put the table tuples into. The econtext's
1005  * scantuple has to point to that slot, too.
1006  */
1007  state->estate = CreateExecutorState();
1008  slot = MakeSingleTupleTableSlot(tupDesc, &TTSOpsHeapTuple);
1009  econtext = GetPerTupleExprContext(state->estate);
1010  econtext->ecxt_scantuple = slot;
1011  }
1012 
1013  /* Prepare SortSupport data for each column */
1014  state->sortKeys = (SortSupport) palloc0(state->nKeys *
1015  sizeof(SortSupportData));
1016 
1017  for (i = 0; i < state->nKeys; i++)
1018  {
1019  SortSupport sortKey = state->sortKeys + i;
1020  ScanKey scanKey = indexScanKey->scankeys + i;
1021  int16 strategy;
1022 
1023  sortKey->ssup_cxt = CurrentMemoryContext;
1024  sortKey->ssup_collation = scanKey->sk_collation;
1025  sortKey->ssup_nulls_first =
1026  (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1027  sortKey->ssup_attno = scanKey->sk_attno;
1028  /* Convey if abbreviation optimization is applicable in principle */
1029  sortKey->abbreviate = (i == 0);
1030 
1031  AssertState(sortKey->ssup_attno != 0);
1032 
1033  strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
1034  BTGreaterStrategyNumber : BTLessStrategyNumber;
1035 
1036  PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey);
1037  }
1038 
1039  pfree(indexScanKey);
1040 
1041  MemoryContextSwitchTo(oldcontext);
1042 
1043  return state;
1044 }
1045 
1046 Tuplesortstate *
1047 tuplesort_begin_index_btree(Relation heapRel,
1048  Relation indexRel,
1049  bool enforceUnique,
1050  int workMem,
1051  SortCoordinate coordinate,
1052  bool randomAccess)
1053 {
1054  Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
1055  randomAccess);
1056  BTScanInsert indexScanKey;
1057  MemoryContext oldcontext;
1058  int i;
1059 
1060  oldcontext = MemoryContextSwitchTo(state->maincontext);
1061 
1062 #ifdef TRACE_SORT
1063  if (trace_sort)
1064  elog(LOG,
1065  "begin index sort: unique = %c, workMem = %d, randomAccess = %c",
1066  enforceUnique ? 't' : 'f',
1067  workMem, randomAccess ? 't' : 'f');
1068 #endif
1069 
1070  state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel);
1071 
1072  TRACE_POSTGRESQL_SORT_START(INDEX_SORT,
1073  enforceUnique,
1074  state->nKeys,
1075  workMem,
1076  randomAccess,
1077  PARALLEL_SORT(state));
1078 
1079  state->comparetup = comparetup_index_btree;
1080  state->copytup = copytup_index;
1081  state->writetup = writetup_index;
1082  state->readtup = readtup_index;
1083  state->abbrevNext = 10;
1084 
1085  state->heapRel = heapRel;
1086  state->indexRel = indexRel;
1087  state->enforceUnique = enforceUnique;
1088 
1089  indexScanKey = _bt_mkscankey(indexRel, NULL);
1090 
1091  /* Prepare SortSupport data for each column */
1092  state->sortKeys = (SortSupport) palloc0(state->nKeys *
1093  sizeof(SortSupportData));
1094 
1095  for (i = 0; i < state->nKeys; i++)
1096  {
1097  SortSupport sortKey = state->sortKeys + i;
1098  ScanKey scanKey = indexScanKey->scankeys + i;
1099  int16 strategy;
1100 
1101  sortKey->ssup_cxt = CurrentMemoryContext;
1102  sortKey->ssup_collation = scanKey->sk_collation;
1103  sortKey->ssup_nulls_first =
1104  (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1105  sortKey->ssup_attno = scanKey->sk_attno;
1106  /* Convey if abbreviation optimization is applicable in principle */
1107  sortKey->abbreviate = (i == 0);
1108 
1109  AssertState(sortKey->ssup_attno != 0);
1110 
1111  strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
1112  BTGreaterStrategyNumber : BTLessStrategyNumber;
1113 
1114  PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey);
1115  }
1116 
1117  pfree(indexScanKey);
1118 
1119  MemoryContextSwitchTo(oldcontext);
1120 
1121  return state;
1122 }
1123 
1124 Tuplesortstate *
1125 tuplesort_begin_index_hash(Relation heapRel,
1126  Relation indexRel,
1127  uint32 high_mask,
1128  uint32 low_mask,
1129  uint32 max_buckets,
1130  int workMem,
1131  SortCoordinate coordinate,
1132  bool randomAccess)
1133 {
1134  Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
1135  randomAccess);
1136  MemoryContext oldcontext;
1137 
1138  oldcontext = MemoryContextSwitchTo(state->maincontext);
1139 
1140 #ifdef TRACE_SORT
1141  if (trace_sort)
1142  elog(LOG,
1143  "begin index sort: high_mask = 0x%x, low_mask = 0x%x, "
1144  "max_buckets = 0x%x, workMem = %d, randomAccess = %c",
1145  high_mask,
1146  low_mask,
1147  max_buckets,
1148  workMem, randomAccess ? 't' : 'f');
1149 #endif
1150 
1151  state->nKeys = 1; /* Only one sort column, the hash code */
1152 
1153  state->comparetup = comparetup_index_hash;
1154  state->copytup = copytup_index;
1155  state->writetup = writetup_index;
1156  state->readtup = readtup_index;
1157 
1158  state->heapRel = heapRel;
1159  state->indexRel = indexRel;
1160 
1161  state->high_mask = high_mask;
1162  state->low_mask = low_mask;
1163  state->max_buckets = max_buckets;
1164 
1165  MemoryContextSwitchTo(oldcontext);
1166 
1167  return state;
1168 }
1169 
1170 Tuplesortstate *
1171 tuplesort_begin_index_gist(Relation heapRel,
1172  Relation indexRel,
1173  int workMem,
1174  SortCoordinate coordinate,
1175  bool randomAccess)
1176 {
1177  Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
1178  randomAccess);
1179  MemoryContext oldcontext;
1180  int i;
1181 
1182  oldcontext = MemoryContextSwitchTo(state->sortcontext);
1183 
1184 #ifdef TRACE_SORT
1185  if (trace_sort)
1186  elog(LOG,
1187  "begin index sort: workMem = %d, randomAccess = %c",
1188  workMem, randomAccess ? 't' : 'f');
1189 #endif
1190 
1191  state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel);
1192 
1193  state->comparetup = comparetup_index_btree;
1194  state->copytup = copytup_index;
1195  state->writetup = writetup_index;
1196  state->readtup = readtup_index;
1197 
1198  state->heapRel = heapRel;
1199  state->indexRel = indexRel;
1200 
1201  /* Prepare SortSupport data for each column */
1202  state->sortKeys = (SortSupport) palloc0(state->nKeys *
1203  sizeof(SortSupportData));
1204 
1205  for (i = 0; i < state->nKeys; i++)
1206  {
1207  SortSupport sortKey = state->sortKeys + i;
1208 
1209  sortKey->ssup_cxt = CurrentMemoryContext;
1210  sortKey->ssup_collation = indexRel->rd_indcollation[i];
1211  sortKey->ssup_nulls_first = false;
1212  sortKey->ssup_attno = i + 1;
1213  /* Convey if abbreviation optimization is applicable in principle */
1214  sortKey->abbreviate = (i == 0);
1215 
1216  AssertState(sortKey->ssup_attno != 0);
1217 
1218  /* Look for a sort support function */
1219  PrepareSortSupportFromGistIndexRel(indexRel, sortKey);
1220  }
1221 
1222  MemoryContextSwitchTo(oldcontext);
1223 
1224  return state;
1225 }
1226 
1227 Tuplesortstate *
1228 tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation,
1229  bool nullsFirstFlag, int workMem,
1230  SortCoordinate coordinate, bool randomAccess)
1231 {
1232  Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
1233  randomAccess);
1234  MemoryContext oldcontext;
1235  int16 typlen;
1236  bool typbyval;
1237 
1238  oldcontext = MemoryContextSwitchTo(state->maincontext);
1239 
1240 #ifdef TRACE_SORT
1241  if (trace_sort)
1242  elog(LOG,
1243  "begin datum sort: workMem = %d, randomAccess = %c",
1244  workMem, randomAccess ? 't' : 'f');
1245 #endif
1246 
1247  state->nKeys = 1; /* always a one-column sort */
1248 
1249  TRACE_POSTGRESQL_SORT_START(DATUM_SORT,
1250  false, /* no unique check */
1251  1,
1252  workMem,
1253  randomAccess,
1254  PARALLEL_SORT(state));
1255 
1256  state->comparetup = comparetup_datum;
1257  state->copytup = copytup_datum;
1258  state->writetup = writetup_datum;
1259  state->readtup = readtup_datum;
1260  state->abbrevNext = 10;
1261 
1262  state->datumType = datumType;
1263 
1264  /* lookup necessary attributes of the datum type */
1265  get_typlenbyval(datumType, &typlen, &typbyval);
1266  state->datumTypeLen = typlen;
1267  state->tuples = !typbyval;
1268 
1269  /* Prepare SortSupport data */
1270  state->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData));
1271 
1271 
1272  state->sortKeys->ssup_cxt = CurrentMemoryContext;
1273  state->sortKeys->ssup_collation = sortCollation;
1274  state->sortKeys->ssup_nulls_first = nullsFirstFlag;
1275 
1276  /*
1277  * Abbreviation is possible here only for by-reference types. In theory,
1278  * a pass-by-value datatype could have an abbreviated form that is cheaper
1279  * to compare. In a tuple sort, we could support that, because we can
1280  * always extract the original datum from the tuple as needed. Here, we
1281  * can't, because a datum sort only stores a single copy of the datum; the
1282  * "tuple" field of each SortTuple is NULL.
1283  */
1284  state->sortKeys->abbreviate = !typbyval;
1285 
1286  PrepareSortSupportFromOrderingOp(sortOperator, state->sortKeys);
1287 
1288  /*
1289  * The "onlyKey" optimization cannot be used with abbreviated keys, since
1290  * tie-breaker comparisons may be required. Typically, the optimization
1291  * is only of value to pass-by-value types anyway, whereas abbreviated
1292  * keys are typically only of value to pass-by-reference types.
1293  */
1294  if (!state->sortKeys->abbrev_converter)
1295  state->onlyKey = state->sortKeys;
1296 
1297  MemoryContextSwitchTo(oldcontext);
1298 
1299  return state;
1300 }
1301 
1302 /*
1303  * tuplesort_set_bound
1304  *
1305  * Advise tuplesort that at most the first N result tuples are required.
1306  *
1307  * Must be called before inserting any tuples. (Actually, we could allow it
1308  * as long as the sort hasn't spilled to disk, but there seems no need for
1309  * delayed calls at the moment.)
1310  *
1311  * This is a hint only. The tuplesort may still return more tuples than
1312  * requested. Parallel leader tuplesorts will always ignore the hint.
1313  */
1314 void
1315 tuplesort_set_bound(Tuplesortstate *state, int64 bound)
1316 {
1317  /* Assert we're called before loading any tuples */
1318  Assert(state->status == TSS_INITIAL && state->memtupcount == 0);
1319  /* Can't set the bound twice, either */
1320  Assert(!state->bounded);
1321  /* Also, this shouldn't be called in a parallel worker */
1322  Assert(!WORKER(state));
1323 
1324  /* Parallel leader allows but ignores hint */
1325  if (LEADER(state))
1326  return;
1327 
1328 #ifdef DEBUG_BOUNDED_SORT
1329  /* Honor GUC setting that disables the feature (for easy testing) */
1330  if (!optimize_bounded_sort)
1331  return;
1332 #endif
1333 
1334  /* We want to be able to compute bound * 2, so limit the setting */
1335  if (bound > (int64) (INT_MAX / 2))
1336  return;
1337 
1338  state->bounded = true;
1339  state->bound = (int) bound;
1340 
1341  /*
1342  * Bounded sorts are not an effective target for abbreviated key
1343  * optimization. Disable by setting state to be consistent with no
1344  * abbreviation support.
1345  */
1346  state->sortKeys->abbrev_converter = NULL;
1347  if (state->sortKeys->abbrev_full_comparator)
1348  state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
1349 
1350  /* Not strictly necessary, but be tidy */
1351  state->sortKeys->abbrev_abort = NULL;
1352  state->sortKeys->abbrev_full_comparator = NULL;
1353 }
1354 
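/*
 * A hedged sketch (illustration only) of how a LIMIT-style caller might use
 * the bound: set it immediately after creating the sort, before inserting
 * any tuples.
 */
static Tuplesortstate *
illustrate_bounded_sort_begin(TupleDesc tupDesc, AttrNumber *attNums,
							  Oid *sortOperators, Oid *sortCollations,
							  bool *nullsFirst, int64 limit)
{
	Tuplesortstate *sortstate;

	sortstate = tuplesort_begin_heap(tupDesc, 1, attNums, sortOperators,
									 sortCollations, nullsFirst,
									 work_mem, NULL, false);

	/* hint: the caller only needs the first "limit" tuples back */
	tuplesort_set_bound(sortstate, limit);

	return sortstate;
}
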
1355 /*
1356  * tuplesort_used_bound
1357  *
1358  * Allow callers to find out if the sort state was able to use a bound.
1359  */
1360 bool
1361 tuplesort_used_bound(Tuplesortstate *state)
1362 {
1363  return state->boundUsed;
1364 }
1365 
1366 /*
1367  * tuplesort_free
1368  *
1369  * Internal routine for freeing resources of tuplesort.
1370  */
1371 static void
1372 tuplesort_free(Tuplesortstate *state)
1373 {
1374  /* context swap probably not needed, but let's be safe */
1375  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1376 
1377 #ifdef TRACE_SORT
1378  long spaceUsed;
1379 
1380  if (state->tapeset)
1381  spaceUsed = LogicalTapeSetBlocks(state->tapeset);
1382  else
1383  spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024;
1384 #endif
1385 
1386  /*
1387  * Delete temporary "tape" files, if any.
1388  *
1389  * Note: want to include this in reported total cost of sort, hence need
1390  * for two #ifdef TRACE_SORT sections.
1391  */
1392  if (state->tapeset)
1393  LogicalTapeSetClose(state->tapeset);
1394 
1395 #ifdef TRACE_SORT
1396  if (trace_sort)
1397  {
1398  if (state->tapeset)
1399  elog(LOG, "%s of worker %d ended, %ld disk blocks used: %s",
1400  SERIAL(state) ? "external sort" : "parallel external sort",
1401  state->worker, spaceUsed, pg_rusage_show(&state->ru_start));
1402  else
1403  elog(LOG, "%s of worker %d ended, %ld KB used: %s",
1404  SERIAL(state) ? "internal sort" : "unperformed parallel sort",
1405  state->worker, spaceUsed, pg_rusage_show(&state->ru_start));
1406  }
1407 
1408  TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed);
1409 #else
1410 
1411  /*
1412  * If you disabled TRACE_SORT, you can still probe sort__done, but you
1413  * ain't getting space-used stats.
1414  */
1415  TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L);
1416 #endif
1417 
1418  /* Free any execution state created for CLUSTER case */
1419  if (state->estate != NULL)
1420  {
1421  ExprContext *econtext = GetPerTupleExprContext(state->estate);
1422 
1423  ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple);
1424  FreeExecutorState(state->estate);
1425  }
1426 
1427  MemoryContextSwitchTo(oldcontext);
1428 
1429  /*
1430  * Free the per-sort memory context, thereby releasing all working memory.
1431  */
1432  MemoryContextReset(state->sortcontext);
1433 }
1434 
1435 /*
1436  * tuplesort_end
1437  *
1438  * Release resources and clean up.
1439  *
1440  * NOTE: after calling this, any pointers returned by tuplesort_getXXX are
1441  * pointing to garbage. Be careful not to attempt to use or free such
1442  * pointers afterwards!
1443  */
1444 void
1445 tuplesort_end(Tuplesortstate *state)
1446 {
1447  tuplesort_free(state);
1448 
1449  /*
1450  * Free the main memory context, including the Tuplesortstate struct
1451  * itself.
1452  */
1453  MemoryContextDelete(state->maincontext);
1454 }
1455 
1456 /*
1457  * tuplesort_updatemax
1458  *
1459  * Update maximum resource usage statistics.
1460  */
1461 static void
1462 tuplesort_updatemax(Tuplesortstate *state)
1463 {
1464  int64 spaceUsed;
1465  bool isSpaceDisk;
1466 
1467  /*
1468  * Note: it might seem we should provide both memory and disk usage for a
1469  * disk-based sort. However, the current code doesn't track memory space
1470  * accurately once we have begun to return tuples to the caller (since we
1471  * don't account for pfree's the caller is expected to do), so we cannot
1472  * rely on availMem in a disk sort. This does not seem worth the overhead
1473  * to fix. Is it worth creating an API for the memory context code to
1474  * tell us how much is actually used in sortcontext?
1475  */
1476  if (state->tapeset)
1477  {
1478  isSpaceDisk = true;
1479  spaceUsed = LogicalTapeSetBlocks(state->tapeset) * BLCKSZ;
1480  }
1481  else
1482  {
1483  isSpaceDisk = false;
1484  spaceUsed = state->allowedMem - state->availMem;
1485  }
1486 
1487  /*
1488  * Sort evicts data to the disk when it wasn't able to fit that data into
1489  * main memory. This is why we assume space used on the disk to be more
1490  * important for tracking resource usage than space used in memory. Note
1491  * that the amount of space occupied by some tupleset on the disk might be
1492  * less than the amount of space occupied by the same tupleset in memory due
1493  * to more compact representation.
1494  */
1495  if ((isSpaceDisk && !state->isMaxSpaceDisk) ||
1496  (isSpaceDisk == state->isMaxSpaceDisk && spaceUsed > state->maxSpace))
1497  {
1498  state->maxSpace = spaceUsed;
1499  state->isMaxSpaceDisk = isSpaceDisk;
1500  state->maxSpaceStatus = state->status;
1501  }
1502 }
1503 
1504 /*
1505  * tuplesort_reset
1506  *
1507  * Reset the tuplesort. Reset all the data in the tuplesort, but leave the
1508  * meta-information in. After tuplesort_reset, tuplesort is ready to start
1509  * a new sort. This allows avoiding recreation of tuple sort states (and
1510  * saving resources) when sorting multiple small batches.
1511  */
1512 void
1513 tuplesort_reset(Tuplesortstate *state)
1514 {
1515  tuplesort_updatemax(state);
1516  tuplesort_free(state);
1517 
1518  /*
1519  * After we've freed up per-batch memory, re-setup all of the state common
1520  * to both the first batch and any subsequent batch.
1521  */
1522  tuplesort_begin_batch(state);
1523 
1524  state->lastReturnedTuple = NULL;
1525  state->slabMemoryBegin = NULL;
1526  state->slabMemoryEnd = NULL;
1527  state->slabFreeHead = NULL;
1528 }
1529 
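/*
 * A hedged sketch (illustration only) of the reuse pattern tuplesort_reset
 * enables: sorting several small batches with a single Tuplesortstate
 * instead of paying the begin/end costs once per batch.
 */
static void
illustrate_batched_sorts(Tuplesortstate *sortstate, int nbatches)
{
	int			i;

	for (i = 0; i < nbatches; i++)
	{
		/* ... tuplesort_putXXX calls for this batch ... */
		tuplesort_performsort(sortstate);
		/* ... tuplesort_getXXX calls until the batch is drained ... */
		tuplesort_reset(sortstate);
	}
	tuplesort_end(sortstate);
}
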
1530 /*
1531  * Grow the memtuples[] array, if possible within our memory constraint. We
1532  * must not exceed INT_MAX tuples in memory or the caller-provided memory
1533  * limit. Return true if we were able to enlarge the array, false if not.
1534  *
1535  * Normally, at each increment we double the size of the array. When doing
1536  * that would exceed a limit, we attempt one last, smaller increase (and then
1537  * clear the growmemtuples flag so we don't try any more). That allows us to
1538  * use memory as fully as permitted; sticking to the pure doubling rule could
1539  * result in almost half going unused. Because availMem moves around with
1540  * tuple addition/removal, we need some rule to prevent making repeated small
1541  * increases in memtupsize, which would just be useless thrashing. The
1542  * growmemtuples flag accomplishes that and also prevents useless
1543  * recalculations in this function.
1544  */
1545 static bool
1546 grow_memtuples(Tuplesortstate *state)
1547 {
1548  int newmemtupsize;
1549  int memtupsize = state->memtupsize;
1550  int64 memNowUsed = state->allowedMem - state->availMem;
1551 
1552  /* Forget it if we've already maxed out memtuples, per comment above */
1553  if (!state->growmemtuples)
1554  return false;
1555 
1556  /* Select new value of memtupsize */
1557  if (memNowUsed <= state->availMem)
1558  {
1559  /*
1560  * We've used no more than half of allowedMem; double our usage,
1561  * clamping at INT_MAX tuples.
1562  */
1563  if (memtupsize < INT_MAX / 2)
1564  newmemtupsize = memtupsize * 2;
1565  else
1566  {
1567  newmemtupsize = INT_MAX;
1568  state->growmemtuples = false;
1569  }
1570  }
1571  else
1572  {
1573  /*
1574  * This will be the last increment of memtupsize. Abandon doubling
1575  * strategy and instead increase as much as we safely can.
1576  *
1577  * To stay within allowedMem, we can't increase memtupsize by more
1578  * than availMem / sizeof(SortTuple) elements. In practice, we want
1579  * to increase it by considerably less, because we need to leave some
1580  * space for the tuples to which the new array slots will refer. We
1581  * assume the new tuples will be about the same size as the tuples
1582  * we've already seen, and thus we can extrapolate from the space
1583  * consumption so far to estimate an appropriate new size for the
1584  * memtuples array. The optimal value might be higher or lower than
1585  * this estimate, but it's hard to know that in advance. We again
1586  * clamp at INT_MAX tuples.
1587  *
1588  * This calculation is safe against enlarging the array so much that
1589  * LACKMEM becomes true, because the memory currently used includes
1590  * the present array; thus, there would be enough allowedMem for the
1591  * new array elements even if no other memory were currently used.
1592  *
1593  * We do the arithmetic in float8, because otherwise the product of
1594  * memtupsize and allowedMem could overflow. Any inaccuracy in the
1595  * result should be insignificant; but even if we computed a
1596  * completely insane result, the checks below will prevent anything
1597  * really bad from happening.
1598  */
1599  double grow_ratio;
1600 
1601  grow_ratio = (double) state->allowedMem / (double) memNowUsed;
1602  if (memtupsize * grow_ratio < INT_MAX)
1603  newmemtupsize = (int) (memtupsize * grow_ratio);
1604  else
1605  newmemtupsize = INT_MAX;
1606 
1607  /* We won't make any further enlargement attempts */
1608  state->growmemtuples = false;
1609  }
1610 
1611  /* Must enlarge array by at least one element, else report failure */
1612  if (newmemtupsize <= memtupsize)
1613  goto noalloc;
1614 
1615  /*
1616  * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize. Clamp
1617  * to ensure our request won't be rejected. Note that we can easily
1618  * exhaust address space before facing this outcome. (This is presently
1619  * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but
1620  * don't rely on that at this distance.)
1621  */
1622  if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple))
1623  {
1624  newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple));
1625  state->growmemtuples = false; /* can't grow any more */
1626  }
1627 
1628  /*
1629  * We need to be sure that we do not cause LACKMEM to become true, else
1630  * the space management algorithm will go nuts. The code above should
1631  * never generate a dangerous request, but to be safe, check explicitly
1632  * that the array growth fits within availMem. (We could still cause
1633  * LACKMEM if the memory chunk overhead associated with the memtuples
1634  * array were to increase. That shouldn't happen because we chose the
1635  * initial array size large enough to ensure that palloc will be treating
1636  * both old and new arrays as separate chunks. But we'll check LACKMEM
1637  * explicitly below just in case.)
1638  */
1639  if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple)))
1640  goto noalloc;
1641 
1642  /* OK, do it */
1643  FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
1644  state->memtupsize = newmemtupsize;
1645  state->memtuples = (SortTuple *)
1646  repalloc_huge(state->memtuples,
1647  state->memtupsize * sizeof(SortTuple));
1648  USEMEM(state, GetMemoryChunkSpace(state->memtuples));
1649  if (LACKMEM(state))
1650  elog(ERROR, "unexpected out-of-memory situation in tuplesort");
1651  return true;
1652 
1653 noalloc:
1654  /* If for any reason we didn't realloc, shut off future attempts */
1655  state->growmemtuples = false;
1656  return false;
1657 }
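Stripped of the overflow and LACKMEM safety checks, the sizing policy above reduces to a small piece of arithmetic. A sketch assuming allowedMem/availMem are tracked as in this file (hypothetical helper, not part of tuplesort.c):

#include <limits.h>
#include <stdint.h>

/* Hypothetical restatement of the growth policy in grow_memtuples(). */
static int
proposed_memtupsize(int memtupsize, int64_t allowedMem, int64_t availMem)
{
    int64_t memNowUsed = allowedMem - availMem;

    if (memNowUsed <= availMem)         /* at most half of allowedMem in use */
        return (memtupsize < INT_MAX / 2) ? memtupsize * 2 : INT_MAX;

    /* final, proportional increase; the caller must still clamp and validate */
    double  grow_ratio = (double) allowedMem / (double) memNowUsed;

    return (memtupsize * grow_ratio < INT_MAX) ?
        (int) (memtupsize * grow_ratio) : INT_MAX;
}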
1658 
1659 /*
1660  * Accept one tuple while collecting input data for sort.
1661  *
1662  * Note that the input data is always copied; the caller need not save it.
1663  */
1664 void
1665 tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot)
1666 {
1667  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1668  SortTuple stup;
1669 
1670  /*
1671  * Copy the given tuple into memory we control, and decrease availMem.
1672  * Then call the common code.
1673  */
1674  COPYTUP(state, &stup, (void *) slot);
1675 
1676  puttuple_common(state, &stup);
1677 
1678  MemoryContextSwitchTo(oldcontext);
1679 }
1680 
1681 /*
1682  * Accept one tuple while collecting input data for sort.
1683  *
1684  * Note that the input data is always copied; the caller need not save it.
1685  */
1686 void
1687 tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
1688 {
1689  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1690  SortTuple stup;
1691 
1692  /*
1693  * Copy the given tuple into memory we control, and decrease availMem.
1694  * Then call the common code.
1695  */
1696  COPYTUP(state, &stup, (void *) tup);
1697 
1698  puttuple_common(state, &stup);
1699 
1700  MemoryContextSwitchTo(oldcontext);
1701 }
1702 
1703 /*
1704  * Collect one index tuple while collecting input data for sort, building
1705  * it from caller-supplied values.
1706  */
1707 void
1708 tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel,
1709  ItemPointer self, Datum *values,
1710  bool *isnull)
1711 {
1712  MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
1713  SortTuple stup;
1714  Datum original;
1715  IndexTuple tuple;
1716 
1717  stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull);
1718  tuple = ((IndexTuple) stup.tuple);
1719  tuple->t_tid = *self;
1720  USEMEM(state, GetMemoryChunkSpace(stup.tuple));
1721  /* set up first-column key value */
1722  original = index_getattr(tuple,
1723  1,
1724  RelationGetDescr(state->indexRel),
1725  &stup.isnull1);
1726 
1727  MemoryContextSwitchTo(state->sortcontext);
1728 
1729  if (!state->sortKeys || !state->sortKeys->abbrev_converter || stup.isnull1)
1730  {
1731  /*
1732  * Store the ordinary Datum representation, or the NULL value. If there
1733  * is a converter it won't expect NULL values, and the cost model is not
1734  * required to account for NULLs, so in that case we avoid calling the
1735  * converter and just set datum1 to a zeroed representation (to be
1736  * consistent, and to support cheap inequality tests for NULL
1737  * abbreviated keys).
1738  */
1739  stup.datum1 = original;
1740  }
1741  else if (!consider_abort_common(state))
1742  {
1743  /* Store abbreviated key representation */
1744  stup.datum1 = state->sortKeys->abbrev_converter(original,
1745  state->sortKeys);
1746  }
1747  else
1748  {
1749  /* Abort abbreviation */
1750  int i;
1751 
1752  stup.datum1 = original;
1753 
1754  /*
1755  * Set state to be consistent with never trying abbreviation.
1756  *
1757  * Alter datum1 representation in already-copied tuples, so as to
1758  * ensure a consistent representation (current tuple was just
1759  * handled). It does not matter if some dumped tuples are already
1760  * sorted on tape, since serialized tuples lack abbreviated keys
1761  * (TSS_BUILDRUNS state prevents control reaching here in any case).
1762  */
1763  for (i = 0; i < state->memtupcount; i++)
1764  {
1765  SortTuple *mtup = &state->memtuples[i];
1766 
1767  tuple = mtup->tuple;
1768  mtup->datum1 = index_getattr(tuple,
1769  1,
1770  RelationGetDescr(state->indexRel),
1771  &mtup->isnull1);
1772  }
1773  }
1774 
1775  puttuple_common(state, &stup);
1776 
1777  MemoryContextSwitchTo(oldcontext);
1778 }
1779 
1780 /*
1781  * Accept one Datum while collecting input data for sort.
1782  *
1783  * If the Datum is pass-by-ref type, the value will be copied.
1784  */
1785 void
1786 tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull)
1787 {
1788  MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
1789  SortTuple stup;
1790 
1791  /*
1792  * Pass-by-value types or null values are just stored directly in
1793  * stup.datum1 (and stup.tuple is not used and set to NULL).
1794  *
1795  * Non-null pass-by-reference values need to be copied into memory we
1796  * control, and possibly abbreviated. The copied value is pointed to by
1797  * stup.tuple and is treated as the canonical copy (e.g. to return via
1798  * tuplesort_getdatum or when writing to tape); stup.datum1 gets the
1799  * abbreviated value if abbreviation is happening, otherwise it's
1800  * identical to stup.tuple.
1801  */
1802 
1803  if (isNull || !state->tuples)
1804  {
1805  /*
1806  * Set datum1 to zeroed representation for NULLs (to be consistent,
1807  * and to support cheap inequality tests for NULL abbreviated keys).
1808  */
1809  stup.datum1 = !isNull ? val : (Datum) 0;
1810  stup.isnull1 = isNull;
1811  stup.tuple = NULL; /* no separate storage */
1812  MemoryContextSwitchTo(state->sortcontext);
1813  }
1814  else
1815  {
1816  Datum original = datumCopy(val, false, state->datumTypeLen);
1817 
1818  stup.isnull1 = false;
1819  stup.tuple = DatumGetPointer(original);
1820  USEMEM(state, GetMemoryChunkSpace(stup.tuple));
1821  MemoryContextSwitchTo(state->sortcontext);
1822 
1823  if (!state->sortKeys->abbrev_converter)
1824  {
1825  stup.datum1 = original;
1826  }
1827  else if (!consider_abort_common(state))
1828  {
1829  /* Store abbreviated key representation */
1830  stup.datum1 = state->sortKeys->abbrev_converter(original,
1831  state->sortKeys);
1832  }
1833  else
1834  {
1835  /* Abort abbreviation */
1836  int i;
1837 
1838  stup.datum1 = original;
1839 
1840  /*
1841  * Set state to be consistent with never trying abbreviation.
1842  *
1843  * Alter datum1 representation in already-copied tuples, so as to
1844  * ensure a consistent representation (current tuple was just
1845  * handled). It does not matter if some dumped tuples are already
1846  * sorted on tape, since serialized tuples lack abbreviated keys
1847  * (TSS_BUILDRUNS state prevents control reaching here in any
1848  * case).
1849  */
1850  for (i = 0; i < state->memtupcount; i++)
1851  {
1852  SortTuple *mtup = &state->memtuples[i];
1853 
1854  mtup->datum1 = PointerGetDatum(mtup->tuple);
1855  }
1856  }
1857  }
1858 
1859  puttuple_common(state, &stup);
1860 
1861  MemoryContextSwitchTo(oldcontext);
1862 }
1863 
1864 /*
1865  * Shared code for tuple and datum cases.
1866  */
1867 static void
1868 puttuple_common(Tuplesortstate *state, SortTuple *tuple)
1869 {
1870  Assert(!LEADER(state));
1871 
1872  switch (state->status)
1873  {
1874  case TSS_INITIAL:
1875 
1876  /*
1877  * Save the tuple into the unsorted array. First, grow the array
1878  * as needed. Note that we try to grow the array when there is
1879  * still one free slot remaining --- if we fail, there'll still be
1880  * room to store the incoming tuple, and then we'll switch to
1881  * tape-based operation.
1882  */
1883  if (state->memtupcount >= state->memtupsize - 1)
1884  {
1885  (void) grow_memtuples(state);
1886  Assert(state->memtupcount < state->memtupsize);
1887  }
1888  state->memtuples[state->memtupcount++] = *tuple;
1889 
1890  /*
1891  * Check if it's time to switch over to a bounded heapsort. We do
1892  * so if the input tuple count exceeds twice the desired tuple
1893  * count (this is a heuristic for where heapsort becomes cheaper
1894  * than a quicksort), or if we've just filled workMem and have
1895  * enough tuples to meet the bound.
1896  *
1897  * Note that once we enter TSS_BOUNDED state we will always try to
1898  * complete the sort that way. In the worst case, if later input
1899  * tuples are larger than earlier ones, this might cause us to
1900  * exceed workMem significantly.
1901  */
1902  if (state->bounded &&
1903  (state->memtupcount > state->bound * 2 ||
1904  (state->memtupcount > state->bound && LACKMEM(state))))
1905  {
1906 #ifdef TRACE_SORT
1907  if (trace_sort)
1908  elog(LOG, "switching to bounded heapsort at %d tuples: %s",
1909  state->memtupcount,
1910  pg_rusage_show(&state->ru_start));
1911 #endif
1912  make_bounded_heap(state);
1913  return;
1914  }
1915 
1916  /*
1917  * Done if we still fit in available memory and have array slots.
1918  */
1919  if (state->memtupcount < state->memtupsize && !LACKMEM(state))
1920  return;
1921 
1922  /*
1923  * Nope; time to switch to tape-based operation.
1924  */
1925  inittapes(state, true);
1926 
1927  /*
1928  * Dump all tuples.
1929  */
1930  dumptuples(state, false);
1931  break;
1932 
1933  case TSS_BOUNDED:
1934 
1935  /*
1936  * We don't want to grow the array here, so check whether the new
1937  * tuple can be discarded before putting it in. This should be a
1938  * good speed optimization, too, since when there are many more
1939  * input tuples than the bound, most input tuples can be discarded
1940  * with just this one comparison. Note that because we currently
1941  * have the sort direction reversed, we must check for <= not >=.
1942  */
1943  if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0)
1944  {
1945  /* new tuple <= top of the heap, so we can discard it */
1946  free_sort_tuple(state, tuple);
1947  CHECK_FOR_INTERRUPTS();
1948  }
1949  else
1950  {
1951  /* discard top of heap, replacing it with the new tuple */
1952  free_sort_tuple(state, &state->memtuples[0]);
1953  tuplesort_heap_replace_top(state, tuple);
1954  }
1955  break;
1956 
1957  case TSS_BUILDRUNS:
1958 
1959  /*
1960  * Save the tuple into the unsorted array (there must be space)
1961  */
1962  state->memtuples[state->memtupcount++] = *tuple;
1963 
1964  /*
1965  * If we are over the memory limit, dump all tuples.
1966  */
1967  dumptuples(state, false);
1968  break;
1969 
1970  default:
1971  elog(ERROR, "invalid tuplesort state");
1972  break;
1973  }
1974 }
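The TSS_INITIAL branch above decides when a LIMIT-style (bounded) sort should abandon the plain array and switch to a bounded heap. The heuristic in isolation, as a sketch (hypothetical helper, not part of tuplesort.c):

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical restatement of the switch-to-bounded-heap test in puttuple_common(). */
static bool
should_switch_to_bounded_heap(bool bounded, int memtupcount,
                              int64_t bound, bool lackMem)
{
    /*
     * Switch once the input has exceeded twice the requested bound, or once
     * memory is tight and we already hold enough tuples to satisfy the bound.
     */
    return bounded &&
           (memtupcount > bound * 2 ||
            (memtupcount > bound && lackMem));
}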
1975 
1976 static bool
1977 consider_abort_common(Tuplesortstate *state)
1978 {
1979  Assert(state->sortKeys[0].abbrev_converter != NULL);
1980  Assert(state->sortKeys[0].abbrev_abort != NULL);
1981  Assert(state->sortKeys[0].abbrev_full_comparator != NULL);
1982 
1983  /*
1984  * Check effectiveness of abbreviation optimization. Consider aborting
1985  * when still within memory limit.
1986  */
1987  if (state->status == TSS_INITIAL &&
1988  state->memtupcount >= state->abbrevNext)
1989  {
1990  state->abbrevNext *= 2;
1991 
1992  /*
1993  * Check opclass-supplied abbreviation abort routine. It may indicate
1994  * that abbreviation should not proceed.
1995  */
1996  if (!state->sortKeys->abbrev_abort(state->memtupcount,
1997  state->sortKeys))
1998  return false;
1999 
2000  /*
2001  * Finally, restore authoritative comparator, and indicate that
2002  * abbreviation is not in play by setting abbrev_converter to NULL
2003  */
2004  state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator;
2005  state->sortKeys[0].abbrev_converter = NULL;
2006  /* Not strictly necessary, but be tidy */
2007  state->sortKeys[0].abbrev_abort = NULL;
2008  state->sortKeys[0].abbrev_full_comparator = NULL;
2009 
2010  /* Give up - expect original pass-by-value representation */
2011  return true;
2012  }
2013 
2014  return false;
2015 }
2016 
2017 /*
2018  * All tuples have been provided; finish the sort.
2019  */
2020 void
2021 tuplesort_performsort(Tuplesortstate *state)
2022 {
2023  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2024 
2025 #ifdef TRACE_SORT
2026  if (trace_sort)
2027  elog(LOG, "performsort of worker %d starting: %s",
2028  state->worker, pg_rusage_show(&state->ru_start));
2029 #endif
2030 
2031  switch (state->status)
2032  {
2033  case TSS_INITIAL:
2034 
2035  /*
2036  * We were able to accumulate all the tuples within the allowed
2037  * amount of memory, or we are the leader, about to take over the worker tapes
2038  */
2039  if (SERIAL(state))
2040  {
2041  /* Just qsort 'em and we're done */
2042  tuplesort_sort_memtuples(state);
2043  state->status = TSS_SORTEDINMEM;
2044  }
2045  else if (WORKER(state))
2046  {
2047  /*
2048  * Parallel workers must still dump out tuples to tape. No
2049  * merge is required to produce single output run, though.
2050  */
2051  inittapes(state, false);
2052  dumptuples(state, true);
2053  worker_nomergeruns(state);
2054  state->status = TSS_SORTEDONTAPE;
2055  }
2056  else
2057  {
2058  /*
2059  * Leader will take over worker tapes and merge worker runs.
2060  * Note that mergeruns sets the correct state->status.
2061  */
2062  leader_takeover_tapes(state);
2063  mergeruns(state);
2064  }
2065  state->current = 0;
2066  state->eof_reached = false;
2067  state->markpos_block = 0L;
2068  state->markpos_offset = 0;
2069  state->markpos_eof = false;
2070  break;
2071 
2072  case TSS_BOUNDED:
2073 
2074  /*
2075  * We were able to accumulate all the tuples required for output
2076  * in memory, using a heap to eliminate excess tuples. Now we
2077  * have to transform the heap to a properly-sorted array.
2078  */
2079  sort_bounded_heap(state);
2080  state->current = 0;
2081  state->eof_reached = false;
2082  state->markpos_offset = 0;
2083  state->markpos_eof = false;
2084  state->status = TSS_SORTEDINMEM;
2085  break;
2086 
2087  case TSS_BUILDRUNS:
2088 
2089  /*
2090  * Finish tape-based sort. First, flush all tuples remaining in
2091  * memory out to tape; then merge until we have a single remaining
2092  * run (or, if !randomAccess and !WORKER(), one run per tape).
2093  * Note that mergeruns sets the correct state->status.
2094  */
2095  dumptuples(state, true);
2096  mergeruns(state);
2097  state->eof_reached = false;
2098  state->markpos_block = 0L;
2099  state->markpos_offset = 0;
2100  state->markpos_eof = false;
2101  break;
2102 
2103  default:
2104  elog(ERROR, "invalid tuplesort state");
2105  break;
2106  }
2107 
2108 #ifdef TRACE_SORT
2109  if (trace_sort)
2110  {
2111  if (state->status == TSS_FINALMERGE)
2112  elog(LOG, "performsort of worker %d done (except %d-way final merge): %s",
2113  state->worker, state->activeTapes,
2114  pg_rusage_show(&state->ru_start));
2115  else
2116  elog(LOG, "performsort of worker %d done: %s",
2117  state->worker, pg_rusage_show(&state->ru_start));
2118  }
2119 #endif
2120 
2121  MemoryContextSwitchTo(oldcontext);
2122 }
2123 
2124 /*
2125  * Internal routine to fetch the next tuple in either forward or back
2126  * direction into *stup. Returns false if no more tuples.
2127  * Returned tuple belongs to tuplesort memory context, and must not be freed
2128  * by caller. Note that fetched tuple is stored in memory that may be
2129  * recycled by any future fetch.
2130  */
2131 static bool
2132 tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
2133  SortTuple *stup)
2134 {
2135  unsigned int tuplen;
2136  size_t nmoved;
2137 
2138  Assert(!WORKER(state));
2139 
2140  switch (state->status)
2141  {
2142  case TSS_SORTEDINMEM:
2143  Assert(forward || state->randomAccess);
2144  Assert(!state->slabAllocatorUsed);
2145  if (forward)
2146  {
2147  if (state->current < state->memtupcount)
2148  {
2149  *stup = state->memtuples[state->current++];
2150  return true;
2151  }
2152  state->eof_reached = true;
2153 
2154  /*
2155  * Complain if caller tries to retrieve more tuples than
2156  * originally asked for in a bounded sort. This is because
2157  * returning EOF here might be the wrong thing.
2158  */
2159  if (state->bounded && state->current >= state->bound)
2160  elog(ERROR, "retrieved too many tuples in a bounded sort");
2161 
2162  return false;
2163  }
2164  else
2165  {
2166  if (state->current <= 0)
2167  return false;
2168 
2169  /*
2170  * If all tuples have already been fetched, we return the last
2171  * tuple; otherwise, the tuple before the last one returned.
2172  */
2173  if (state->eof_reached)
2174  state->eof_reached = false;
2175  else
2176  {
2177  state->current--; /* last returned tuple */
2178  if (state->current <= 0)
2179  return false;
2180  }
2181  *stup = state->memtuples[state->current - 1];
2182  return true;
2183  }
2184  break;
2185 
2186  case TSS_SORTEDONTAPE:
2187  Assert(forward || state->randomAccess);
2188  Assert(state->slabAllocatorUsed);
2189 
2190  /*
2191  * The slot that held the tuple that we returned in previous
2192  * gettuple call can now be reused.
2193  */
2194  if (state->lastReturnedTuple)
2195  {
2196  RELEASE_SLAB_SLOT(state, state->lastReturnedTuple);
2197  state->lastReturnedTuple = NULL;
2198  }
2199 
2200  if (forward)
2201  {
2202  if (state->eof_reached)
2203  return false;
2204 
2205  if ((tuplen = getlen(state, state->result_tape, true)) != 0)
2206  {
2207  READTUP(state, stup, state->result_tape, tuplen);
2208 
2209  /*
2210  * Remember the tuple we return, so that we can recycle
2211  * its memory on next call. (This can be NULL, in the
2212  * !state->tuples case).
2213  */
2214  state->lastReturnedTuple = stup->tuple;
2215 
2216  return true;
2217  }
2218  else
2219  {
2220  state->eof_reached = true;
2221  return false;
2222  }
2223  }
2224 
2225  /*
2226  * Backward.
2227  *
2228  * If all tuples have already been fetched, we return the last tuple;
2229  * otherwise, the tuple before the last one returned.
2230  */
2231  if (state->eof_reached)
2232  {
2233  /*
2234  * Seek position is pointing just past the zero tuplen at the
2235  * end of file; back up to fetch last tuple's ending length
2236  * word. If seek fails we must have a completely empty file.
2237  */
2238  nmoved = LogicalTapeBackspace(state->tapeset,
2239  state->result_tape,
2240  2 * sizeof(unsigned int));
2241  if (nmoved == 0)
2242  return false;
2243  else if (nmoved != 2 * sizeof(unsigned int))
2244  elog(ERROR, "unexpected tape position");
2245  state->eof_reached = false;
2246  }
2247  else
2248  {
2249  /*
2250  * Back up and fetch previously-returned tuple's ending length
2251  * word. If seek fails, assume we are at start of file.
2252  */
2253  nmoved = LogicalTapeBackspace(state->tapeset,
2254  state->result_tape,
2255  sizeof(unsigned int));
2256  if (nmoved == 0)
2257  return false;
2258  else if (nmoved != sizeof(unsigned int))
2259  elog(ERROR, "unexpected tape position");
2260  tuplen = getlen(state, state->result_tape, false);
2261 
2262  /*
2263  * Back up to get ending length word of tuple before it.
2264  */
2265  nmoved = LogicalTapeBackspace(state->tapeset,
2266  state->result_tape,
2267  tuplen + 2 * sizeof(unsigned int));
2268  if (nmoved == tuplen + sizeof(unsigned int))
2269  {
2270  /*
2271  * We backed up over the previous tuple, but there was no
2272  * ending length word before it. That means that the prev
2273  * tuple is the first tuple in the file. It is now the
2274  * next to read in forward direction (not obviously right,
2275  * but that is what the in-memory case does).
2276  */
2277  return false;
2278  }
2279  else if (nmoved != tuplen + 2 * sizeof(unsigned int))
2280  elog(ERROR, "bogus tuple length in backward scan");
2281  }
2282 
2283  tuplen = getlen(state, state->result_tape, false);
2284 
2285  /*
2286  * Now we have the length of the prior tuple, back up and read it.
2287  * Note: READTUP expects we are positioned after the initial
2288  * length word of the tuple, so back up to that point.
2289  */
2290  nmoved = LogicalTapeBackspace(state->tapeset,
2291  state->result_tape,
2292  tuplen);
2293  if (nmoved != tuplen)
2294  elog(ERROR, "bogus tuple length in backward scan");
2295  READTUP(state, stup, state->result_tape, tuplen);
2296 
2297  /*
2298  * Remember the tuple we return, so that we can recycle its memory
2299  * on next call. (This can be NULL, in the Datum case).
2300  */
2301  state->lastReturnedTuple = stup->tuple;
2302 
2303  return true;
2304 
2305  case TSS_FINALMERGE:
2306  Assert(forward);
2307  /* We are managing memory ourselves, with the slab allocator. */
2308  Assert(state->slabAllocatorUsed);
2309 
2310  /*
2311  * The slab slot holding the tuple that we returned in previous
2312  * gettuple call can now be reused.
2313  */
2314  if (state->lastReturnedTuple)
2315  {
2316  RELEASE_SLAB_SLOT(state, state->lastReturnedTuple);
2317  state->lastReturnedTuple = NULL;
2318  }
2319 
2320  /*
2321  * This code should match the inner loop of mergeonerun().
2322  */
2323  if (state->memtupcount > 0)
2324  {
2325  int srcTape = state->memtuples[0].srctape;
2326  SortTuple newtup;
2327 
2328  *stup = state->memtuples[0];
2329 
2330  /*
2331  * Remember the tuple we return, so that we can recycle its
2332  * memory on next call. (This can be NULL, in the Datum case).
2333  */
2334  state->lastReturnedTuple = stup->tuple;
2335 
2336  /*
2337  * Pull next tuple from tape, and replace the returned tuple
2338  * at top of the heap with it.
2339  */
2340  if (!mergereadnext(state, srcTape, &newtup))
2341  {
2342  /*
2343  * If no more data, we've reached end of run on this tape.
2344  * Remove the top node from the heap.
2345  */
2346  tuplesort_heap_delete_top(state);
2347 
2348  /*
2349  * Rewind to free the read buffer. It'd go away at the
2350  * end of the sort anyway, but better to release the
2351  * memory early.
2352  */
2353  LogicalTapeRewindForWrite(state->tapeset, srcTape);
2354  return true;
2355  }
2356  newtup.srctape = srcTape;
2357  tuplesort_heap_replace_top(state, &newtup);
2358  return true;
2359  }
2360  return false;
2361 
2362  default:
2363  elog(ERROR, "invalid tuplesort state");
2364  return false; /* keep compiler quiet */
2365  }
2366 }
2367 
2368 /*
2369  * Fetch the next tuple in either forward or back direction.
2370  * If successful, put tuple in slot and return true; else, clear the slot
2371  * and return false.
2372  *
2373  * Caller may optionally be passed back abbreviated value (on true return
2374  * value) when abbreviation was used, which can be used to cheaply avoid
2375  * equality checks that might otherwise be required. Caller can safely make a
2376  * determination of "non-equal tuple" based on simple binary inequality. A
2377  * NULL value in leading attribute will set abbreviated value to zeroed
2378  * representation, which caller may rely on in abbreviated inequality check.
2379  *
2380  * If copy is true, the slot receives a tuple that's been copied into the
2381  * caller's memory context, so that it will stay valid regardless of future
2382  * manipulations of the tuplesort's state (up to and including deleting the
2383  * tuplesort). If copy is false, the slot will just receive a pointer to a
2384  * tuple held within the tuplesort, which is more efficient, but only safe for
2385  * callers that are prepared to have any subsequent manipulation of the
2386  * tuplesort's state invalidate slot contents.
2387  */
2388 bool
2389 tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy,
2390  TupleTableSlot *slot, Datum *abbrev)
2391 {
2392  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2393  SortTuple stup;
2394 
2395  if (!tuplesort_gettuple_common(state, forward, &stup))
2396  stup.tuple = NULL;
2397 
2398  MemoryContextSwitchTo(oldcontext);
2399 
2400  if (stup.tuple)
2401  {
2402  /* Record abbreviated key for caller */
2403  if (state->sortKeys->abbrev_converter && abbrev)
2404  *abbrev = stup.datum1;
2405 
2406  if (copy)
2407  stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple);
2408 
2409  ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, copy);
2410  return true;
2411  }
2412  else
2413  {
2414  ExecClearTuple(slot);
2415  return false;
2416  }
2417 }
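For orientation, here is how a caller typically drives a serial heap-tuple sort through this API. This is only a sketch: fetch_next_input_slot() and process_sorted_slot() are hypothetical stand-ins, and the tuple descriptor, slots, and sort-key arrays are assumed to be set up elsewhere by the caller.

/* Hypothetical driver; tupDesc, slots and sort-key arrays come from the caller. */
static void
sort_input_with_tuplesort(TupleDesc tupDesc, int nkeys,
                          AttrNumber *attNums, Oid *sortOperators,
                          Oid *sortCollations, bool *nullsFirstFlags,
                          TupleTableSlot *inslot, TupleTableSlot *outslot)
{
    Tuplesortstate *sortstate;

    sortstate = tuplesort_begin_heap(tupDesc, nkeys, attNums,
                                     sortOperators, sortCollations,
                                     nullsFirstFlags, work_mem,
                                     NULL /* no parallel coordinator */,
                                     false /* randomAccess */);

    while (fetch_next_input_slot(inslot))   /* hypothetical input loop */
        tuplesort_puttupleslot(sortstate, inslot);

    tuplesort_performsort(sortstate);

    while (tuplesort_gettupleslot(sortstate, true /* forward */,
                                  false /* copy */, outslot,
                                  NULL /* abbrev not needed */))
        process_sorted_slot(outslot);       /* hypothetical consumer */

    tuplesort_end(sortstate);
}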
2418 
2419 /*
2420  * Fetch the next tuple in either forward or back direction.
2421  * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory
2422  * context, and must not be freed by caller. Caller may not rely on tuple
2423  * remaining valid after any further manipulation of tuplesort.
2424  */
2425 HeapTuple
2426 tuplesort_getheaptuple(Tuplesortstate *state, bool forward)
2427 {
2428  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2429  SortTuple stup;
2430 
2431  if (!tuplesort_gettuple_common(state, forward, &stup))
2432  stup.tuple = NULL;
2433 
2434  MemoryContextSwitchTo(oldcontext);
2435 
2436  return stup.tuple;
2437 }
2438 
2439 /*
2440  * Fetch the next index tuple in either forward or back direction.
2441  * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory
2442  * context, and must not be freed by caller. Caller may not rely on tuple
2443  * remaining valid after any further manipulation of tuplesort.
2444  */
2445 IndexTuple
2446 tuplesort_getindextuple(Tuplesortstate *state, bool forward)
2447 {
2448  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2449  SortTuple stup;
2450 
2451  if (!tuplesort_gettuple_common(state, forward, &stup))
2452  stup.tuple = NULL;
2453 
2454  MemoryContextSwitchTo(oldcontext);
2455 
2456  return (IndexTuple) stup.tuple;
2457 }
2458 
2459 /*
2460  * Fetch the next Datum in either forward or back direction.
2461  * Returns false if no more datums.
2462  *
2463  * If the Datum is pass-by-ref type, the returned value is freshly palloc'd
2464  * in caller's context, and is now owned by the caller (this differs from
2465  * similar routines for other types of tuplesorts).
2466  *
2467  * Caller may optionally be passed back abbreviated value (on true return
2468  * value) when abbreviation was used, which can be used to cheaply avoid
2469  * equality checks that might otherwise be required. Caller can safely make a
2470  * determination of "non-equal tuple" based on simple binary inequality. A
2471  * NULL value will have a zeroed abbreviated value representation, which caller
2472  * may rely on in abbreviated inequality check.
2473  */
2474 bool
2475 tuplesort_getdatum(Tuplesortstate *state, bool forward,
2476  Datum *val, bool *isNull, Datum *abbrev)
2477 {
2478  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2479  SortTuple stup;
2480 
2481  if (!tuplesort_gettuple_common(state, forward, &stup))
2482  {
2483  MemoryContextSwitchTo(oldcontext);
2484  return false;
2485  }
2486 
2487  /* Ensure we copy into caller's memory context */
2488  MemoryContextSwitchTo(oldcontext);
2489 
2490  /* Record abbreviated key for caller */
2491  if (state->sortKeys->abbrev_converter && abbrev)
2492  *abbrev = stup.datum1;
2493 
2494  if (stup.isnull1 || !state->tuples)
2495  {
2496  *val = stup.datum1;
2497  *isNull = stup.isnull1;
2498  }
2499  else
2500  {
2501  /* use stup.tuple because stup.datum1 may be an abbreviation */
2502  *val = datumCopy(PointerGetDatum(stup.tuple), false, state->datumTypeLen);
2503  *isNull = false;
2504  }
2505 
2506  return true;
2507 }
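The Datum flavour follows the same begin/put/performsort/get/end shape. A sketch sorting bare int4 Datums; INT4OID and Int4LessOperator are used here only as an example type and ordering operator, and use_value() is a hypothetical consumer.

/* Hypothetical driver sorting an array of int32 values as bare Datums. */
static void
sort_int32_datums(int32 *values, int nvalues)
{
    Tuplesortstate *sortstate;
    Datum           val;
    bool            isnull;

    sortstate = tuplesort_begin_datum(INT4OID, Int4LessOperator, InvalidOid,
                                      false /* nulls last */, work_mem,
                                      NULL /* no parallel coordinator */,
                                      false /* randomAccess */);

    for (int i = 0; i < nvalues; i++)
        tuplesort_putdatum(sortstate, Int32GetDatum(values[i]), false);

    tuplesort_performsort(sortstate);

    while (tuplesort_getdatum(sortstate, true /* forward */, &val, &isnull, NULL))
        use_value(DatumGetInt32(val));      /* hypothetical consumer */

    tuplesort_end(sortstate);
}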
2508 
2509 /*
2510  * Advance over N tuples in either forward or back direction,
2511  * without returning any data. N==0 is a no-op.
2512  * Returns true if successful, false if ran out of tuples.
2513  */
2514 bool
2515 tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward)
2516 {
2517  MemoryContext oldcontext;
2518 
2519  /*
2520  * We don't actually support backwards skip yet, because no callers need
2521  * it. The API is designed to allow for that later, though.
2522  */
2523  Assert(forward);
2524  Assert(ntuples >= 0);
2525  Assert(!WORKER(state));
2526 
2527  switch (state->status)
2528  {
2529  case TSS_SORTEDINMEM:
2530  if (state->memtupcount - state->current >= ntuples)
2531  {
2532  state->current += ntuples;
2533  return true;
2534  }
2535  state->current = state->memtupcount;
2536  state->eof_reached = true;
2537 
2538  /*
2539  * Complain if caller tries to retrieve more tuples than
2540  * originally asked for in a bounded sort. This is because
2541  * returning EOF here might be the wrong thing.
2542  */
2543  if (state->bounded && state->current >= state->bound)
2544  elog(ERROR, "retrieved too many tuples in a bounded sort");
2545 
2546  return false;
2547 
2548  case TSS_SORTEDONTAPE:
2549  case TSS_FINALMERGE:
2550 
2551  /*
2552  * We could probably optimize these cases better, but for now it's
2553  * not worth the trouble.
2554  */
2555  oldcontext = MemoryContextSwitchTo(state->sortcontext);
2556  while (ntuples-- > 0)
2557  {
2558  SortTuple stup;
2559 
2560  if (!tuplesort_gettuple_common(state, forward, &stup))
2561  {
2562  MemoryContextSwitchTo(oldcontext);
2563  return false;
2564  }
2565  CHECK_FOR_INTERRUPTS();
2566  }
2567  MemoryContextSwitchTo(oldcontext);
2568  return true;
2569 
2570  default:
2571  elog(ERROR, "invalid tuplesort state");
2572  return false; /* keep compiler quiet */
2573  }
2574 }
2575 
2576 /*
2577  * tuplesort_merge_order - report merge order we'll use for given memory
2578  * (note: "merge order" just means the number of input tapes in the merge).
2579  *
2580  * This is exported for use by the planner. allowedMem is in bytes.
2581  */
2582 int
2583 tuplesort_merge_order(int64 allowedMem)
2584 {
2585  int mOrder;
2586 
2587  /*
2588  * We need one tape for each merge input, plus another one for the output,
2589  * and each of these tapes needs buffer space. In addition we want
2590  * MERGE_BUFFER_SIZE workspace per input tape (but the output tape doesn't
2591  * count).
2592  *
2593  * Note: you might be thinking we need to account for the memtuples[]
2594  * array in this calculation, but we effectively treat that as part of the
2595  * MERGE_BUFFER_SIZE workspace.
2596  */
2597  mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) /
2598  (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD);
2599 
2600  /*
2601  * Even in minimum memory, use at least a MINORDER merge. On the other
2602  * hand, even when we have lots of memory, do not use more than a MAXORDER
2603  * merge. Tapes are pretty cheap, but they're not entirely free. Each
2604  * additional tape reduces the amount of memory available to build runs,
2605  * which in turn can cause the same sort to need more runs, which makes
2606  * merging slower even if it can still be done in a single pass. Also,
2607  * high order merges are quite slow due to CPU cache effects; it can be
2608  * faster to pay the I/O cost of a polyphase merge than to perform a
2609  * single merge pass across many hundreds of tapes.
2610  */
2611  mOrder = Max(mOrder, MINORDER);
2612  mOrder = Min(mOrder, MAXORDER);
2613 
2614  return mOrder;
2615 }
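Spelled out with the clamps, the computation above is just the following. The real macro values of TAPE_BUFFER_OVERHEAD, MERGE_BUFFER_SIZE, MINORDER and MAXORDER are defined earlier in this file, so this standalone sketch takes them as parameters (hypothetical helper, not part of tuplesort.c):

#include <stdint.h>

/* Hypothetical restatement of tuplesort_merge_order() with explicit clamps. */
static int
merge_order_sketch(int64_t allowedMem,
                   int64_t tapeBufferOverhead, int64_t mergeBufferSize,
                   int minOrder, int maxOrder)
{
    /* one output buffer, then one buffer plus one merge workspace per input tape */
    int64_t mOrder = (allowedMem - tapeBufferOverhead) /
        (mergeBufferSize + tapeBufferOverhead);

    if (mOrder < minOrder)
        mOrder = minOrder;
    if (mOrder > maxOrder)
        mOrder = maxOrder;
    return (int) mOrder;
}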
2616 
2617 /*
2618  * inittapes - initialize for tape sorting.
2619  *
2620  * This is called only if we have found we won't sort in memory.
2621  */
2622 static void
2623 inittapes(Tuplesortstate *state, bool mergeruns)
2624 {
2625  int maxTapes,
2626  j;
2627 
2628  Assert(!LEADER(state));
2629 
2630  if (mergeruns)
2631  {
2632  /* Compute number of tapes to use: merge order plus 1 */
2633  maxTapes = tuplesort_merge_order(state->allowedMem) + 1;
2634  }
2635  else
2636  {
2637  /* Workers can sometimes produce a single run and output it without merging */
2638  Assert(WORKER(state));
2639  maxTapes = MINORDER + 1;
2640  }
2641 
2642 #ifdef TRACE_SORT
2643  if (trace_sort)
2644  elog(LOG, "worker %d switching to external sort with %d tapes: %s",
2645  state->worker, maxTapes, pg_rusage_show(&state->ru_start));
2646 #endif
2647 
2648  /* Create the tape set and allocate the per-tape data arrays */
2649  inittapestate(state, maxTapes);
2650  state->tapeset =
2651  LogicalTapeSetCreate(maxTapes, false, NULL,
2652  state->shared ? &state->shared->fileset : NULL,
2653  state->worker);
2654 
2655  state->currentRun = 0;
2656 
2657  /*
2658  * Initialize variables of Algorithm D (step D1).
2659  */
2660  for (j = 0; j < maxTapes; j++)
2661  {
2662  state->tp_fib[j] = 1;
2663  state->tp_runs[j] = 0;
2664  state->tp_dummy[j] = 1;
2665  state->tp_tapenum[j] = j;
2666  }
2667  state->tp_fib[state->tapeRange] = 0;
2668  state->tp_dummy[state->tapeRange] = 0;
2669 
2670  state->Level = 1;
2671  state->destTape = 0;
2672 
2673  state->status = TSS_BUILDRUNS;
2674 }
2675 
2676 /*
2677  * inittapestate - initialize generic tape management state
2678  */
2679 static void
2680 inittapestate(Tuplesortstate *state, int maxTapes)
2681 {
2682  int64 tapeSpace;
2683 
2684  /*
2685  * Decrease availMem to reflect the space needed for tape buffers; but
2686  * don't decrease it to the point that we have no room for tuples. (That
2687  * case is only likely to occur if sorting pass-by-value Datums; in all
2688  * other scenarios the memtuples[] array is unlikely to occupy more than
2689  * half of allowedMem. In the pass-by-value case it's not important to
2690  * account for tuple space, so we don't care if LACKMEM becomes
2691  * inaccurate.)
2692  */
2693  tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD;
2694 
2695  if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem)
2696  USEMEM(state, tapeSpace);
2697 
2698  /*
2699  * Make sure that the temp file(s) underlying the tape set are created in
2700  * suitable temp tablespaces. For parallel sorts, this should have been
2701  * called already, but it doesn't matter if it is called a second time.
2702  */
2703  PrepareTempTablespaces();
2704 
2705  state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool));
2706  state->tp_fib = (int *) palloc0(maxTapes * sizeof(int));
2707  state->tp_runs = (int *) palloc0(maxTapes * sizeof(int));
2708  state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int));
2709  state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int));
2710 
2711  /* Record # of tapes allocated (for duration of sort) */
2712  state->maxTapes = maxTapes;
2713  /* Record maximum # of tapes usable as inputs when merging */
2714  state->tapeRange = maxTapes - 1;
2715 }
2716 
2717 /*
2718  * selectnewtape -- select new tape for new initial run.
2719  *
2720  * This is called after finishing a run when we know another run
2721  * must be started. This implements steps D3, D4 of Algorithm D.
2722  */
2723 static void
2724 selectnewtape(Tuplesortstate *state)
2725 {
2726  int j;
2727  int a;
2728 
2729  /* Step D3: advance j (destTape) */
2730  if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1])
2731  {
2732  state->destTape++;
2733  return;
2734  }
2735  if (state->tp_dummy[state->destTape] != 0)
2736  {
2737  state->destTape = 0;
2738  return;
2739  }
2740 
2741  /* Step D4: increase level */
2742  state->Level++;
2743  a = state->tp_fib[0];
2744  for (j = 0; j < state->tapeRange; j++)
2745  {
2746  state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j];
2747  state->tp_fib[j] = a + state->tp_fib[j + 1];
2748  }
2749  state->destTape = 0;
2750 }
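The step-D4 update above produces the generalized Fibonacci run distribution that polyphase merge relies on. A tiny standalone model (not part of tuplesort.c) for a hypothetical sort with three input tapes, printing the target number of runs per tape after each level increase:

#include <stdio.h>

int
main(void)
{
    /* tp_fib for three input tapes plus the output tape, which stays at 0 */
    int fib[4] = {1, 1, 1, 0};
    int tapeRange = 3;

    for (int level = 2; level <= 5; level++)
    {
        int a = fib[0];

        /* same recurrence as step D4 above */
        for (int j = 0; j < tapeRange; j++)
            fib[j] = a + fib[j + 1];
        printf("level %d: %d %d %d (total %d runs)\n",
               level, fib[0], fib[1], fib[2], fib[0] + fib[1] + fib[2]);
    }
    return 0;
}

The totals printed (5, 9, 17, 31) are the "perfect" third-order polyphase distributions of Knuth's Algorithm D; the dummy-run counts absorb the shortfall when the actual number of initial runs does not reach one of these totals exactly.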
2751 
2752 /*
2753  * Initialize the slab allocation arena, for the given number of slots.
2754  */
2755 static void
2756 init_slab_allocator(Tuplesortstate *state, int numSlots)
2757 {
2758  if (numSlots > 0)
2759  {
2760  char *p;
2761  int i;
2762 
2763  state->slabMemoryBegin = palloc(numSlots * SLAB_SLOT_SIZE);
2764  state->slabMemoryEnd = state->slabMemoryBegin +
2765  numSlots * SLAB_SLOT_SIZE;
2766  state->slabFreeHead = (SlabSlot *) state->slabMemoryBegin;
2767  USEMEM(state, numSlots * SLAB_SLOT_SIZE);
2768 
2769  p = state->slabMemoryBegin;
2770  for (i = 0; i < numSlots - 1; i++)
2771  {
2772  ((SlabSlot *) p)->nextfree = (SlabSlot *) (p + SLAB_SLOT_SIZE);
2773  p += SLAB_SLOT_SIZE;
2774  }
2775  ((SlabSlot *) p)->nextfree = NULL;
2776  }
2777  else
2778  {
2779  state->slabMemoryBegin = state->slabMemoryEnd = NULL;
2780  state->slabFreeHead = NULL;
2781  }
2782  state->slabAllocatorUsed = true;
2783 }
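Once the arena is carved into a free list as above, allocation and release are simple singly-linked-list pops and pushes. A sketch with hypothetical helper names, mirroring the protocol that the RELEASE_SLAB_SLOT macro (seen elsewhere in this file) and the read-side allocation path follow:

/* Hypothetical helpers illustrating the free-list protocol set up above. */
static void *
slab_slot_get(Tuplesortstate *state)
{
    SlabSlot   *slot = state->slabFreeHead;

    if (slot == NULL)
        return NULL;                        /* arena exhausted; caller must cope */
    state->slabFreeHead = slot->nextfree;   /* pop the head of the list */
    return slot;
}

static void
slab_slot_put(Tuplesortstate *state, void *p)
{
    SlabSlot   *slot = (SlabSlot *) p;

    slot->nextfree = state->slabFreeHead;   /* push back onto the list */
    state->slabFreeHead = slot;
}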
2784 
2785 /*
2786  * mergeruns -- merge all the completed initial runs.
2787  *
2788  * This implements steps D5, D6 of Algorithm D. All input data has
2789  * already been written to initial runs on tape (see dumptuples).
2790  */
2791 static void
2792 mergeruns(Tuplesortstate *state)
2793 {
2794  int tapenum,
2795  svTape,
2796  svRuns,
2797  svDummy;
2798  int numTapes;
2799  int numInputTapes;
2800 
2801  Assert(state->status == TSS_BUILDRUNS);
2802  Assert(state->memtupcount == 0);
2803 
2804  if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL)
2805  {
2806  /*
2807  * If there are multiple runs to be merged, when we go to read back
2808  * tuples from disk, abbreviated keys will not have been stored, and
2809  * we don't care to regenerate them. Disable abbreviation from this
2810  * point on.
2811  */
2812  state->sortKeys->abbrev_converter = NULL;
2813  state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
2814 
2815  /* Not strictly necessary, but be tidy */
2816  state->sortKeys->abbrev_abort = NULL;
2817  state->sortKeys->abbrev_full_comparator = NULL;
2818  }
2819 
2820  /*
2821  * Reset tuple memory. We've freed all the tuples that we previously
2822  * allocated. We will use the slab allocator from now on.
2823  */
2824  MemoryContextResetOnly(state->tuplecontext);
2825 
2826  /*
2827  * We no longer need a large memtuples array. (We will allocate a smaller
2828  * one for the heap later.)
2829  */
2830  FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
2831  pfree(state->memtuples);
2832  state->memtuples = NULL;
2833 
2834  /*
2835  * If we had fewer runs than tapes, refund the memory that we imagined we
2836  * would need for the tape buffers of the unused tapes.
2837  *
2838  * numTapes and numInputTapes reflect the actual number of tapes we will
2839  * use. Note that the output tape's tape number is maxTapes - 1, so the
2840  * tape numbers of the used tapes are not consecutive, and you cannot just
2841  * loop from 0 to numTapes to visit all used tapes!
2842  */
2843  if (state->Level == 1)
2844  {
2845  numInputTapes = state->currentRun;
2846  numTapes = numInputTapes + 1;
2847  FREEMEM(state, (state->maxTapes - numTapes) * TAPE_BUFFER_OVERHEAD);
2848  }
2849  else
2850  {
2851  numInputTapes = state->tapeRange;
2852  numTapes = state->maxTapes;
2853  }
2854 
2855  /*
2856  * Initialize the slab allocator. We need one slab slot per input tape,
2857  * for the tuples in the heap, plus one to hold the tuple last returned
2858  * from tuplesort_gettuple. (If we're sorting pass-by-val Datums,
2859  * however, we don't need to allocate anything.)
2860  *
2861  * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism
2862  * to track memory usage of individual tuples.
2863  */
2864  if (state->tuples)
2865  init_slab_allocator(state, numInputTapes + 1);
2866  else
2867  init_slab_allocator(state, 0);
2868 
2869  /*
2870  * Allocate a new 'memtuples' array, for the heap. It will hold one tuple
2871  * from each input tape.
2872  */
2873  state->memtupsize = numInputTapes;
2874  state->memtuples = (SortTuple *) MemoryContextAlloc(state->maincontext,
2875  numInputTapes * sizeof(SortTuple));
2876  USEMEM(state, GetMemoryChunkSpace(state->memtuples));
2877 
2878  /*
2879  * Use all the remaining memory we have available for read buffers among
2880  * the input tapes.
2881  *
2882  * We don't try to "rebalance" the memory among tapes, when we start a new
2883  * merge phase, even if some tapes are inactive in the new phase. That
2884  * would be hard, because logtape.c doesn't know where one run ends and
2885  * another begins. When a new merge phase begins, and a tape doesn't
2886  * participate in it, its buffer nevertheless already contains tuples from
2887  * the next run on the same tape, so we cannot release the buffer. That's OK
2888  * in practice, merge performance isn't that sensitive to the amount of
2889  * buffers used, and most merge phases use all or almost all tapes,
2890  * anyway.
2891  */
2892 #ifdef TRACE_SORT
2893  if (trace_sort)
2894  elog(LOG, "worker %d using " INT64_FORMAT " KB of memory for read buffers among %d input tapes",
2895  state->worker, state->availMem / 1024, numInputTapes);
2896 #endif
2897 
2898  state->read_buffer_size = Max(state->availMem / numInputTapes, 0);
2899  USEMEM(state, state->read_buffer_size * numInputTapes);
2900 
2901  /* End of step D2: rewind all output tapes to prepare for merging */
2902  for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2903  LogicalTapeRewindForRead(state->tapeset, tapenum, state->read_buffer_size);
2904 
2905  for (;;)
2906  {
2907  /*
2908  * At this point we know that tape[T] is empty. If there's just one
2909  * (real or dummy) run left on each input tape, then only one merge
2910  * pass remains. If we don't have to produce a materialized sorted
2911  * tape, we can stop at this point and do the final merge on-the-fly.
2912  */
2913  if (!state->randomAccess && !WORKER(state))
2914  {
2915  bool allOneRun = true;
2916 
2917  Assert(state->tp_runs[state->tapeRange] == 0);
2918  for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2919  {
2920  if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1)
2921  {
2922  allOneRun = false;
2923  break;
2924  }
2925  }
2926  if (allOneRun)
2927  {
2928  /* Tell logtape.c we won't be writing anymore */
2929  LogicalTapeSetForgetFreeSpace(state->tapeset);
2930  /* Initialize for the final merge pass */
2931  beginmerge(state);
2932  state->status = TSS_FINALMERGE;
2933  return;
2934  }
2935  }
2936 
2937  /* Step D5: merge runs onto tape[T] until tape[P] is empty */
2938  while (state->tp_runs[state->tapeRange - 1] ||
2939  state->tp_dummy[state->tapeRange - 1])
2940  {
2941  bool allDummy = true;
2942 
2943  for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2944  {
2945  if (state->tp_dummy[tapenum] == 0)
2946  {
2947  allDummy = false;
2948  break;
2949  }
2950  }
2951 
2952  if (allDummy)
2953  {
2954  state->tp_dummy[state->tapeRange]++;
2955  for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2956  state->tp_dummy[tapenum]--;
2957  }
2958  else
2959  mergeonerun(state);
2960  }
2961 
2962  /* Step D6: decrease level */
2963  if (--state->Level == 0)
2964  break;
2965  /* rewind output tape T to use as new input */
2966  LogicalTapeRewindForRead(state->tapeset, state->tp_tapenum[state->tapeRange],
2967  state->read_buffer_size);
2968  /* rewind used-up input tape P, and prepare it for write pass */
2969  LogicalTapeRewindForWrite(state->tapeset, state->tp_tapenum[state->tapeRange - 1]);
2970  state->tp_runs[state->tapeRange - 1] = 0;
2971 
2972  /*
2973  * reassign tape units per step D6; note we no longer care about A[]
2974  */
2975  svTape = state->tp_tapenum[state->tapeRange];
2976  svDummy = state->tp_dummy[state->tapeRange];
2977  svRuns = state->tp_runs[state->tapeRange];
2978  for (tapenum = state->tapeRange; tapenum > 0; tapenum--)
2979  {
2980  state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1];
2981  state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 1];
2982  state->tp_runs[tapenum] = state->tp_runs[tapenum - 1];
2983  }
2984  state->tp_tapenum[0] = svTape;
2985  state->tp_dummy[0] = svDummy;
2986  state->tp_runs[0] = svRuns;
2987  }
2988 
2989  /*
2990  * Done. Knuth says that the result is on TAPE[1], but since we exited
2991  * the loop without performing the last iteration of step D6, we have not
2992  * rearranged the tape unit assignment, and therefore the result is on
2993  * TAPE[T]. We need to do it this way so that we can freeze the final
2994  * output tape while rewinding it. The last iteration of step D6 would be
2995  * a waste of cycles anyway...
2996  */
2997  state->result_tape = state->tp_tapenum[state->tapeRange];
2998  if (!WORKER(state))
2999  LogicalTapeFreeze(state->tapeset, state->result_tape, NULL);
3000  else
3001  worker_freeze_result_tape(state);
3002  state->status = TSS_SORTEDONTAPE;
3003 
3004  /* Release the read buffers of all the other tapes, by rewinding them. */
3005  for (tapenum = 0; tapenum < state->maxTapes; tapenum++)
3006  {
3007  if (tapenum != state->result_tape)
3008  LogicalTapeRewindForWrite(state->tapeset, tapenum);
3009  }
3010 }
3011 
3012 /*
3013  * Merge one run from each input tape, except ones with dummy runs.
3014  *
3015  * This is the inner loop of Algorithm D step D5. We know that the
3016  * output tape is TAPE[T].
3017  */
3018 static void
3019 mergeonerun(Tuplesortstate *state)
3020 {
3021  int destTape = state->tp_tapenum[state->tapeRange];
3022  int srcTape;
3023 
3024  /*
3025  * Start the merge by loading one tuple from each active source tape into
3026  * the heap. We can also decrease the input run/dummy run counts.
3027  */
3028  beginmerge(state);
3029 
3030  /*
3031  * Execute merge by repeatedly extracting lowest tuple in heap, writing it
3032  * out, and replacing it with next tuple from same tape (if there is
3033  * another one).
3034  */
3035  while (state->memtupcount > 0)
3036  {
3037  SortTuple stup;
3038 
3039  /* write the tuple to destTape */
3040  srcTape = state->memtuples[0].srctape;
3041  WRITETUP(state, destTape, &state->memtuples[0]);
3042 
3043  /* recycle the slot of the tuple we just wrote out, for the next read */
3044  if (state->memtuples[0].tuple)
3045  RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple);
3046 
3047  /*
3048  * pull next tuple from the tape, and replace the written-out tuple in
3049  * the heap with it.
3050  */
3051  if (mergereadnext(state, srcTape, &stup))
3052  {
3053  stup.srctape = srcTape;
3054  tuplesort_heap_replace_top(state, &stup);
3055  }
3056  else
3057  tuplesort_heap_delete_top(state);
3058  }
3059 
3060  /*
3061  * When the heap empties, we're done. Write an end-of-run marker on the
3062  * output tape, and increment its count of real runs.
3063  */
3064  markrunend(state, destTape);
3065  state->tp_runs[state->tapeRange]++;
3066 
3067 #ifdef TRACE_SORT
3068  if (trace_sort)
3069  elog(LOG, "worker %d finished %d-way merge step: %s", state->worker,
3070  state->activeTapes, pg_rusage_show(&state->ru_start));
3071 #endif
3072 }
3073 
3074 /*
3075  * beginmerge - initialize for a merge pass
3076  *
3077  * We decrease the counts of real and dummy runs for each tape, and mark
3078  * which tapes contain active input runs in mergeactive[]. Then, fill the
3079  * merge heap with the first tuple from each active tape.
3080  */
3081 static void
3082 beginmerge(Tuplesortstate *state)
3083 {
3084  int activeTapes;
3085  int tapenum;
3086  int srcTape;
3087 
3088  /* Heap should be empty here */
3089  Assert(state->memtupcount == 0);
3090 
3091  /* Adjust run counts and mark the active tapes */
3092  memset(state->mergeactive, 0,
3093  state->maxTapes * sizeof(*state->mergeactive));
3094  activeTapes = 0;
3095  for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
3096  {
3097  if (state->tp_dummy[tapenum] > 0)
3098  state->tp_dummy[tapenum]--;
3099  else
3100  {
3101  Assert(state->tp_runs[tapenum] > 0);
3102  state->tp_runs[tapenum]--;
3103  srcTape = state->tp_tapenum[tapenum];
3104  state->mergeactive[srcTape] = true;
3105  activeTapes++;
3106  }
3107  }
3108  Assert(activeTapes > 0);
3109  state->activeTapes = activeTapes;
3110 
3111  /* Load the merge heap with the first tuple from each input tape */
3112  for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
3113  {
3114  SortTuple tup;
3115 
3116  if (mergereadnext(state, srcTape, &tup))
3117  {
3118  tup.srctape = srcTape;
3119  tuplesort_heap_insert(state, &tup);
3120  }
3121  }
3122 }
3123 
3124 /*
3125  * mergereadnext - read next tuple from one merge input tape
3126  *
3127  * Returns false on EOF.
3128  */
3129 static bool
3130 mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup)
3131 {
3132  unsigned int tuplen;
3133 
3134  if (!state->mergeactive[srcTape])
3135  return false; /* tape's run is already exhausted */
3136 
3137  /* read next tuple, if any */
3138  if ((tuplen = getlen(state, srcTape, true)) == 0)
3139  {
3140  state->mergeactive[srcTape] = false;
3141  return false;
3142  }
3143  READTUP(state, stup, srcTape, tuplen);
3144 
3145  return true;
3146 }
3147 
3148 /*
3149  * dumptuples - remove tuples from memtuples and write initial run to tape
3150  *
3151  * When alltuples = true, dump everything currently in memory. (This case is
3152  * only used at end of input data.)
3153  */
3154 static void
3155 dumptuples(Tuplesortstate *state, bool alltuples)
3156 {
3157  int memtupwrite;
3158  int i;
3159 
3160  /*
3161  * Nothing to do if we still fit in available memory and have array slots,
3162  * unless this is the final call during initial run generation.
3163  */
3164  if (state->memtupcount < state->memtupsize && !LACKMEM(state) &&
3165  !alltuples)
3166  return;
3167 
3168  /*
3169  * Final call might require no sorting, in rare cases where we just so
3170  * happen to have previously LACKMEM()'d at the point where exactly all
3171  * remaining tuples are loaded into memory, just before input was
3172  * exhausted.
3173  *
3174  * In general, short final runs are quite possible. Rather than allowing
3175  * a special case where there was a superfluous selectnewtape() call (i.e.
3176  * a call with no subsequent run actually written to destTape), we prefer
3177  * to write out a 0 tuple run.
3178  *
3179  * mergereadnext() is prepared for 0 tuple runs, and will reliably mark
3180  * the tape inactive for the merge when called from beginmerge(). This
3181  * case is therefore similar to the case where mergeonerun() finds a dummy
3182  * run for the tape, and so doesn't need to merge a run from the tape (or
3183  * conceptually "merges" the dummy run, if you prefer). According to
3184  * Knuth, Algorithm D "isn't strictly optimal" in its method of
3185  * distribution and dummy run assignment; this edge case seems very
3186  * unlikely to make that appreciably worse.
3187  */
3188  Assert(state->status == TSS_BUILDRUNS);
3189 
3190  /*
3191  * It seems unlikely that this limit will ever be exceeded, but take no
3192  * chances
3193  */
3194  if (state->currentRun == INT_MAX)
3195  ereport(ERROR,
3196  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
3197  errmsg("cannot have more than %d runs for an external sort",
3198  INT_MAX)));
3199 
3200  state->currentRun++;
3201 
3202 #ifdef TRACE_SORT
3203  if (trace_sort)
3204  elog(LOG, "worker %d starting quicksort of run %d: %s",
3205  state->worker, state->currentRun,
3206  pg_rusage_show(&state->ru_start));
3207 #endif
3208 
3209  /*
3210  * Sort all tuples accumulated within the allowed amount of memory for
3211  * this run using quicksort
3212  */
3213  tuplesort_sort_memtuples(state);
3214 
3215 #ifdef TRACE_SORT
3216  if (trace_sort)
3217  elog(LOG, "worker %d finished quicksort of run %d: %s",
3218  state->worker, state->currentRun,
3219  pg_rusage_show(&state->ru_start));
3220 #endif
3221 
3222  memtupwrite = state->memtupcount;
3223  for (i = 0; i < memtupwrite; i++)
3224  {
3225  WRITETUP(state, state->tp_tapenum[state->destTape],
3226  &state->memtuples[i]);
3227  state->memtupcount--;
3228  }
3229 
3230  /*
3231  * Reset tuple memory. We've freed all of the tuples that we previously
3232  * allocated. It's important to avoid fragmentation when there is a stark
3233  * change in the sizes of incoming tuples. Fragmentation due to
3234  * AllocSetFree's bucketing by size class might be particularly bad if
3235  * this step wasn't taken.
3236  */
3237  MemoryContextReset(state->tuplecontext);
3238 
3239  markrunend(state, state->tp_tapenum[state->destTape]);
3240  state->tp_runs[state->destTape]++;
3241  state->tp_dummy[state->destTape]--; /* per Alg D step D2 */
3242 
3243 #ifdef TRACE_SORT
3244  if (trace_sort)
3245  elog(LOG, "worker %d finished writing run %d to tape %d: %s",
3246  state->worker, state->currentRun, state->destTape,
3247  pg_rusage_show(&state->ru_start));
3248 #endif
3249 
3250  if (!alltuples)
3251  selectnewtape(state);
3252 }
3253 
3254 /*
3255  * tuplesort_rescan - rewind and replay the scan
3256  */
3257 void
3258 tuplesort_rescan(Tuplesortstate *state)
3259 {
3260  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
3261 
3262  Assert(state->randomAccess);
3263 
3264  switch (state->status)
3265  {
3266  case TSS_SORTEDINMEM:
3267  state->current = 0;
3268  state->eof_reached = false;
3269  state->markpos_offset = 0;
3270  state->markpos_eof = false;
3271  break;
3272  case TSS_SORTEDONTAPE:
3273  LogicalTapeRewindForRead(state->tapeset,
3274  state->result_tape,
3275  0);
3276  state->eof_reached = false;
3277  state->markpos_block = 0L;
3278  state->markpos_offset = 0;
3279  state->markpos_eof = false;
3280  break;
3281  default:
3282  elog(ERROR, "invalid tuplesort state");
3283  break;
3284  }
3285 
3286  MemoryContextSwitchTo(oldcontext);
3287 }
3288 
3289 /*
3290  * tuplesort_markpos - saves current position in the merged sort file
3291  */
3292 void
3293 tuplesort_markpos(Tuplesortstate *state)
3294 {
3295  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
3296 
3297  Assert(state->randomAccess);
3298 
3299  switch (state->status)
3300  {
3301  case TSS_SORTEDINMEM:
3302  state->markpos_offset = state->current;
3303  state->markpos_eof = state->eof_reached;
3304  break;
3305  case TSS_SORTEDONTAPE:
3306  LogicalTapeTell(state->tapeset,
3307  state->result_tape,
3308  &state->markpos_block,
3309  &state->markpos_offset);
3310  state->markpos_eof = state->eof_reached;
3311  break;
3312  default:
3313  elog(ERROR, "invalid tuplesort state");
3314  break;
3315  }
3316 
3317  MemoryContextSwitchTo(oldcontext);
3318 }
3319 
3320 /*
3321  * tuplesort_restorepos - restores current position in merged sort file to
3322  * last saved position
3323  */
3324 void
3325 tuplesort_restorepos(Tuplesortstate *state)
3326 {
3327  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
3328 
3329  Assert(state->randomAccess);
3330 
3331  switch (state->status)
3332  {
3333  case TSS_SORTEDINMEM:
3334  state->current = state->markpos_offset;
3335  state->eof_reached = state->markpos_eof;
3336  break;
3337  case TSS_SORTEDONTAPE:
3338  LogicalTapeSeek(state->tapeset,
3339  state->result_tape,
3340  state->markpos_block,
3341  state->markpos_offset);
3342  state->eof_reached = state->markpos_eof;
3343  break;
3344  default:
3345  elog(ERROR, "invalid tuplesort state");
3346  break;
3347  }
3348 
3349  MemoryContextSwitchTo(oldcontext);
3350 }
3351 
3352 /*
3353  * tuplesort_get_stats - extract summary statistics
3354  *
3355  * This can be called after tuplesort_performsort() finishes to obtain
3356  * printable summary information about how the sort was performed.
3357  */
3358 void
3359 tuplesort_get_stats(Tuplesortstate *state,
3360  TuplesortInstrumentation *stats)
3361 {
3362  /*
3363  * Note: it might seem we should provide both memory and disk usage for a
3364  * disk-based sort. However, the current code doesn't track memory space
3365  * accurately once we have begun to return tuples to the caller (since we
3366  * don't account for pfree's the caller is expected to do), so we cannot
3367  * rely on availMem in a disk sort. This does not seem worth the overhead
3368  * to fix. Is it worth creating an API for the memory context code to
3369  * tell us how much is actually used in sortcontext?
3370  */
3371  tuplesort_updatemax(state);
3372 
3373  if (state->isMaxSpaceDisk)
3374  stats->spaceType = SORT_SPACE_TYPE_DISK;
3375  else
3376  stats->spaceType = SORT_SPACE_TYPE_MEMORY;
3377  stats->spaceUsed = (state->maxSpace + 1023) / 1024;
3378 
3379  switch (state->maxSpaceStatus)
3380  {
3381  case TSS_SORTEDINMEM:
3382  if (state->boundUsed)
3383  stats->sortMethod = SORT_TYPE_TOP_N_HEAPSORT;
3384  else
3385  stats->sortMethod = SORT_TYPE_QUICKSORT;
3386  break;
3387  case TSS_SORTEDONTAPE:
3388  stats->sortMethod = SORT_TYPE_EXTERNAL_SORT;
3389  break;
3390  case TSS_FINALMERGE:
3391  stats->sortMethod = SORT_TYPE_EXTERNAL_MERGE;
3392  break;
3393  default:
3394  stats->sortMethod = SORT_TYPE_STILL_IN_PROGRESS;
3395  break;
3396  }
3397 }
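As a quick standalone illustration (an editorial sketch, not part of tuplesort.c), the spaceUsed figure computed above is a round-up conversion from bytes to kilobytes; bytes_to_kb_ceil is an invented helper name:

    #include <stdio.h>
    #include <stdint.h>

    /* Round a byte count up to whole kilobytes, as tuplesort_get_stats() does. */
    static int64_t
    bytes_to_kb_ceil(int64_t bytes)
    {
        return (bytes + 1023) / 1024;
    }

    int
    main(void)
    {
        printf("%lld\n", (long long) bytes_to_kb_ceil(0));    /* 0 */
        printf("%lld\n", (long long) bytes_to_kb_ceil(1));    /* 1 */
        printf("%lld\n", (long long) bytes_to_kb_ceil(1024)); /* 1 */
        printf("%lld\n", (long long) bytes_to_kb_ceil(1025)); /* 2 */
        return 0;
    }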
3398 
3399 /*
3400  * Convert TuplesortMethod to a string.
3401  */
3402 const char *
3403  tuplesort_method_name(TuplesortMethod m)
3404 {
3405  switch (m)
3406  {
3407  case SORT_TYPE_STILL_IN_PROGRESS:
3408  return "still in progress";
3409  case SORT_TYPE_TOP_N_HEAPSORT:
3410  return "top-N heapsort";
3411  case SORT_TYPE_QUICKSORT:
3412  return "quicksort";
3413  case SORT_TYPE_EXTERNAL_SORT:
3414  return "external sort";
3415  case SORT_TYPE_EXTERNAL_MERGE:
3416  return "external merge";
3417  }
3418 
3419  return "unknown";
3420 }
3421 
3422 /*
3423  * Convert TuplesortSpaceType to a string.
3424  */
3425 const char *
3426  tuplesort_space_type_name(TuplesortSpaceType t)
3427 {
3428  Assert(t == SORT_SPACE_TYPE_DISK || t == SORT_SPACE_TYPE_MEMORY);
3429  return t == SORT_SPACE_TYPE_DISK ? "Disk" : "Memory";
3430 }
3431 
3432 
3433 /*
3434  * Heap manipulation routines, per Knuth's Algorithm 5.2.3H.
3435  */
3436 
3437 /*
3438  * Convert the existing unordered array of SortTuples to a bounded heap,
3439  * discarding all but the smallest "state->bound" tuples.
3440  *
3441  * When working with a bounded heap, we want to keep the largest entry
3442  * at the root (array entry zero), instead of the smallest as in the normal
3443  * sort case. This allows us to discard the largest entry cheaply.
3444  * Therefore, we temporarily reverse the sort direction.
3445  */
3446 static void
3447  make_bounded_heap(Tuplesortstate *state)
3448 {
3449  int tupcount = state->memtupcount;
3450  int i;
3451 
3452  Assert(state->status == TSS_INITIAL);
3453  Assert(state->bounded);
3454  Assert(tupcount >= state->bound);
3455  Assert(SERIAL(state));
3456 
3457  /* Reverse sort direction so largest entry will be at root */
3458  reversedirection(state);
3459 
3460  state->memtupcount = 0; /* make the heap empty */
3461  for (i = 0; i < tupcount; i++)
3462  {
3463  if (state->memtupcount < state->bound)
3464  {
3465  /* Insert next tuple into heap */
3466  /* Must copy source tuple to avoid possible overwrite */
3467  SortTuple stup = state->memtuples[i];
3468 
3469  tuplesort_heap_insert(state, &stup);
3470  }
3471  else
3472  {
3473  /*
3474  * The heap is full. Replace the largest entry with the new
3475  * tuple, or just discard it, if it's larger than anything already
3476  * in the heap.
3477  */
3478  if (COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0)
3479  {
3480  free_sort_tuple(state, &state->memtuples[i]);
3481  CHECK_FOR_INTERRUPTS();
3482  }
3483  else
3484  tuplesort_heap_replace_top(state, &state->memtuples[i]);
3485  }
3486  }
3487 
3488  Assert(state->memtupcount == state->bound);
3489  state->status = TSS_BOUNDED;
3490 }
3491 
3492 /*
3493  * Convert the bounded heap to a properly-sorted array
3494  */
3495 static void
3496  sort_bounded_heap(Tuplesortstate *state)
3497 {
3498  int tupcount = state->memtupcount;
3499 
3500  Assert(state->status == TSS_BOUNDED);
3501  Assert(state->bounded);
3502  Assert(tupcount == state->bound);
3503  Assert(SERIAL(state));
3504 
3505  /*
3506  * We can unheapify in place because each delete-top call will remove the
3507  * largest entry, which we can promptly store in the newly freed slot at
3508  * the end. Once we're down to a single-entry heap, we're done.
3509  */
3510  while (state->memtupcount > 1)
3511  {
3512  SortTuple stup = state->memtuples[0];
3513 
3514  /* this sifts-up the next-largest entry and decreases memtupcount */
3515  tuplesort_heap_delete_top(state);
3516  state->memtuples[state->memtupcount] = stup;
3517  }
3518  state->memtupcount = tupcount;
3519 
3520  /*
3521  * Reverse sort direction back to the original state. This is not
3522  * actually necessary but seems like a good idea for tidiness.
3523  */
3524  reversedirection(state);
3525 
3526  state->status = TSS_SORTEDINMEM;
3527  state->boundUsed = true;
3528 }
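The two routines above can be illustrated on plain ints. The following standalone editorial sketch (not tuplesort.c code; sift_up, sift_down, and bound are invented names) keeps only the smallest bound values in a max-heap by replacing the root whenever a smaller value arrives, then unheapifies in place so the array ends up ascending. The real code achieves the "largest at root" behavior by temporarily reversing the sort direction rather than by using a literal max-heap.

    #include <stdio.h>

    /* Sift a replacement value down from the root (analogue of heap_replace_top). */
    static void
    sift_down(int *heap, int n, int newval)
    {
        int     i = 0;

        for (;;)
        {
            int     j = 2 * i + 1;

            if (j >= n)
                break;
            if (j + 1 < n && heap[j + 1] > heap[j])
                j++;
            if (newval >= heap[j])
                break;
            heap[i] = heap[j];
            i = j;
        }
        heap[i] = newval;
    }

    /* Sift a new value up from slot j (analogue of heap_insert). */
    static void
    sift_up(int *heap, int j, int newval)
    {
        while (j > 0)
        {
            int     i = (j - 1) >> 1;

            if (newval <= heap[i])
                break;
            heap[j] = heap[i];
            j = i;
        }
        heap[j] = newval;
    }

    int
    main(void)
    {
        int     input[] = {9, 3, 7, 1, 8, 2, 6, 5, 4, 0};
        int     bound = 4;          /* keep the 4 smallest values */
        int     heap[4];
        int     count = 0;

        /* Phase 1: selection, as in make_bounded_heap() */
        for (int k = 0; k < 10; k++)
        {
            if (count < bound)
                sift_up(heap, count++, input[k]);
            else if (input[k] < heap[0])        /* smaller than current largest? */
                sift_down(heap, count, input[k]);   /* replace the root */
            /* else: discard input[k]; it cannot be among the bound smallest */
        }

        /* Phase 2: unheapify in place, as in sort_bounded_heap() */
        for (int n = count; n > 1; n--)
        {
            int     top = heap[0];

            sift_down(heap, n - 1, heap[n - 1]);    /* delete-top */
            heap[n - 1] = top;                      /* store largest in freed slot */
        }

        for (int k = 0; k < count; k++)
            printf("%d ", heap[k]);     /* prints: 0 1 2 3 */
        printf("\n");
        return 0;
    }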
3529 
3530 /*
3531  * Sort all memtuples using specialized qsort() routines.
3532  *
3533  * Quicksort is used for small in-memory sorts, and external sort runs.
3534  */
3535 static void
3536  tuplesort_sort_memtuples(Tuplesortstate *state)
3537 {
3538  Assert(!LEADER(state));
3539 
3540  if (state->memtupcount > 1)
3541  {
3542  /* Can we use the single-key sort function? */
3543  if (state->onlyKey != NULL)
3544  qsort_ssup(state->memtuples, state->memtupcount,
3545  state->onlyKey);
3546  else
3547  qsort_tuple(state->memtuples,
3548  state->memtupcount,
3549  state->comparetup,
3550  state);
3551  }
3552 }
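A rough standalone illustration (not the tuplesort code; Row, cmp_single, and cmp_multi are invented) of the dispatch above: when only one key can decide the order, a cheaper single-key comparator is enough, otherwise a general multi-key comparator runs. The sketch just uses libc qsort(); the real code calls specialized qsort variants.

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct
    {
        int     key1;
        int     key2;
    } Row;

    /* Single-key comparator: the analogue of the qsort_ssup() fast path. */
    static int
    cmp_single(const void *a, const void *b)
    {
        const Row *ra = a;
        const Row *rb = b;

        return (ra->key1 > rb->key1) - (ra->key1 < rb->key1);
    }

    /* Multi-key comparator: the analogue of the general qsort_tuple() path. */
    static int
    cmp_multi(const void *a, const void *b)
    {
        const Row *ra = a;
        const Row *rb = b;
        int     c = (ra->key1 > rb->key1) - (ra->key1 < rb->key1);

        if (c != 0)
            return c;
        return (ra->key2 > rb->key2) - (ra->key2 < rb->key2);
    }

    int
    main(void)
    {
        Row     rows[] = {{2, 9}, {1, 5}, {2, 3}, {1, 7}};
        int     nkeys = 2;      /* pretend this came from the sort definition */

        qsort(rows, 4, sizeof(Row), nkeys == 1 ? cmp_single : cmp_multi);
        for (int i = 0; i < 4; i++)
            printf("(%d,%d) ", rows[i].key1, rows[i].key2);
        printf("\n");           /* (1,5) (1,7) (2,3) (2,9) */
        return 0;
    }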
3553 
3554 /*
3555  * Insert a new tuple into an empty or existing heap, maintaining the
3556  * heap invariant. Caller is responsible for ensuring there's room.
3557  *
3558  * Note: For some callers, tuple points to a memtuples[] entry above the
3559  * end of the heap. This is safe as long as it's not immediately adjacent
3560  * to the end of the heap (ie, in the [memtupcount] array entry) --- if it
3561  * is, it might get overwritten before being moved into the heap!
3562  */
3563 static void
3564  tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple)
3565 {
3566  SortTuple *memtuples;
3567  int j;
3568 
3569  memtuples = state->memtuples;
3570  Assert(state->memtupcount < state->memtupsize);
3571 
3572  CHECK_FOR_INTERRUPTS();
3573 
3574  /*
3575  * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is
3576  * using 1-based array indexes, not 0-based.
3577  */
3578  j = state->memtupcount++;
3579  while (j > 0)
3580  {
3581  int i = (j - 1) >> 1;
3582 
3583  if (COMPARETUP(state, tuple, &memtuples[i]) >= 0)
3584  break;
3585  memtuples[j] = memtuples[i];
3586  j = i;
3587  }
3588  memtuples[j] = *tuple;
3589 }
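A small standalone note (editorial, not tuplesort.c code) on the index arithmetic the comment refers to: Knuth's 1-based heap uses parent = j/2, while this 0-based array uses parent = (j - 1)/2 and children 2*i + 1 and 2*i + 2.

    #include <stdio.h>

    int
    main(void)
    {
        /* For each 0-based slot, print its parent and children slots. */
        for (int j = 0; j < 7; j++)
        {
            int     parent = (j > 0) ? ((j - 1) >> 1) : -1;

            printf("slot %d: parent %d, children %d and %d\n",
                   j, parent, 2 * j + 1, 2 * j + 2);
        }
        return 0;
    }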
3590 
3591 /*
3592  * Remove the tuple at state->memtuples[0] from the heap. Decrement
3593  * memtupcount, and sift up to maintain the heap invariant.
3594  *
3595  * The caller has already free'd the tuple the top node points to,
3596  * if necessary.
3597  */
3598 static void
3599  tuplesort_heap_delete_top(Tuplesortstate *state)
3600 {
3601  SortTuple *memtuples = state->memtuples;
3602  SortTuple *tuple;
3603 
3604  if (--state->memtupcount <= 0)
3605  return;
3606 
3607  /*
3608  * Remove the last tuple in the heap, and re-insert it, by replacing the
3609  * current top node with it.
3610  */
3611  tuple = &memtuples[state->memtupcount];
3612  tuplesort_heap_replace_top(state, tuple);
3613 }
3614 
3615 /*
3616  * Replace the tuple at state->memtuples[0] with a new tuple. Sift up to
3617  * maintain the heap invariant.
3618  *
3619  * This corresponds to Knuth's "sift-up" algorithm (Algorithm 5.2.3H,
3620  * Heapsort, steps H3-H8).
3621  */
3622 static void
3623  tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple)
3624 {
3625  SortTuple *memtuples = state->memtuples;
3626  unsigned int i,
3627  n;
3628 
3629  Assert(state->memtupcount >= 1);
3630 
3631  CHECK_FOR_INTERRUPTS();
3632 
3633  /*
3634  * state->memtupcount is "int", but we use "unsigned int" for i, j, n.
3635  * This prevents overflow in the "2 * i + 1" calculation, since at the top
3636  * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2.
3637  */
3638  n = state->memtupcount;
3639  i = 0; /* i is where the "hole" is */
3640  for (;;)
3641  {
3642  unsigned int j = 2 * i + 1;
3643 
3644  if (j >= n)
3645  break;
3646  if (j + 1 < n &&
3647  COMPARETUP(state, &memtuples[j], &memtuples[j + 1]) > 0)
3648  j++;
3649  if (COMPARETUP(state, tuple, &memtuples[j]) <= 0)
3650  break;
3651  memtuples[i] = memtuples[j];
3652  i = j;
3653  }
3654  memtuples[i] = *tuple;
3655 }
3656 
3657 /*
3658  * Function to reverse the sort direction from its current state
3659  *
3660  * It is not safe to call this when performing hash tuplesorts
3661  */
3662 static void
3663  reversedirection(Tuplesortstate *state)
3664 {
3665  SortSupport sortKey = state->sortKeys;
3666  int nkey;
3667 
3668  for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++)
3669  {
3670  sortKey->ssup_reverse = !sortKey->ssup_reverse;
3671  sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first;
3672  }
3673 }
3674 
3675 
3676 /*
3677  * Tape interface routines
3678  */
3679 
3680 static unsigned int
3681 getlen(Tuplesortstate *state, int tapenum, bool eofOK)
3682 {
3683  unsigned int len;
3684 
3685  if (LogicalTapeRead(state->tapeset, tapenum,
3686  &len, sizeof(len)) != sizeof(len))
3687  elog(ERROR, "unexpected end of tape");
3688  if (len == 0 && !eofOK)
3689  elog(ERROR, "unexpected end of data");
3690  return len;
3691 }
3692 
3693 static void
3694  markrunend(Tuplesortstate *state, int tapenum)
3695 {
3696  unsigned int len = 0;
3697 
3698  LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len));
3699 }
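The convention that getlen() and markrunend() implement can be sketched in isolation: every tuple on tape is preceded by an unsigned length word, and a length word of zero marks the end of a run. The following standalone editorial example uses a plain temporary file and invented helper names (put_record, put_runend); in the real code all I/O goes through logtape.c.

    #include <stdio.h>

    /* Write one length-prefixed record, as the writetup routines do. */
    static void
    put_record(FILE *f, const char *data, unsigned int len)
    {
        fwrite(&len, sizeof(len), 1, f);
        fwrite(data, 1, len, f);
    }

    /* Write a zero length word, as markrunend() does. */
    static void
    put_runend(FILE *f)
    {
        unsigned int len = 0;

        fwrite(&len, sizeof(len), 1, f);
    }

    int
    main(void)
    {
        FILE       *f = tmpfile();
        unsigned int len;
        char        buf[64];

        if (f == NULL)
            return 1;
        put_record(f, "apple", 5);
        put_record(f, "pear", 4);
        put_runend(f);
        rewind(f);

        /* Read back until the zero length word (the analogue of getlen()). */
        while (fread(&len, sizeof(len), 1, f) == 1 && len != 0)
        {
            if (fread(buf, 1, len, f) != len)
                break;
            buf[len] = '\0';
            printf("read: %s\n", buf);
        }
        return 0;
    }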
3700 
3701 /*
3702  * Get memory for tuple from within READTUP() routine.
3703  *
3704  * We use next free slot from the slab allocator, or palloc() if the tuple
3705  * is too large for that.
3706  */
3707 static void *
3708  readtup_alloc(Tuplesortstate *state, Size tuplen)
3709 {
3710  SlabSlot *buf;
3711 
3712  /*
3713  * We pre-allocate enough slots in the slab arena that we should never run
3714  * out.
3715  */
3716  Assert(state->slabFreeHead);
3717 
3718  if (tuplen > SLAB_SLOT_SIZE || !state->slabFreeHead)
3719  return MemoryContextAlloc(state->sortcontext, tuplen);
3720  else
3721  {
3722  buf = state->slabFreeHead;
3723  /* Reuse this slot */
3724  state->slabFreeHead = buf->nextfree;
3725 
3726  return buf;
3727  }
3728 }
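The slab reuse pattern above can be shown in a few lines of standalone C (an editorial sketch; Slot, slot_alloc, and SLOT_SIZE are invented stand-ins for SlabSlot, readtup_alloc, and SLAB_SLOT_SIZE): fixed-size slots are chained through a nextfree pointer, allocation pops the head of the list, and oversized requests fall back to the general allocator.

    #include <stdio.h>
    #include <stdlib.h>

    #define SLOT_SIZE 64

    typedef union Slot
    {
        union Slot *nextfree;       /* valid while the slot is on the free list */
        char        space[SLOT_SIZE];
    } Slot;

    static Slot *freehead;

    /* Analogue of readtup_alloc(): reuse a slab slot, or fall back to malloc. */
    static void *
    slot_alloc(size_t len)
    {
        Slot       *s;

        if (len > SLOT_SIZE || freehead == NULL)
            return malloc(len);
        s = freehead;
        freehead = s->nextfree;
        return s;
    }

    int
    main(void)
    {
        Slot        arena[4];

        /* Thread all slots onto the free list, newest first. */
        for (int i = 0; i < 4; i++)
        {
            arena[i].nextfree = freehead;
            freehead = &arena[i];
        }

        void       *a = slot_alloc(16);     /* small enough: comes from the arena */
        void       *b = slot_alloc(200);    /* too large: comes from malloc */

        printf("a is the most recently freed slot: %s\n",
               a == (void *) &arena[3] ? "yes" : "no");
        printf("b needed malloc: %s\n", b != NULL ? "yes" : "no");
        free(b);
        return 0;
    }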
3729 
3730 
3731 /*
3732  * Routines specialized for HeapTuple (actually MinimalTuple) case
3733  */
3734 
3735 static int
3736  comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
3737 {
3738  SortSupport sortKey = state->sortKeys;
3739  HeapTupleData ltup;
3740  HeapTupleData rtup;
3741  TupleDesc tupDesc;
3742  int nkey;
3743  int32 compare;
3744  AttrNumber attno;
3745  Datum datum1,
3746  datum2;
3747  bool isnull1,
3748  isnull2;
3749 
3750 
3751  /* Compare the leading sort key */
3752  compare = ApplySortComparator(a->datum1, a->isnull1,
3753  b->datum1, b->isnull1,
3754  sortKey);
3755  if (compare != 0)
3756  return compare;
3757 
3758  /* Compare additional sort keys */
3759  ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
3760  ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET);
3761  rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
3762  rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET);
3763  tupDesc = state->tupDesc;
3764 
3765  if (sortKey->abbrev_converter)
3766  {
3767  attno = sortKey->ssup_attno;
3768 
3769  datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
3770  datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
3771 
3772  compare = ApplySortAbbrevFullComparator(datum1, isnull1,
3773  datum2, isnull2,
3774  sortKey);
3775  if (compare != 0)
3776  return compare;
3777  }
3778 
3779  sortKey++;
3780  for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++)
3781  {
3782  attno = sortKey->ssup_attno;
3783 
3784  datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
3785  datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
3786 
3787  compare = ApplySortComparator(datum1, isnull1,
3788  datum2, isnull2,
3789  sortKey);
3790  if (compare != 0)
3791  return compare;
3792  }
3793 
3794  return 0;
3795 }
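The overall shape of the comparison above can be illustrated without any PostgreSQL types: compare a cheap "abbreviated" leading key first, and only fall back to the authoritative comparison on ties. In this standalone editorial sketch (Item and compare_items are invented names) the abbreviation is simply the first character of a string; in the real code it is whatever the datatype's abbrev_converter produced for datum1.

    #include <stdio.h>
    #include <string.h>

    typedef struct
    {
        char        abbrev;     /* analogue of datum1 holding an abbreviated key */
        const char *full;       /* analogue of the full tuple */
    } Item;

    /* Compare abbreviated keys first; fall back to the full value on ties. */
    static int
    compare_items(const Item *a, const Item *b)
    {
        if (a->abbrev != b->abbrev)
            return (a->abbrev > b->abbrev) ? 1 : -1;
        /* Abbreviated keys are equal: do the authoritative comparison, the role
         * ApplySortAbbrevFullComparator() plays in the real code. */
        return strcmp(a->full, b->full);
    }

    int
    main(void)
    {
        Item        x = {'a', "apple"};
        Item        y = {'a', "apricot"};
        Item        z = {'b', "banana"};

        printf("%d\n", compare_items(&x, &y));  /* negative: full compare decided */
        printf("%d\n", compare_items(&x, &z));  /* negative: abbreviation decided */
        return 0;
    }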
3796 
3797 static void
3798  copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup)
3799 {
3800  /*
3801  * We expect the passed "tup" to be a TupleTableSlot, and form a
3802  * MinimalTuple using the exported interface for that.
3803  */
3804  TupleTableSlot *slot = (TupleTableSlot *) tup;
3805  Datum original;
3806  MinimalTuple tuple;
3807  HeapTupleData htup;
3808  MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
3809 
3810  /* copy the tuple into sort storage */
3811  tuple = ExecCopySlotMinimalTuple(slot);
3812  stup->tuple = (void *) tuple;
3813  USEMEM(state, GetMemoryChunkSpace(tuple));
3814  /* set up first-column key value */
3815  htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
3816  htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
3817  original = heap_getattr(&htup,
3818  state->sortKeys[0].ssup_attno,
3819  state->tupDesc,
3820  &stup->isnull1);
3821 
3822  MemoryContextSwitchTo(oldcontext);
3823 
3824  if (!state->sortKeys->abbrev_converter || stup->isnull1)
3825  {
3826  /*
3827  * Store ordinary Datum representation, or NULL value. If there is a
3828  * converter it won't expect NULL values, and cost model is not
3829  * required to account for NULL, so in that case we avoid calling
3830  * converter and just set datum1 to zeroed representation (to be
3831  * consistent, and to support cheap inequality tests for NULL
3832  * abbreviated keys).
3833  */
3834  stup->datum1 = original;
3835  }
3836  else if (!consider_abort_common(state))
3837  {
3838  /* Store abbreviated key representation */
3839  stup->datum1 = state->sortKeys->abbrev_converter(original,
3840  state->sortKeys);
3841  }
3842  else
3843  {
3844  /* Abort abbreviation */
3845  int i;
3846 
3847  stup->datum1 = original;
3848 
3849  /*
3850  * Set state to be consistent with never trying abbreviation.
3851  *
3852  * Alter datum1 representation in already-copied tuples, so as to
3853  * ensure a consistent representation (current tuple was just
3854  * handled). It does not matter if some dumped tuples are already
3855  * sorted on tape, since serialized tuples lack abbreviated keys
3856  * (TSS_BUILDRUNS state prevents control reaching here in any case).
3857  */
3858  for (i = 0; i < state->memtupcount; i++)
3859  {
3860  SortTuple *mtup = &state->memtuples[i];
3861 
3862  htup.t_len = ((MinimalTuple) mtup->tuple)->t_len +
3863  MINIMAL_TUPLE_OFFSET;
3864  htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple -
3865  MINIMAL_TUPLE_OFFSET);
3866 
3867  mtup->datum1 = heap_getattr(&htup,
3868  state->sortKeys[0].ssup_attno,
3869  state->tupDesc,
3870  &mtup->isnull1);
3871  }
3872  }
3873 }
3874 
3875 static void
3876  writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup)
3877 {
3878  MinimalTuple tuple = (MinimalTuple) stup->tuple;
3879 
3880  /* the part of the MinimalTuple we'll write: */
3881  char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
3882  unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET;
3883 
3884  /* total on-disk footprint: */
3885  unsigned int tuplen = tupbodylen + sizeof(int);
3886 
3887  LogicalTapeWrite(state->tapeset, tapenum,
3888  (void *) &tuplen, sizeof(tuplen));
3889  LogicalTapeWrite(state->tapeset, tapenum,
3890  (void *) tupbody, tupbodylen);
3891  if (state->randomAccess) /* need trailing length word? */
3892  LogicalTapeWrite(state->tapeset, tapenum,
3893  (void *) &tuplen, sizeof(tuplen));
3894 
3895  if (!state->slabAllocatorUsed)
3896  {
3897  FREEMEM(state, GetMemoryChunkSpace(tuple));
3898  heap_free_minimal_tuple(tuple);
3899  }
3900 }
3901 
3902 static void
3903  readtup_heap(Tuplesortstate *state, SortTuple *stup,
3904  int tapenum, unsigned int len)
3905 {
3906  unsigned int tupbodylen = len - sizeof(int);
3907  unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET;
3908  MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tuplen);
3909  char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
3910  HeapTupleData htup;
3911 
3912  /* read in the tuple proper */
3913  tuple->t_len = tuplen;
3914  LogicalTapeReadExact(state->tapeset, tapenum,
3915  tupbody, tupbodylen);
3916  if (state->randomAccess) /* need trailing length word? */
3917  LogicalTapeReadExact(state->tapeset, tapenum,
3918  &tuplen, sizeof(tuplen));
3919  stup->tuple = (void *) tuple;
3920  /* set up first-column key value */
3921  htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
3922  htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
3923  stup->datum1 = heap_getattr(&htup,
3924  state->sortKeys[0].ssup_attno,
3925  state->tupDesc,
3926  &stup->isnull1);
3927 }
3928 
3929 /*
3930  * Routines specialized for the CLUSTER case (HeapTuple data, with
3931  * comparisons per a btree index definition)
3932  */
3933 
3934 static int
3935  comparetup_cluster(const SortTuple *a, const SortTuple *b,
3936  Tuplesortstate *state)
3937 {
3938  SortSupport sortKey = state->sortKeys;
3939  HeapTuple ltup;
3940  HeapTuple rtup;
3941  TupleDesc tupDesc;
3942  int nkey;
3943  int32 compare;
3944  Datum datum1,
3945  datum2;
3946  bool isnull1,
3947  isnull2;
3948  AttrNumber leading = state->indexInfo->ii_IndexAttrNumbers[0];
3949 
3950  /* Be prepared to compare additional sort keys */
3951  ltup = (HeapTuple) a->tuple;
3952  rtup = (HeapTuple) b->tuple;
3953  tupDesc = state->tupDesc;
3954 
3955  /* Compare the leading sort key, if it's simple */
3956  if (leading != 0)
3957  {
3958  compare = ApplySortComparator(a->datum1, a->isnull1,
3959  b->datum1, b->isnull1,
3960  sortKey);
3961  if (compare != 0)
3962  return compare;
3963 
3964  if (sortKey->abbrev_converter)
3965  {
3966  datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1);
3967  datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2);
3968 
3969  compare = ApplySortAbbrevFullComparator(datum1, isnull1,
3970  datum2, isnull2,
3971  sortKey);
3972  }
3973  if (compare != 0 || state->nKeys == 1)
3974  return compare;
3975  /* Compare additional columns the hard way */
3976  sortKey++;
3977  nkey = 1;
3978  }
3979  else
3980  {
3981  /* Must compare all keys the hard way */
3982  nkey = 0;
3983  }
3984 
3985  if (state->indexInfo->ii_Expressions == NULL)
3986  {
3987  /* If not expression index, just compare the proper heap attrs */
3988 
3989  for (; nkey < state->nKeys; nkey++, sortKey++)
3990  {
3991  AttrNumber attno = state->indexInfo->ii_IndexAttrNumbers[nkey];
3992 
3993  datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1);
3994  datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2);
3995 
3996  compare = ApplySortComparator(datum1, isnull1,
3997  datum2, isnull2,
3998  sortKey);
3999  if (compare != 0)
4000  return compare;
4001  }
4002  }
4003  else
4004  {
4005  /*
4006  * In the expression index case, compute the whole index tuple and
4007  * then compare values. It would perhaps be faster to compute only as
4008  * many columns as we need to compare, but that would require
4009  * duplicating all the logic in FormIndexDatum.
4010  */
4011  Datum l_index_values[INDEX_MAX_KEYS];
4012  bool l_index_isnull[INDEX_MAX_KEYS];
4013  Datum r_index_values[INDEX_MAX_KEYS];
4014  bool r_index_isnull[INDEX_MAX_KEYS];
4015  TupleTableSlot *ecxt_scantuple;
4016 
4017  /* Reset context each time to prevent memory leakage */
4018  ResetPerTupleExprContext(state->estate);
4019 
4020  ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple;
4021 
4022  ExecStoreHeapTuple(ltup, ecxt_scantuple, false);
4023  FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
4024  l_index_values, l_index_isnull);
4025 
4026  ExecStoreHeapTuple(rtup, ecxt_scantuple, false);
4027  FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
4028  r_index_values, r_index_isnull);
4029 
4030  for (; nkey < state->nKeys; nkey++, sortKey++)
4031  {
4032  compare = ApplySortComparator(l_index_values[nkey],
4033  l_index_isnull[nkey],
4034  r_index_values[nkey],
4035  r_index_isnull[nkey],
4036  sortKey);
4037  if (compare != 0)
4038  return compare;
4039  }
4040  }
4041 
4042  return 0;
4043 }
4044 
4045 static void
4046  copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup)
4047 {
4048  HeapTuple tuple = (HeapTuple) tup;
4049  Datum original;
4050  MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
4051 
4052  /* copy the tuple into sort storage */
4053  tuple = heap_copytuple(tuple);
4054  stup->tuple = (void *) tuple;
4055  USEMEM(state, GetMemoryChunkSpace(tuple));
4056 
4057  MemoryContextSwitchTo(oldcontext);
4058 
4059  /*
4060  * set up first-column key value, and potentially abbreviate, if it's a
4061  * simple column
4062  */
4063  if (state->indexInfo->ii_IndexAttrNumbers[0] == 0)
4064  return;
4065 
4066  original = heap_getattr(tuple,
4067  state->indexInfo->ii_IndexAttrNumbers[0],
4068  state->tupDesc,
4069  &stup->isnull1);
4070 
4071  if (!state->sortKeys->abbrev_converter || stup->isnull1)
4072  {
4073  /*
4074  * Store ordinary Datum representation, or NULL value. If there is a
4075  * converter it won't expect NULL values, and cost model is not
4076  * required to account for NULL, so in that case we avoid calling
4077  * converter and just set datum1 to zeroed representation (to be
4078  * consistent, and to support cheap inequality tests for NULL
4079  * abbreviated keys).
4080  */
4081  stup->datum1 = original;
4082  }
4083  else if (!consider_abort_common(state))
4084  {
4085  /* Store abbreviated key representation */
4086  stup->datum1 = state->sortKeys->abbrev_converter(original,
4087  state->sortKeys);
4088  }
4089  else
4090  {
4091  /* Abort abbreviation */
4092  int i;
4093 
4094  stup->datum1 = original;
4095 
4096  /*
4097  * Set state to be consistent with never trying abbreviation.
4098  *
4099  * Alter datum1 representation in already-copied tuples, so as to
4100  * ensure a consistent representation (current tuple was just
4101  * handled). It does not matter if some dumped tuples are already
4102  * sorted on tape, since serialized tuples lack abbreviated keys
4103  * (TSS_BUILDRUNS state prevents control reaching here in any case).
4104  */
4105  for (i = 0; i < state->memtupcount; i++)
4106  {
4107  SortTuple *mtup = &state->memtuples[i];
4108 
4109  tuple = (HeapTuple) mtup->tuple;
4110  mtup->datum1 = heap_getattr(tuple,
4111  state->indexInfo->ii_IndexAttrNumbers[0],
4112  state->tupDesc,
4113  &mtup->isnull1);
4114  }
4115  }
4116 }
4117 
4118 static void
4119  writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup)
4120 {
4121  HeapTuple tuple = (HeapTuple) stup->tuple;
4122  unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int);
4123 
4124  /* We need to store t_self, but not other fields of HeapTupleData */
4125  LogicalTapeWrite(state->tapeset, tapenum,
4126  &tuplen, sizeof(tuplen));
4127  LogicalTapeWrite(state->tapeset, tapenum,
4128  &tuple->t_self, sizeof(ItemPointerData));
4129  LogicalTapeWrite(state->tapeset, tapenum,
4130  tuple->t_data, tuple->t_len);
4131  if (state->randomAccess) /* need trailing length word? */
4132  LogicalTapeWrite(state->tapeset, tapenum,
4133  &tuplen, sizeof(tuplen));
4134 
4135  if (!state->slabAllocatorUsed)
4136  {
4137  FREEMEM(state, GetMemoryChunkSpace(tuple));
4138  heap_freetuple(tuple);
4139  }
4140 }
4141 
4142 static void
4143  readtup_cluster(Tuplesortstate *state, SortTuple *stup,
4144  int tapenum, unsigned int tuplen)
4145 {
4146  unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int);
4147  HeapTuple tuple = (HeapTuple) readtup_alloc(state,
4148  t_len + HEAPTUPLESIZE);
4149 
4150  /* Reconstruct the HeapTupleData header */
4151  tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
4152  tuple->t_len = t_len;
4153  LogicalTapeReadExact(state->tapeset, tapenum,
4154  &tuple->t_self, sizeof(ItemPointerData));
4155  /* We don't currently bother to reconstruct t_tableOid */
4156  tuple->t_tableOid = InvalidOid;
4157  /* Read in the tuple body */
4158  LogicalTapeReadExact(state->tapeset, tapenum,
4159  tuple->t_data, tuple->t_len);
4160  if (state->randomAccess) /* need trailing length word? */
4161  LogicalTapeReadExact(state->tapeset, tapenum,
4162  &tuplen, sizeof(tuplen));
4163  stup->tuple = (void *) tuple;
4164  /* set up first-column key value, if it's a simple column */
4165  if (state->indexInfo->ii_IndexAttrNumbers[0] != 0)
4166  stup->datum1 = heap_getattr(tuple,
4167  state->indexInfo->ii_IndexAttrNumbers[0],
4168  state->tupDesc,
4169  &stup->isnull1);
4170 }
4171 
4172 /*
4173  * Routines specialized for IndexTuple case
4174  *
4175  * The btree and hash cases require separate comparison functions, but the
4176  * IndexTuple representation is the same so the copy/write/read support
4177  * functions can be shared.
4178  */
4179 
4180 static int
4181  comparetup_index_btree(const SortTuple *a, const SortTuple *b,
4182  Tuplesortstate *state)
4183 {
4184  /*
4185  * This is similar to comparetup_heap(), but expects index tuples. There
4186  * is also special handling for enforcing uniqueness, and special
4187  * treatment for equal keys at the end.
4188  */
4189  SortSupport sortKey = state->sortKeys;
4190  IndexTuple tuple1;
4191  IndexTuple tuple2;
4192  int keysz;
4193  TupleDesc tupDes;
4194  bool equal_hasnull = false;
4195  int nkey;
4196  int32 compare;
4197  Datum datum1,
4198  datum2;
4199  bool isnull1,
4200  isnull2;
4201 
4202 
4203  /* Compare the leading sort key */
4204  compare = ApplySortComparator(a->datum1, a->isnull1,
4205  b->datum1, b->isnull1,
4206  sortKey);
4207  if (compare != 0)
4208  return compare;
4209 
4210  /* Compare additional sort keys */
4211  tuple1 = (IndexTuple) a->tuple;
4212  tuple2 = (IndexTuple) b->tuple;
4213  keysz = state->nKeys;
4214  tupDes = RelationGetDescr(state->indexRel);
4215 
4216  if (sortKey->abbrev_converter)
4217  {
4218  datum1 = index_getattr(tuple1, 1, tupDes, &isnull1);
4219  datum2 = index_getattr(tuple2, 1, tupDes, &isnull2);
4220 
4221  compare = ApplySortAbbrevFullComparator(datum1, isnull1,
4222  datum2, isnull2,
4223  sortKey);
4224  if (compare != 0)
4225  return compare;
4226  }
4227 
4228  /* they are equal, so we only need to examine one null flag */
4229  if (a->isnull1)
4230  equal_hasnull = true;
4231 
4232  sortKey++;
4233  for (nkey = 2; nkey <= keysz; nkey++, sortKey++)
4234  {
4235  datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1);
4236  datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2);
4237 
4238  compare = ApplySortComparator(datum1, isnull1,
4239  datum2, isnull2,
4240  sortKey);
4241  if (compare != 0)
4242  return compare; /* done when we find unequal attributes */
4243 
4244  /* they are equal, so we only need to examine one null flag */
4245  if (isnull1)
4246  equal_hasnull = true;
4247  }
4248 
4249  /*
4250  * If btree has asked us to enforce uniqueness, complain if two equal
4251  * tuples are detected (unless there was at least one NULL field).
4252  *
4253  * It is sufficient to make the test here, because if two tuples are equal
4254  * they *must* get compared at some stage of the sort --- otherwise the
4255  * sort algorithm wouldn't have checked whether one must appear before the
4256  * other.
4257  */
4258  if (state->enforceUnique && !equal_hasnull)
4259  {
4260  Datum values[INDEX_MAX_KEYS];
4261  bool isnull[INDEX_MAX_KEYS];
4262  char *key_desc;
4263 
4264  /*
4265  * Some rather brain-dead implementations of qsort (such as the one in
4266  * QNX 4) will sometimes call the comparison routine to compare a
4267  * value to itself, but we always use our own implementation, which
4268  * does not.
4269  */
4270  Assert(tuple1 != tuple2);
4271 
4272  index_deform_tuple(tuple1, tupDes, values, isnull);
4273 
4274  key_desc = BuildIndexValueDescription(state->indexRel, values, isnull);
4275 
4276  ereport(ERROR,
4277  (errcode(ERRCODE_UNIQUE_VIOLATION),
4278  errmsg("could not create unique index \"%s\"",
4279  RelationGetRelationName(state->indexRel)),
4280  key_desc ? errdetail("Key %s is duplicated.", key_desc) :
4281  errdetail("Duplicate keys exist."),
4282  errtableconstraint(state->heapRel,
4283  RelationGetRelationName(state->indexRel))));
4284  }
4285 
4286  /*
4287  * If key values are equal, we sort on ItemPointer. This is required for
4288  * btree indexes, since heap TID is treated as an implicit last key
4289  * attribute in order to ensure that all keys in the index are physically
4290  * unique.
4291  */
4292  {
4293  BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
4294  BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
4295 
4296  if (blk1 != blk2)
4297  return (blk1 < blk2) ? -1 : 1;
4298  }
4299  {
4300  OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
4301  OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);
4302 
4303  if (pos1 != pos2)
4304  return (pos1 < pos2) ? -1 : 1;
4305  }
4306 
4307  /* ItemPointer values should never be equal */
4308  Assert(false);
4309 
4310  return 0;
4311 }
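The TID tiebreak above is a lexicographic comparison on (block number, offset number). A standalone editorial sketch of the same ordering, with Tid and compare_tid as invented stand-ins for ItemPointer and the inline comparison:

    #include <stdio.h>
    #include <stdint.h>

    typedef struct
    {
        uint32_t    block;      /* analogue of the ItemPointer block number */
        uint16_t    offset;     /* analogue of the ItemPointer offset number */
    } Tid;

    /* Lexicographic comparison on (block, offset), as the TID tiebreak does. */
    static int
    compare_tid(Tid a, Tid b)
    {
        if (a.block != b.block)
            return (a.block < b.block) ? -1 : 1;
        if (a.offset != b.offset)
            return (a.offset < b.offset) ? -1 : 1;
        return 0;
    }

    int
    main(void)
    {
        Tid         t1 = {10, 3};
        Tid         t2 = {10, 7};
        Tid         t3 = {11, 1};

        printf("%d %d %d\n", compare_tid(t1, t2), compare_tid(t2, t3),
               compare_tid(t1, t1));    /* -1 -1 0 */
        return 0;
    }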
4312 
4313 static int
4314  comparetup_index_hash(const SortTuple *a, const SortTuple *b,
4315  Tuplesortstate *state)
4316 {
4317  Bucket bucket1;
4318  Bucket bucket2;
4319  IndexTuple tuple1;
4320  IndexTuple tuple2;
4321 
4322  /*
4323  * Fetch hash keys and mask off bits we don't want to sort by. We know
4324  * that the first column of the index tuple is the hash key.
4325  */
4326  Assert(!a->isnull1);
4327  bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1),
4328  state->max_buckets, state->high_mask,
4329  state->low_mask);
4330  Assert(!b->isnull1);
4331  bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1),
4332  state->max_buckets, state->high_mask,
4333  state->low_mask);
4334  if (bucket1 > bucket2)
4335  return 1;
4336  else if (bucket1 < bucket2)
4337  return -1;
4338 
4339  /*
4340  * If hash values are equal, we sort on ItemPointer. This does not affect
4341  * validity of the finished index, but it may be useful to have index
4342  * scans in physical order.
4343  */
4344  tuple1 = (IndexTuple) a->tuple;
4345  tuple2 = (IndexTuple) b->tuple;
4346 
4347  {
4348  BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
4349  BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
4350 
4351  if (blk1 != blk2)
4352  return (blk1 < blk2) ? -1 : 1;
4353  }
4354  {
4355  OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
4356  OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);
4357 
4358  if (pos1 != pos2)
4359  return (pos1 < pos2) ? -1 : 1;
4360  }
4361 
4362  /* ItemPointer values should never be equal */
4363  Assert(false);
4364 
4365  return 0;
4366 }
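The bucket mapping itself lives in hashutil.c; the following standalone editorial sketch (hashkey_to_bucket is an invented name, written in the spirit of _hash_hashkey2bucket() rather than copied from it) shows the masking idea: take the hash modulo the larger power-of-two table via the high mask, and fall back to the low mask when that bucket does not exist yet.

    #include <stdio.h>
    #include <stdint.h>

    /* Simplified sketch of mapping a hash key to a bucket using two masks. */
    static uint32_t
    hashkey_to_bucket(uint32_t hashkey, uint32_t maxbucket,
                      uint32_t highmask, uint32_t lowmask)
    {
        uint32_t    bucket = hashkey & highmask;

        if (bucket > maxbucket)
            bucket &= lowmask;
        return bucket;
    }

    int
    main(void)
    {
        /* 6 buckets exist (0..5): lowmask covers 4 buckets, highmask covers 8. */
        uint32_t    maxbucket = 5;
        uint32_t    lowmask = 3;    /* 0b011 */
        uint32_t    highmask = 7;   /* 0b111 */

        printf("%u\n", hashkey_to_bucket(42, maxbucket, highmask, lowmask));   /* 42 & 7 = 2 */
        printf("%u\n", hashkey_to_bucket(47, maxbucket, highmask, lowmask));   /* 47 & 7 = 7 > 5, so 7 & 3 = 3 */
        return 0;
    }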
4367 
4368 static void
4369  copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup)
4370 {
4371  /* Not currently needed */
4372  elog(ERROR, "copytup_index() should not be called");
4373 }
4374 
4375 static void
4376  writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup)
4377 {
4378  IndexTuple tuple = (IndexTuple) stup->tuple;
4379  unsigned int tuplen;
4380 
4381  tuplen = IndexTupleSize(tuple) + sizeof(tuplen);
4382  LogicalTapeWrite(state->tapeset, tapenum,
4383  (void *) &tuplen, sizeof(tuplen));
4384  LogicalTapeWrite(state->tapeset, tapenum,
4385  (void *) tuple, IndexTupleSize(tuple));
4386  if (state->randomAccess) /* need trailing length word? */
4387  LogicalTapeWrite(state->tapeset, tapenum,
4388  (void *) &tuplen, sizeof(tuplen));
4389 
4390  if (!state->slabAllocatorUsed)
4391  {
4392  FREEMEM(state, GetMemoryChunkSpace(tuple));
4393  pfree(tuple);
4394  }
4395 }
4396 
4397 static void
4398  readtup_index(Tuplesortstate *state, SortTuple *stup,
4399  int tapenum, unsigned int len)
4400 {
4401  unsigned int tuplen = len - sizeof(unsigned int);
4402  IndexTuple tuple = (IndexTuple) readtup_alloc(state, tuplen);
4403 
4404  LogicalTapeReadExact(state->tapeset, tapenum,
4405  tuple, tuplen);
4406  if (state->randomAccess) /* need trailing length word? */
4407  LogicalTapeReadExact(state->tapeset, tapenum,
4408  &tuplen, sizeof(tuplen));
4409  stup->tuple = (void *) tuple;
4410  /* set up first-column key value */
4411  stup->datum1 = index_getattr(tuple,
4412  1,
4413  RelationGetDescr(state->indexRel),
4414  &stup->isnull1);
4415 }
4416 
4417 /*
4418  * Routines specialized for DatumTuple case
4419  */
4420 
4421 static int
4422  comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
4423 {
4424  int compare;
4425 
4426  compare = ApplySortComparator(a->datum1, a->isnull1,
4427  b->datum1, b->isnull1,
4428  state->sortKeys);
4429  if (compare != 0)
4430  return compare;
4431 
4432  /* if we have abbreviations, then "tuple" has the original value */
4433 
4434  if (state->sortKeys->abbrev_converter)
4435  compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1,
4436  PointerGetDatum(b->tuple), b->isnull1,
4437  state->sortKeys);
4438 
4439  return compare;
4440 }
4441 
4442 static void
4443  copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup)
4444 {
4445  /* Not currently needed */
4446  elog(ERROR, "copytup_datum() should not be called");
4447 }
4448 
4449 static void
4450  writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup)
4451 {
4452  void *waddr;
4453  unsigned int tuplen;
4454  unsigned int writtenlen;
4455 
4456  if (stup->isnull1)
4457  {
4458  waddr = NULL;
4459  tuplen = 0;
4460  }
4461  else if (!state->tuples)
4462  {
4463  waddr = &stup->datum1;
4464  tuplen = sizeof(Datum);
4465  }
4466  else
4467  {
4468  waddr = stup->tuple;
4469  tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, state->datumTypeLen);
4470  Assert(tuplen != 0);
4471  }
4472 
4473  writtenlen = tuplen + sizeof(unsigned int);
4474 
4475  LogicalTapeWrite(state->tapeset, tapenum,
4476  (void *) &writtenlen, sizeof(writtenlen));
4477  LogicalTapeWrite(state->tapeset, tapenum,
4478  waddr, tuplen);
4479  if (state->randomAccess) /* need trailing length word? */
4480  LogicalTapeWrite(state->tapeset, tapenum,
4481  (void *) &writtenlen, sizeof(writtenlen));
4482 
4483  if (!state->slabAllocatorUsed && stup->tuple)
4484  {
4485  FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
4486  pfree(stup->tuple);
4487  }
4488 }
4489 
4490 static void
4491  readtup_datum(Tuplesortstate *state, SortTuple *stup,
4492  int tapenum, unsigned int len)
4493 {
4494  unsigned int tuplen = len - sizeof(unsigned int);
4495 
4496  if (tuplen == 0)
4497  {
4498  /* it's NULL */
4499  stup->datum1 = (Datum) 0;
4500  stup->isnull1 = true;
4501  stup->tuple = NULL;
4502  }
4503  else if (!state->tuples)
4504  {
4505  Assert(tuplen == sizeof(Datum));
4506  LogicalTapeReadExact(state->tapeset, tapenum,
4507  &stup->datum1, tuplen);
4508  stup->isnull1 = false;
4509  stup->tuple = NULL;
4510  }
4511  else
4512  {
4513  void *raddr = readtup_alloc(state, tuplen);
4514 
4515  LogicalTapeReadExact(state->tapeset, tapenum,
4516  raddr, tuplen);
4517  stup->datum1 = PointerGetDatum(raddr);
4518  stup->isnull1 = false;
4519  stup->tuple = raddr;
4520  }
4521 
4522  if (state->randomAccess) /* need trailing length word? */
4523  LogicalTapeReadExact(state->tapeset, tapenum,
4524  &tuplen, sizeof(tuplen));
4525 }
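The Datum write/read pair above distinguishes three cases: a NULL (length word only), a pass-by-value Datum (the raw Datum bytes), and a pass-by-reference value (the pointed-to bytes). A standalone editorial sketch of that three-way encoding into a plain buffer, with encode_value as an invented name and sizes that are typical rather than guaranteed:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    /* Encode one value: a length word, then zero, sizeof(val), or reflen payload bytes. */
    static size_t
    encode_value(char *out, int isnull, int byval, uint64_t val,
                 const void *ref, unsigned int reflen)
    {
        unsigned int len;
        const void *payload;

        if (isnull)
        {
            len = 0;
            payload = NULL;
        }
        else if (byval)
        {
            len = sizeof(val);
            payload = &val;
        }
        else
        {
            len = reflen;
            payload = ref;
        }
        memcpy(out, &len, sizeof(len));
        if (len > 0)
            memcpy(out + sizeof(len), payload, len);
        return sizeof(len) + len;
    }

    int
    main(void)
    {
        char        buf[64];
        size_t      n;

        n = encode_value(buf, 1, 0, 0, NULL, 0);            /* NULL */
        printf("null encodes to %zu bytes\n", n);           /* typically 4 */
        n = encode_value(buf, 0, 1, 42, NULL, 0);           /* pass by value */
        printf("by-value encodes to %zu bytes\n", n);       /* typically 12 */
        n = encode_value(buf, 0, 0, 0, "hello", 5);         /* pass by reference */
        printf("by-reference encodes to %zu bytes\n", n);   /* typically 9 */
        return 0;
    }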
4526 
4527 /*
4528  * Parallel sort routines
4529  */
4530 
4531 /*
4532  * tuplesort_estimate_shared - estimate required shared memory allocation
4533  *
4534  * nWorkers is an estimate of the number of workers (it's the number that
4535  * will be requested).
4536  */
4537 Size
4538  tuplesort_estimate_shared(int nWorkers)
4539 {
4540  Size tapesSize;
4541 
4542  Assert(nWorkers > 0);
4543 
4544  /* Make sure that BufFile shared state is MAXALIGN'd */
4545  tapesSize = mul_size(sizeof(TapeShare), nWorkers);
4546  tapesSize = MAXALIGN(add_size(tapesSize, offsetof(Sharedsort, tapes)));
4547 
4548  return tapesSize;
4549 }
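The sizing arithmetic above is a fixed header plus one per-worker array element, measured with offsetof over a flexible array member and rounded up to the platform alignment. A standalone editorial sketch of the same calculation; FakeSharedsort, FakeTapeShare, and ALIGN_UP are invented stand-ins for Sharedsort, TapeShare, and MAXALIGN, and the real code additionally uses the overflow-checked add_size()/mul_size() helpers.

    #include <stdio.h>
    #include <stddef.h>

    #define ALIGNVAL 8
    #define ALIGN_UP(x) (((x) + ALIGNVAL - 1) & ~(size_t) (ALIGNVAL - 1))

    typedef struct
    {
        long        firstblocknumber;   /* stand-in for TapeShare */
    } FakeTapeShare;

    typedef struct
    {
        int         currentWorker;      /* stand-in header fields */
        int         workersFinished;
        int         nTapes;
        FakeTapeShare tapes[];          /* flexible array member */
    } FakeSharedsort;

    /* Estimate the allocation for nWorkers, mirroring tuplesort_estimate_shared(). */
    static size_t
    estimate_shared(int nWorkers)
    {
        size_t      size = sizeof(FakeTapeShare) * nWorkers;

        size += offsetof(FakeSharedsort, tapes);
        return ALIGN_UP(size);
    }

    int
    main(void)
    {
        printf("%zu\n", estimate_shared(1));
        printf("%zu\n", estimate_shared(4));
        return 0;
    }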
4550 
4551 /*
4552  * tuplesort_initialize_shared - initialize shared tuplesort state
4553  *
4554  * Must be called from leader process before workers are launched, to
4555  * establish state needed up-front for worker tuplesortstates. nWorkers
4556  * should match the argument passed to tuplesort_estimate_shared().
4557  */
4558 void
4559  tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg)
4560 {
4561  int i;
4562 
4563  Assert(nWorkers > 0);
4564 
4565  SpinLockInit(&shared->mutex);
4566  shared->currentWorker = 0;
4567  shared->workersFinished = 0;
4568  SharedFileSetInit(&shared->fileset, seg);
4569  shared->nTapes = nWorkers;
4570  for (i = 0; i < nWorkers; i++)
4571  {
4572  shared->tapes[i].firstblocknumber = 0L;
4573  }
4574 }
4575 
4576 /*
4577  * tuplesort_attach_shared - attach to shared tuplesort state
4578  *
4579  * Must be called by all worker processes.
4580  */
4581 void
4582  tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg)
4583 {
4584  /* Attach to SharedFileSet */
4585  SharedFileSetAttach(&shared->fileset, seg);
4586 }
4587 
4588 /*
4589  * worker_get_identifier - Assign and return ordinal identifier for worker
4590  *
4591  * The order in which these are assigned is not well defined, and should not
4592  * matter; worker numbers across parallel sort participants need only be
4593  * distinct and gapless. logtape.c requires this.
4594  *
4595  * Note that the identifiers assigned from here have no relation to
4596  * ParallelWorkerNumber number, to avoid making any assumption about
4597  * caller's requirements. However, we do follow the ParallelWorkerNumber
4598  * convention of representing a non-worker with worker number -1. This
4599  * includes the leader, as well as serial Tuplesort processes.
4600  */
4601 static int
4602  worker_get_identifier(Tuplesortstate *state)
4603 {
4604  Sharedsort *shared = state->shared;
4605  int worker;
4606 
4607  Assert(WORKER(state));
4608 
4609  SpinLockAcquire(&shared->mutex);
4610  worker = shared->currentWorker++;
4611  SpinLockRelease(&shared->mutex);
4612 
4613  return worker;
4614 }
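Distinct and gapless identifiers fall out of a shared counter that each worker increments under mutual exclusion. A standalone editorial sketch of the same idea using a C11 atomic in place of the spinlock; currentWorker and get_identifier are invented stand-ins for the shared field and the function above.

    #include <stdio.h>
    #include <stdatomic.h>

    static atomic_int currentWorker;    /* stand-in for the spinlock-protected field */

    /* Hand out the next ordinal identifier, as worker_get_identifier() does. */
    static int
    get_identifier(void)
    {
        return atomic_fetch_add(&currentWorker, 1);
    }

    int
    main(void)
    {
        /* Simulate three workers attaching: ids come out distinct and gapless. */
        for (int i = 0; i < 3; i++)
            printf("worker id %d\n", get_identifier());     /* 0, 1, 2 */
        return 0;
    }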
4615 
4616 /*
4617  * worker_freeze_result_tape - freeze worker's result tape for leader
4618  *
4619  * This is called by workers just after the result tape has been determined,
4620  * instead of calling LogicalTapeFreeze() directly. They do so because
4621  * workers require a few additional steps over similar serial
4622  * TSS_SORTEDONTAPE external sort cases, which also happen here. The extra
4623  * steps are around freeing now unneeded resources, and representing to
4624  * leader that worker's input run is available for its merge.
4625  *
4626  * There should only be one final output run for each worker, which consists
4627  * of all tuples that were originally input into the worker.
4628  */
4629 static void
4630  worker_freeze_result_tape(Tuplesortstate *state)
4631 {
4632  Sharedsort *shared = state->shared;
4633  TapeShare output;
4634 
4635  Assert(WORKER(state));
4636  Assert(state->result_tape != -1);
4637  Assert(state->memtupcount == 0);
4638 
4639  /*
4640  * Free most remaining memory, in case caller is sensitive to our holding
4641  * on to it. memtuples may not be a tiny merge heap at this point.
4642  */
4643  pfree(state->memtuples);
4644  /* Be tidy */
4645  state->memtuples = NULL;
4646  state->memtupsize = 0;
4647 
4648  /*
4649  * Parallel worker requires result tape metadata, which is to be stored in
4650  * shared memory for leader
4651  */
4652  LogicalTapeFreeze(state->tapeset, state->result_tape, &output);
4653 
4654  /* Store properties of output tape, and update finished worker count */
4655  SpinLockAcquire(&shared->mutex);
4656  shared->tapes[state->worker] = output;
4657  shared->workersFinished++;
4658  SpinLockRelease(&shared->mutex);
4659 }
4660 
4661 /*
4662  * worker_nomergeruns - dump memtuples in worker, without merging
4663  *
4664  * This is called as an alternative to mergeruns() with a worker when no
4665  * merging is required.
4666  */
4667 static void
4668  worker_nomergeruns(Tuplesortstate *state)
4669 {
4670  Assert(WORKER(state));
4671  Assert(state->result_tape == -1);
4672 
4673  state->result_tape = state->tp_tapenum[state->destTape];
4674  worker_freeze_result_tape(state);
4675 }
4676 
4677 /*
4678  * leader_takeover_tapes - create tapeset for leader from worker tapes
4679  *
4680  * So far, leader Tuplesortstate has performed no actual sorting. By now, all
4681  * sorting has occurred in workers, all of which must have already returned
4682  * from tuplesort_performsort().
4683  *
4684  * When this returns, leader process is left in a state that is virtually
4685  * indistinguishable from it having generated runs as a serial external sort
4686  * might have.
4687  */
4688 static void
4689  leader_takeover_tapes(Tuplesortstate *state)
4690 {
4691  Sharedsort *shared = state->shared;
4692  int nParticipants = state->nParticipants;
4693  int workersFinished;
4694  int j;
4695 
4696  Assert(LEADER(state));
4697  Assert(nParticipants >= 1);
4698 
4699  SpinLockAcquire(&shared->mutex);
4700  workersFinished = shared->workersFinished;
4701  SpinLockRelease(&shared->mutex);
4702 
4703  if (nParticipants != workersFinished)
4704  elog(ERROR, "cannot take over tapes before all workers finish");
4705 
4706  /*
4707  * Create the tapeset from worker tapes, including a leader-owned tape at
4708  * the end. Parallel workers are far more expensive than logical tapes,
4709  * so the number of tapes allocated here should never be excessive.
4710  *
4711  * We still have a leader tape, though it's not possible to write to it
4712  * due to restrictions in the shared fileset infrastructure used by
4713  * logtape.c. It will never be written to in practice because
4714  * randomAccess is disallowed for parallel sorts.
4715  */
4716  inittapestate(state, nParticipants + 1);
4717  state->tapeset = LogicalTapeSetCreate(nParticipants + 1, false,
4718  shared->tapes, &shared->fileset,
4719  state->worker);
4720 
4721  /* mergeruns() relies on currentRun for # of runs (in one-pass cases) */
4722  state->currentRun = nParticipants;
4723 
4724  /*
4725  * Initialize variables of Algorithm D to be consistent with runs from
4726  * workers having been generated in the leader.
4727  *
4728  * There will always be exactly 1 run per worker, and exactly one input
4729  * tape per run, because workers always output exactly 1 run, even when
4730  * there were no input tuples for workers to sort.
4731  */
4732  for (j = 0; j < state->maxTapes; j++)
4733  {
4734  /* One real run; no dummy runs for worker tapes */
4735  state->tp_fib[j] = 1;
4736  state->tp_runs[j] = 1;
4737  state->tp_dummy[j] = 0;
4738  state->tp_tapenum[j] = j;
4739  }
4740  /* Leader tape gets one dummy run, and no real runs */
4741  state->tp_fib[state->tapeRange] = 0;
4742  state->tp_runs[state->tapeRange] = 0;
4743  state->tp_dummy[state->tapeRange] = 1;
4744 
4745  state->Level = 1;
4746  state->destTape = 0;
4747 
4748  state->status = TSS_BUILDRUNS;
4749 }
4750 
4751 /*
4752  * Convenience routine to free a tuple previously loaded into sort memory
4753  */
4754 static void
4755  free_sort_tuple(Tuplesortstate *state, SortTuple *stup)
4756 {
4757  FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
4758  pfree(stup->tuple);
4759 }
bool tuplesort_used_bound(Tuplesortstate *state)
Definition: tuplesort.c:1361
struct SortSupportData * SortSupport
Definition: sortsupport.h:58
IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward)
Definition: tuplesort.c:2446
#define MINIMAL_TUPLE_DATA_OFFSET
Definition: htup_details.h:623
signed short int16
Definition: c.h:362
static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
Definition: tuplesort.c:4314
void FormIndexDatum(IndexInfo *indexInfo, TupleTableSlot *slot, EState *estate, Datum *values, bool *isnull)
Definition: index.c:2585
HeapTuple heap_copytuple(HeapTuple tuple)
Definition: heaptuple.c:680
bool ssup_nulls_first
Definition: sortsupport.h:75
#define DatumGetUInt32(X)
Definition: postgres.h:486
int slock_t
Definition: s_lock.h:934
LogicalTapeSet * LogicalTapeSetCreate(int ntapes, bool preallocate, TapeShare *shared, SharedFileSet *fileset, int worker)
Definition: logtape.c:685
size_t LogicalTapeRead(LogicalTapeSet *lts, int tapenum, void *ptr, size_t size)
Definition: logtape.c:977
static int comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
Definition: tuplesort.c:3736
static void dumptuples(Tuplesortstate *state, bool alltuples)
Definition: tuplesort.c:3155
static void reversedirection(Tuplesortstate *state)
Definition: tuplesort.c:3663
bool tuplesort_getdatum(Tuplesortstate *state, bool forward, Datum *val, bool *isNull, Datum *abbrev)
Definition: tuplesort.c:2475
int64 availMem
Definition: tuplesort.c:250
size_t read_buffer_size
Definition: tuplesort.c:350
TupSortStatus status
Definition: tuplesort.c:242
static void writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup)
Definition: tuplesort.c:4119
Relation heapRel
Definition: tuplesort.c:458
void tuplesort_performsort(Tuplesortstate *state)
Definition: tuplesort.c:2021
static bool grow_memtuples(Tuplesortstate *state)
Definition: tuplesort.c:1546
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:212
#define AllocSetContextCreate
Definition: memutils.h:170
HeapTupleData * HeapTuple
Definition: htup.h:71
HeapTuple tuplesort_getheaptuple(Tuplesortstate *state, bool forward)
Definition: tuplesort.c:2426
TupleTableSlot * ExecStoreMinimalTuple(MinimalTuple mtup, TupleTableSlot *slot, bool shouldFree)
Definition: execTuples.c:1416
bool isMaxSpaceDisk
Definition: tuplesort.c:256
#define BTGreaterStrategyNumber
Definition: stratnum.h:33
const char * tuplesort_space_type_name(TuplesortSpaceType t)
Definition: tuplesort.c:3426
void tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull)
Definition: tuplesort.c:1786
#define AssertState(condition)
Definition: c.h:749
char * slabMemoryEnd
Definition: tuplesort.c:346
Tuplesortstate * tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, bool nullsFirstFlag, int workMem, SortCoordinate coordinate, bool randomAccess)
Definition: tuplesort.c:1228
void tuplesort_restorepos(Tuplesortstate *state)
Definition: tuplesort.c:3325
static void mergeonerun(Tuplesortstate *state)
Definition: tuplesort.c:3019
static void worker_freeze_result_tape(Tuplesortstate *state)
Definition: tuplesort.c:4630
PGRUsage ru_start
Definition: tuplesort.c:481
#define SERIAL(state)
Definition: tuplesort.c:548
slock_t mutex
Definition: tuplesort.c:492
static TupleTableSlot * ExecClearTuple(TupleTableSlot *slot)
Definition: tuptable.h:425
static void sort_bounded_heap(Tuplesortstate *state)
Definition: tuplesort.c:3496
void SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg)
Definition: sharedfileset.c:64
TuplesortMethod
Definition: tuplesort.h:72
#define ResetPerTupleExprContext(estate)
Definition: executor.h:515
int64 abbrevNext
Definition: tuplesort.c:444
#define RelationGetDescr(relation)
Definition: rel.h:482
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:90
Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket, uint32 highmask, uint32 lowmask)
Definition: hashutil.c:126
TupleTableSlot * MakeSingleTupleTableSlot(TupleDesc tupdesc, const TupleTableSlotOps *tts_ops)
Definition: execTuples.c:1208
#define RelationGetNumberOfAttributes(relation)
Definition: rel.h:462
EState * estate
Definition: tuplesort.c:452
static void output(uint64 loop_count)
#define PointerGetDatum(X)
Definition: postgres.h:556
HeapTupleHeaderData * HeapTupleHeader
Definition: htup.h:23
static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup)
Definition: tuplesort.c:4369
int(* SortTupleComparator)(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
Definition: tuplesort.c:234
SortSupport sortKeys
Definition: tuplesort.c:430
MemoryContext maincontext
Definition: tuplesort.c:260
#define SpinLockInit(lock)
Definition: spin.h:60
ItemPointerData t_tid
Definition: itup.h:37
void(* copytup)(Tuplesortstate *state, SortTuple *stup, void *tup)
Definition: tuplesort.c:283
#define Min(x, y)
Definition: c.h:928
char buffer[SLAB_SLOT_SIZE]
Definition: tuplesort.c:201
void PrepareSortSupportFromOrderingOp(Oid orderingOp, SortSupport ssup)
Definition: sortsupport.c:135
bool randomAccess
Definition: tuplesort.c:244
int64 maxSpace
Definition: tuplesort.c:254
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:109
Datum datum1
Definition: tuplesort.c:180
Tuplesortstate * tuplesort_begin_cluster(TupleDesc tupDesc, Relation indexRel, int workMem, SortCoordinate coordinate, bool randomAccess)
Definition: tuplesort.c:952
SortTupleComparator comparetup
Definition: tuplesort.c:275
#define CLUSTER_SORT
Definition: tuplesort.c:122
Sharedsort * sharedsort
Definition: tuplesort.h:55
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:284
int errcode(int sqlerrcode)
Definition: elog.c:610
#define LogicalTapeReadExact(tapeset, tapenum, ptr, len)
Definition: tuplesort.c:602
bool growmemtuples
Definition: tuplesort.c:314
Tuplesortstate * tuplesort_begin_index_hash(Relation heapRel, Relation indexRel, uint32 high_mask, uint32 low_mask, uint32 max_buckets, int workMem, SortCoordinate coordinate, bool randomAccess)
Definition: tuplesort.c:1125
Size GetMemoryChunkSpace(void *pointer)
Definition: mcxt.c:428
long firstblocknumber
Definition: logtape.h:50
#define SLAB_SLOT_SIZE
Definition: tuplesort.c:196
static int comparetup_cluster(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
Definition: tuplesort.c:3935
void MemoryContextReset(MemoryContext context)
Definition: mcxt.c:137
uint32 BlockNumber
Definition: block.h:31
#define INDEX_SORT
Definition: tuplesort.c:120
IndexInfo * BuildIndexInfo(Relation index)
Definition: index.c:2301
static void inittapes(Tuplesortstate *state, bool mergeruns)
Definition: tuplesort.c:2623
void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg)
Definition: tuplesort.c:4559
#define LOG
Definition: elog.h:26
Form_pg_class rd_rel
Definition: rel.h:109
void heap_freetuple(HeapTuple htup)
Definition: heaptuple.c:1338
unsigned int Oid
Definition: postgres_ext.h:31
#define HEAP_SORT
Definition: tuplesort.c:119
static int worker_get_identifier(Tuplesortstate *state)
Definition: tuplesort.c:4602
static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup)
Definition: tuplesort.c:4046
static int comparetup_index_btree(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
Definition: tuplesort.c:4181
bool isnull1
Definition: tuplesort.c:181
void tuplesort_get_stats(Tuplesortstate *state, TuplesortInstrumentation *stats)
Definition: tuplesort.c:3359
const char * tuplesort_method_name(TuplesortMethod m)
Definition: tuplesort.c:3403
MinimalTuple heap_copy_minimal_tuple(MinimalTuple mtup)
Definition: heaptuple.c:1439
void MemoryContextResetOnly(MemoryContext context)
Definition: mcxt.c:156
int markpos_offset
Definition: tuplesort.c:402
bool trace_sort
Definition: tuplesort.c:140
static void mergeruns(Tuplesortstate *state)
Definition: tuplesort.c:2792
void LogicalTapeRewindForWrite(LogicalTapeSet *lts, int tapenum)
Definition: logtape.c:951
static void markrunend(Tuplesortstate *state, int tapenum)
Definition: tuplesort.c:3694
static void tuplesort_updatemax(Tuplesortstate *state)
Definition: tuplesort.c:1462
static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK)
Definition: tuplesort.c:3681
#define PARALLEL_SORT(state)
Definition: tuplesort.c:125
#define MaxAllocHugeSize
Definition: memutils.h:44
signed int int32
Definition: c.h:363
static void init_slab_allocator(Tuplesortstate *state, int numSlots)
Definition: tuplesort.c:2756
#define DATUM_SORT
Definition: tuplesort.c:121
uint16 OffsetNumber
Definition: off.h:24
HeapTupleHeader t_data
Definition: htup.h:68
void pg_rusage_init(PGRUsage *ru0)
Definition: pg_rusage.c:27
Tuplesortstate * tuplesort_begin_heap(TupleDesc tupDesc, int nkeys, AttrNumber *attNums, Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags, int workMem, SortCoordinate coordinate, bool randomAccess)
Definition: tuplesort.c:878
uint32 Bucket
Definition: hash.h:35
IndexTuple index_form_tuple(TupleDesc tupleDescriptor, Datum *values, bool *isnull)
Definition: indextuple.c:47
static void tuplesort_begin_batch(Tuplesortstate *state)
Definition: tuplesort.c:814
void FreeExecutorState(EState *estate)
Definition: execUtils.c:185
#define GetPerTupleExprContext(estate)
Definition: executor.h:506
void * tuple
Definition: tuplesort.c:179
int errtableconstraint(Relation rel, const char *conname)
Definition: relcache.c:5551
#define SpinLockAcquire(lock)
Definition: spin.h:62
#define TAPE_BUFFER_OVERHEAD
Definition: tuplesort.c:231
static bool consider_abort_common(Tuplesortstate *state)
Definition: tuplesort.c:1977
Tuplesortstate * tuplesort_begin_index_gist(Relation heapRel, Relation indexRel, int workMem, SortCoordinate coordinate, bool randomAccess)
Definition: tuplesort.c:1171
void pfree(void *pointer)
Definition: mcxt.c:1057
union SlabSlot * nextfree
Definition: tuplesort.c:200
TupSortStatus maxSpaceStatus
Definition: tuplesort.c:259
static int compare(const void *arg1, const void *arg2)
Definition: geqo_pool.c:145
void LogicalTapeWrite(LogicalTapeSet *lts, int tapenum, void *ptr, size_t size)
Definition: logtape.c:775
Oid * rd_indcollation
Definition: rel.h:199
static void readtup_cluster(Tuplesortstate *state, SortTuple *stup, int tapenum, unsigned int len)
Definition: tuplesort.c:4143
#define ERROR
Definition: elog.h:43
void PrepareTempTablespaces(void)
Definition: tablespace.c:1326
void tuplesort_reset(Tuplesortstate *state)
Definition: tuplesort.c:1513
void PrepareSortSupportFromGistIndexRel(Relation indexRel, SortSupport ssup)
Definition: sortsupport.c:189
void heap_free_minimal_tuple(MinimalTuple mtup)
Definition: heaptuple.c:1427
void * lastReturnedTuple
Definition: tuplesort.c:358
MemoryContext sortcontext
Definition: tuplesort.c:262
static void writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup)
Definition: tuplesort.c:3876
static void writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup)
Definition: tuplesort.c:4376
MemoryContext ssup_cxt
Definition: sortsupport.h:66
uint32 high_mask
Definition: tuplesort.c:465
#define MERGE_BUFFER_SIZE
Definition: tuplesort.c:232
ItemPointerData t_self
Definition: htup.h:65
static int comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
Definition: tuplesort.c:4422
void tuplesort_rescan(Tuplesortstate *state)
Definition: tuplesort.c:3258
void(* readtup)(Tuplesortstate *state, SortTuple *stup, int tapenum, unsigned int len)
Definition: tuplesort.c:301
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:192
bool tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy, TupleTableSlot *slot, Datum *abbrev)
Definition: tuplesort.c:2389
int(* comparator)(Datum x, Datum y, SortSupport ssup)
Definition: sortsupport.h:106
static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup)
Definition: tuplesort.c:4755
static void readtup_index(Tuplesortstate *state, SortTuple *stup, int tapenum, unsigned int len)
Definition: tuplesort.c:4398
uint32 t_len
Definition: htup.h:64
static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup)
Definition: tuplesort.c:4443
static MinimalTuple ExecCopySlotMinimalTuple(TupleTableSlot *slot)
Definition: tuptable.h:463
void tuplesort_set_bound(Tuplesortstate *state, int64 bound)
Definition: tuplesort.c:1315
int(* abbrev_full_comparator)(Datum x, Datum y, SortSupport ssup)
Definition: sortsupport.h:191
static char * buf
Definition: pg_test_fsync.c:68
Sharedsort * shared
Definition: tuplesort.c:421
TuplesortMethod sortMethod
Definition: tuplesort.h:91
static void readtup_datum(Tuplesortstate *state, SortTuple *stup, int tapenum, unsigned int len)
Definition: tuplesort.c:4491
void ExecDropSingleTupleTableSlot(TupleTableSlot *slot)
Definition: execTuples.c:1224
MinimalTupleData * MinimalTuple
Definition: htup.h:27
IndexTupleData * IndexTuple
Definition: itup.h:53
#define COMPARETUP(state, a, b)
Definition: tuplesort.c:541
int errdetail(const char *fmt,...)
Definition: elog.c:954
size_t LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum, size_t size)
Definition: logtape.c:1137
#define MINORDER
Definition: tuplesort.c:229
union SlabSlot SlabSlot
IndexInfo * indexInfo
Definition: tuplesort.c:451
static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple)
Definition: tuplesort.c:3564
const char * pg_rusage_show(const PGRUsage *ru0)
Definition: pg_rusage.c:40
#define LEADER(state)
Definition: tuplesort.c:550
#define FREEMEM(state, amt)
Definition: tuplesort.c:547
#define RelationGetRelationName(relation)
Definition: rel.h:490
Datum(* abbrev_converter)(Datum original, SortSupport ssup)
Definition: sortsupport.h:172
static bool tuplesort_gettuple_common(Tuplesortstate *state, bool forward, SortTuple *stup)
Definition: tuplesort.c:2132
unsigned int uint32
Definition: c.h:375
LogicalTapeSet * tapeset
Definition: tuplesort.c:264
Oid t_tableOid
Definition: htup.h:66
void index_deform_tuple(IndexTuple tup, TupleDesc tupleDescriptor, Datum *values, bool *isnull)
Definition: indextuple.c:434
void LogicalTapeTell(LogicalTapeSet *lts, int tapenum, long *blocknum, int *offset)
Definition: logtape.c:1245
MemoryContext CurrentMemoryContext
Definition: mcxt.c:38
#define WORKER(state)
Definition: tuplesort.c:549
int64 allowedMem
Definition: tuplesort.c:251
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:475
int workersFinished
Definition: tuplesort.c:503
void PrepareSortSupportFromIndexRel(Relation indexRel, int16 strategy, SortSupport ssup)
Definition: sortsupport.c:162
int nTapes
Definition: tuplesort.c:509
static void selectnewtape(Tuplesortstate *state)
Definition: tuplesort.c:2724
static void tuplesort_free(Tuplesortstate *state)
Definition: tuplesort.c:1372
#define AssertArg(condition)
Definition: c.h:748
Datum datumCopy(Datum value, bool typByVal, int typLen)
Definition: datum.c:131
#define COPYTUP(state, stup, tup)
Definition: tuplesort.c:542
EState * CreateExecutorState(void)
Definition: execUtils.c:89
static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup)
Definition: tuplesort.c:3798
#define heap_getattr(tup, attnum, tupleDesc, isnull)
Definition: htup_details.h:762
#define READTUP(state, stup, tape, len)
Definition: tuplesort.c:544
#define SK_BT_NULLS_FIRST
Definition: nbtree.h:957
#define WRITETUP(state, tape, stup)
Definition: tuplesort.c:543
#define SpinLockRelease(lock)
Definition: spin.h:64
Size mul_size(Size s1, Size s2)
Definition: shmem.c:515
static void writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup)
Definition: tuplesort.c:4450
#define RELEASE_SLAB_SLOT(state, tuple)
Definition: tuplesort.c:529
void * palloc0(Size size)
Definition: mcxt.c:981
Relation indexRel
Definition: tuplesort.c:459
char * slabMemoryBegin
Definition: tuplesort.c:345
uintptr_t Datum
Definition: postgres.h:367
Size add_size(Size s1, Size s2)
Definition: shmem.c:498
AttrNumber ssup_attno
Definition: sortsupport.h:81
static void puttuple_common(Tuplesortstate *state, SortTuple *tuple)
Definition: tuplesort.c:1868
void(* writetup)(Tuplesortstate *state, int tapenum, SortTuple *stup)
Definition: tuplesort.c:293
static void tuplesort_heap_delete_top(Tuplesortstate *state)
Definition: tuplesort.c:3599
#define InvalidOid
Definition: postgres_ext.h:36
void tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, ItemPointer self, Datum *values, bool *isnull)
Definition: tuplesort.c:1708
#define ereport(elevel,...)
Definition: elog.h:144
static bool mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup)
Definition: tuplesort.c:3130
#define Max(x, y)
Definition: c.h:922
void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg)
Definition: tuplesort.c:4582
int sk_flags
Definition: skey.h:66
List * ii_Expressions
Definition: execnodes.h:160
#define Assert(condition)
Definition: c.h:746
#define SK_BT_DESC
Definition: nbtree.h:956
Definition: regguts.h:298
bool enforceUnique
Definition: tuplesort.c:462
int srctape
Definition: tuplesort.c:182
TuplesortSpaceType
Definition: tuplesort.h:83
bool(* abbrev_abort)(int memtupcount, SortSupport ssup)
Definition: sortsupport.h:182
long markpos_block
Definition: tuplesort.c:401
struct ItemPointerData ItemPointerData
#define INDEX_MAX_KEYS
size_t Size
Definition: c.h:474
bool eof_reached
Definition: tuplesort.c:398
TupSortStatus
Definition: tuplesort.c:208
int tuplesort_merge_order(int64 allowedMem)
Definition: tuplesort.c:2583
void get_typlenbyval(Oid typid, int16 *typlen, bool *typbyval)
Definition: lsyscache.c:2139
TupleTableSlot * ecxt_scantuple
Definition: execnodes.h:225
#define MAXALIGN(LEN)
Definition: c.h:699
#define INITIAL_MEMTUPSIZE
Definition: tuplesort.c:135
#define index_getattr(tup, attnum, tupleDesc, isnull)
Definition: itup.h:100
static void tuplesort_sort_memtuples(Tuplesortstate *state)
Definition: tuplesort.c:3536
#define ItemPointerGetOffsetNumber(pointer)
Definition: itemptr.h:117
ScanKeyData scankeys[INDEX_MAX_KEYS]
Definition: nbtree.h:666
Size tuplesort_estimate_shared(int nWorkers)
Definition: tuplesort.c:4538
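On the leader side, the shared state is sized with tuplesort_estimate_shared() and initialized before workers are launched. A sketch under the assumption that the caller already has a ParallelContext (pcxt) and manages its own table-of-contents key; MY_KEY_TUPLESORT is a hypothetical key, not one defined by this file:
/* Leader: reserve and initialize space for nWorkers participants. */
Size estsort = tuplesort_estimate_shared(nWorkers);

shm_toc_estimate_chunk(&pcxt->estimator, estsort);
shm_toc_estimate_keys(&pcxt->estimator, 1);

/* ... after the parallel DSM segment and its shm_toc are created ... */
sharedsort = shm_toc_allocate(pcxt->toc, estsort);
tuplesort_initialize_shared(sharedsort, nWorkers, pcxt->seg);
shm_toc_insert(pcxt->toc, MY_KEY_TUPLESORT, sharedsort);   /* hypothetical key */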
uint32 max_buckets
Definition: tuplesort.c:467
#define MINIMAL_TUPLE_OFFSET
Definition: htup_details.h:619
#define INT64_FORMAT
Definition: c.h:417
void LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size)
Definition: logtape.c:863
int * tp_dummy
Definition: tuplesort.c:387
static void readtup_heap(Tuplesortstate *state, SortTuple *stup, int tapenum, unsigned int len)
Definition: tuplesort.c:3903
bool slabAllocatorUsed
Definition: tuplesort.c:343
static void leader_takeover_tapes(Tuplesortstate *state)
Definition: tuplesort.c:4689
#define DatumGetPointer(X)
Definition: postgres.h:549
void LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum, TapeShare *share)
Definition: logtape.c:1034
static Datum values[MAXATTR]
Definition: bootstrap.c:165
static Tuplesortstate * tuplesort_begin_common(int workMem, SortCoordinate coordinate, bool randomAccess)
Definition: tuplesort.c:702
static void * readtup_alloc(Tuplesortstate *state, Size tuplen)
Definition: tuplesort.c:3708
MemoryContext tuplecontext
Definition: tuplesort.c:263
void * repalloc_huge(void *pointer, Size size)
Definition: mcxt.c:1140
void * palloc(Size size)
Definition: mcxt.c:950
int errmsg(const char *fmt,...)
Definition: elog.c:821
void SharedFileSetAttach(SharedFileSet *fileset, dsm_segment *seg)
int * tp_tapenum
Definition: tuplesort.c:388
#define USEMEM(state, amt)
Definition: tuplesort.c:546
bool markpos_eof
Definition: tuplesort.c:403
#define HEAPTUPLESIZE
Definition: htup.h:73
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:797
Oid sk_collation
Definition: skey.h:70
#define elog(elevel,...)
Definition: elog.h:214
int i
int currentWorker
Definition: tuplesort.c:502
uint32 low_mask
Definition: tuplesort.c:466
TapeShare tapes[FLEXIBLE_ARRAY_MEMBER]
Definition: tuplesort.c:515
void tuplesort_markpos(Tuplesortstate *state)
Definition: tuplesort.c:3293
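tuplesort_markpos() and its counterpart tuplesort_restorepos() are only usable on sorts created with randomAccess = true. A hedged sketch of the mark/restore pattern, with illustrative variable names:
/* Remember the current read position, scan ahead, then rewind to the mark. */
tuplesort_markpos(state);
while (tuplesort_gettupleslot(state, true, false, slot, NULL))
{
    /* ... peek at upcoming tuples ... */
}
tuplesort_restorepos(state);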
const TupleTableSlotOps TTSOpsHeapTuple
Definition: execTuples.c:84
void tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
Definition: tuplesort.c:1687
void LogicalTapeSetClose(LogicalTapeSet *lts)
Definition: logtape.c:737
void LogicalTapeSeek(LogicalTapeSet *lts, int tapenum, long blocknum, int offset)
Definition: logtape.c:1211
TuplesortSpaceType spaceType
Definition: tuplesort.h:92
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:99
SortSupport onlyKey
Definition: tuplesort.c:436
Tuplesortstate * tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, int workMem, SortCoordinate coordinate, bool randomAccess)
Definition: tuplesort.c:1047
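The btree-index flavour of the API is consumed roughly as follows during an index build; this is a sketch only, assuming the caller drives its own heap scan and fills values[]/isnull[] (typically via FormIndexDatum()), whereas nbtsort.c does this through the table-AM build callbacks:
Tuplesortstate *sortstate;
IndexTuple      itup;

sortstate = tuplesort_begin_index_btree(heapRel, indexRel,
                                        indexRel->rd_index->indisunique,
                                        maintenance_work_mem,
                                        NULL,      /* no parallel coordination */
                                        false);    /* no random access needed */

/* For each heap row: */
tuplesort_putindextuplevalues(sortstate, indexRel, &heapTuple->t_self,
                              values, isnull);

tuplesort_performsort(sortstate);

while ((itup = tuplesort_getindextuple(sortstate, true)) != NULL)
{
    /* ... append itup to the new index ... */
}

tuplesort_end(sortstate);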
void LogicalTapeSetForgetFreeSpace(LogicalTapeSet *lts)
Definition: logtape.c:764
char * BuildIndexValueDescription(Relation indexRelation, Datum *values, bool *isnull)
Definition: genam.c:177
#define ItemPointerGetBlockNumber(pointer)
Definition: itemptr.h:98
SlabSlot * slabFreeHead
Definition: tuplesort.c:347
long LogicalTapeSetBlocks(LogicalTapeSet *lts)
Definition: logtape.c:1272
AttrNumber ii_IndexAttrNumbers[INDEX_MAX_KEYS]
Definition: execnodes.h:159
void tuplesort_end(Tuplesortstate *state)
Definition: tuplesort.c:1445
#define BTLessStrategyNumber
Definition: stratnum.h:29
static void worker_nomergeruns(Tuplesortstate *state)
Definition: tuplesort.c:4668
static int ApplySortComparator(Datum datum1, bool isNull1, Datum datum2, bool isNull2, SortSupport ssup)
Definition: sortsupport.h:200
Size datumGetSize(Datum value, bool typByVal, int typLen)
Definition: datum.c:64
bool tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward)
Definition: tuplesort.c:2515
static void make_bounded_heap(Tuplesortstate *state)
Definition: tuplesort.c:3447
static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple)
Definition: tuplesort.c:3623
int16 AttrNumber
Definition: attnum.h:21
#define MAXORDER
Definition: tuplesort.c:230
static void beginmerge(Tuplesortstate *state)
Definition: tuplesort.c:3082
#define LACKMEM(state)
Definition: tuplesort.c:545
static void inittapestate(Tuplesortstate *state, int maxTapes)
Definition: tuplesort.c:2680
SharedFileSet fileset
Definition: tuplesort.c:506
long val
Definition: informix.c:664
TupleTableSlot * ExecStoreHeapTuple(HeapTuple tuple, TupleTableSlot *slot, bool shouldFree)
Definition: execTuples.c:1322
#define offsetof(type, field)
Definition: c.h:669
bool * mergeactive
Definition: tuplesort.c:376
AttrNumber sk_attno
Definition: skey.h:67
TupleDesc tupDesc
Definition: tuplesort.c:429
#define IndexTupleSize(itup)
Definition: itup.h:71
void tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot)
Definition: tuplesort.c:1665
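For plain heap-tuple sorts the slot-based entry points are used instead. A minimal sketch, assuming a TupleDesc and a single sort key are already at hand (attnum, sortop, collation, nullsFirst, slot and outslot are placeholders supplied by the caller):
Tuplesortstate *sortstate;

sortstate = tuplesort_begin_heap(tupDesc,
                                 1, &attnum, &sortop, &collation, &nullsFirst,
                                 work_mem,
                                 NULL,     /* no parallel coordination */
                                 false);   /* forward scan only */

/* Feed every input slot ... */
tuplesort_puttupleslot(sortstate, slot);

tuplesort_performsort(sortstate);

/* ... then read them back in sorted order. */
while (tuplesort_gettupleslot(sortstate, true, false, outslot, NULL))
{
    /* ... process outslot ... */
}

tuplesort_end(sortstate);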
SortTuple * memtuples
Definition: tuplesort.c:311
static int ApplySortAbbrevFullComparator(Datum datum1, bool isNull1, Datum datum2, bool isNull2, SortSupport ssup)
Definition: sortsupport.h:238
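ApplySortComparator() and ApplySortAbbrevFullComparator() are typically paired when abbreviated keys are in use: datum1 caches the (possibly abbreviated) first key, and ties must be rechecked against the authoritative values. A hedged sketch of that pattern, with placeholder tuple variables; the comparetup_* routines in this file follow the same shape:
/* First pass: compare the cached, possibly abbreviated, datum1 keys. */
compare = ApplySortComparator(a->datum1, a->isnull1,
                              b->datum1, b->isnull1, sortKey);
if (compare != 0)
    return compare;

/* Abbreviation tie: re-fetch the full values and compare authoritatively. */
if (sortKey->abbrev_converter)
{
    datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1);
    datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2);
    compare = ApplySortAbbrevFullComparator(datum1, isnull1,
                                            datum2, isnull2, sortKey);
}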