tuplesort.c
1 /*-------------------------------------------------------------------------
2  *
3  * tuplesort.c
4  * Generalized tuple sorting routines.
5  *
6  * This module handles sorting of heap tuples, index tuples, or single
7  * Datums (and could easily support other kinds of sortable objects,
8  * if necessary). It works efficiently for both small and large amounts
9  * of data. Small amounts are sorted in-memory using qsort(). Large
10  * amounts are sorted using temporary files and a standard external sort
11  * algorithm.
12  *
13  * See Knuth, volume 3, for more than you want to know about external
14  * sorting algorithms. The algorithm we use is a balanced k-way merge.
15  * Before PostgreSQL 15, we used the polyphase merge algorithm (Knuth's
16  * Algorithm 5.4.2D), but with modern hardware, a straightforward balanced
17  * merge is better. Knuth is assuming that tape drives are expensive
18  * beasts, and in particular that there will always be many more runs than
19  * tape drives. The polyphase merge algorithm was good at keeping all the
20  * tape drives busy, but in our implementation a "tape drive" doesn't cost
21  * much more than a few Kb of memory buffers, so we can afford to have
22  * lots of them. In particular, if we can have as many tape drives as
23  * sorted runs, we can eliminate any repeated I/O at all.
24  *
25  * Historically, we divided the input into sorted runs using replacement
26  * selection, in the form of a priority tree implemented as a heap
27  * (essentially Knuth's Algorithm 5.2.3H), but now we always use quicksort
28  * for run generation.
29  *
30  * The approximate amount of memory allowed for any one sort operation
31  * is specified in kilobytes by the caller (most pass work_mem). Initially,
32  * we absorb tuples and simply store them in an unsorted array as long as
33  * we haven't exceeded workMem. If we reach the end of the input without
34  * exceeding workMem, we sort the array using qsort() and subsequently return
35  * tuples just by scanning the tuple array sequentially. If we do exceed
36  * workMem, we begin to emit tuples into sorted runs in temporary tapes.
37  * When tuples are dumped in batch after quicksorting, we begin a new run
38  * with a new output tape. If we reach the max number of tapes, we write
39  * subsequent runs on the existing tapes in a round-robin fashion. We will
40  * need multiple merge passes to finish the merge in that case. After the
41  * end of the input is reached, we dump out remaining tuples in memory into
42  * a final run, then merge the runs.
43  *
44  * When merging runs, we use a heap containing just the frontmost tuple from
45  * each source run; we repeatedly output the smallest tuple and replace it
46  * with the next tuple from its source tape (if any). When the heap empties,
47  * the merge is complete. The basic merge algorithm thus needs very little
48  * memory --- only M tuples for an M-way merge, and M is constrained to a
49  * small number. However, we can still make good use of our full workMem
50  * allocation by pre-reading additional blocks from each source tape. Without
51  * prereading, our access pattern to the temporary file would be very erratic;
52  * on average we'd read one block from each of M source tapes during the same
53  * time that we're writing M blocks to the output tape, so there is no
54  * sequentiality of access at all, defeating the read-ahead methods used by
55  * most Unix kernels. Worse, the output tape gets written into a very random
56  * sequence of blocks of the temp file, ensuring that things will be even
57  * worse when it comes time to read that tape. A straightforward merge pass
58  * thus ends up doing a lot of waiting for disk seeks. We can improve matters
59  * by prereading from each source tape sequentially, loading about workMem/M
60  * bytes from each tape in turn, and making the sequential blocks immediately
61  * available for reuse. This approach helps to localize both read and write
62  * accesses. The pre-reading is handled by logtape.c; we just tell it how
63  * much memory to use for the buffers. (See the merge-loop sketch below.)
64  *
65  * In the current code we determine the number of input tapes M on the basis
66  * of workMem: we want workMem/M to be large enough that we read a fair
67  * amount of data each time we read from a tape, so as to maintain the
68  * locality of access described above. Nonetheless, with large workMem we
69  * can have many tapes. The logical "tapes" are implemented by logtape.c,
70  * which avoids space wastage by recycling disk space as soon as each block
71  * is read from its "tape".
72  *
73  * When the caller requests random access to the sort result, we form
74  * the final sorted run on a logical tape which is then "frozen", so
75  * that we can access it randomly. When the caller does not need random
76  * access, we return from tuplesort_performsort() as soon as we are down
77  * to one run per logical tape. The final merge is then performed
78  * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this
79  * saves one cycle of writing all the data out to disk and reading it in.
80  *
81  * This module supports parallel sorting. Parallel sorts involve coordination
82  * among one or more worker processes, and a leader process, each with its own
83  * tuplesort state. The leader process (or, more accurately, the
84  * Tuplesortstate associated with a leader process) creates a full tapeset
85  * consisting of worker tapes with one run to merge; a run for every
86  * worker process. This is then merged. Worker processes are guaranteed to
87  * produce exactly one output run from their partial input.
88  *
89  *
90  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
91  * Portions Copyright (c) 1994, Regents of the University of California
92  *
93  * IDENTIFICATION
94  * src/backend/utils/sort/tuplesort.c
95  *
96  *-------------------------------------------------------------------------
97  */
98 
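/*
 * Editor's illustrative sketch (guarded out; not part of tuplesort.c proper):
 * the merge loop described above, reduced to merging K pre-sorted int arrays
 * with a binary min-heap holding just the frontmost element of each source.
 * All names here are invented for the example; the real code keeps SortTuples
 * in state->memtuples and refills from logical tapes instead of arrays.
 */
#ifdef TUPLESORT_MERGE_SKETCH
#include <stdio.h>

typedef struct
{
	int			value;			/* frontmost value from this source */
	int			src;			/* index of the source run it came from */
} HeapEntry;

static void
sift_down(HeapEntry *heap, int n, int i)
{
	for (;;)
	{
		int			l = 2 * i + 1;
		int			r = 2 * i + 2;
		int			s = i;
		HeapEntry	tmp;

		if (l < n && heap[l].value < heap[s].value)
			s = l;
		if (r < n && heap[r].value < heap[s].value)
			s = r;
		if (s == i)
			break;
		tmp = heap[i];
		heap[i] = heap[s];
		heap[s] = tmp;
		i = s;
	}
}

/* Merge up to 8 sorted runs; analogous to the final merge described above. */
static void
merge_runs(int *runs[], int runlen[], int nruns)
{
	HeapEntry	heap[8];
	int			pos[8] = {0};
	int			n = 0;
	int			i;

	for (i = 0; i < nruns; i++)	/* load the frontmost value of each run */
		if (runlen[i] > 0)
		{
			heap[n].value = runs[i][pos[i]++];
			heap[n].src = i;
			n++;
		}
	for (i = n / 2 - 1; i >= 0; i--)	/* heapify */
		sift_down(heap, n, i);

	while (n > 0)
	{
		int			src = heap[0].src;

		printf("%d\n", heap[0].value);	/* emit the smallest remaining value */
		if (pos[src] < runlen[src])
			heap[0].value = runs[src][pos[src]++];	/* next from the same run */
		else
			heap[0] = heap[--n];	/* run exhausted: shrink the heap */
		sift_down(heap, n, 0);
	}
}
#endif							/* TUPLESORT_MERGE_SKETCH */
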
99 #include "postgres.h"
100 
101 #include <limits.h>
102 
103 #include "access/hash.h"
104 #include "access/htup_details.h"
105 #include "access/nbtree.h"
106 #include "catalog/index.h"
107 #include "catalog/pg_am.h"
108 #include "commands/tablespace.h"
109 #include "executor/executor.h"
110 #include "miscadmin.h"
111 #include "pg_trace.h"
112 #include "utils/datum.h"
113 #include "utils/logtape.h"
114 #include "utils/lsyscache.h"
115 #include "utils/memutils.h"
116 #include "utils/pg_rusage.h"
117 #include "utils/rel.h"
118 #include "utils/sortsupport.h"
119 #include "utils/tuplesort.h"
120 
121 
122 /* sort-type codes for sort__start probes */
123 #define HEAP_SORT 0
124 #define INDEX_SORT 1
125 #define DATUM_SORT 2
126 #define CLUSTER_SORT 3
127 
128 /* Sort parallel code from state for sort__start probes */
129 #define PARALLEL_SORT(state) ((state)->shared == NULL ? 0 : \
130  (state)->worker >= 0 ? 1 : 2)
131 
132 /*
133  * Initial size of memtuples array. We're trying to select this size so that
134  * the array doesn't exceed ALLOCSET_SEPARATE_THRESHOLD and so that allocation
135  * overhead might possibly be lowered. However, we don't consider array sizes
136  * of less than 1024.
137  *
138  */
139 #define INITIAL_MEMTUPSIZE Max(1024, \
140  ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1)
141 
142 /* GUC variables */
143 #ifdef TRACE_SORT
144 bool trace_sort = false;
145 #endif
146 
147 #ifdef DEBUG_BOUNDED_SORT
148 bool optimize_bounded_sort = true;
149 #endif
150 
151 
152 /*
153  * The objects we actually sort are SortTuple structs. These contain
154  * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple),
155  * which is a separate palloc chunk --- we assume it is just one chunk and
156  * can be freed by a simple pfree() (except during merge, when we use a
157  * simple slab allocator). SortTuples also contain the tuple's first key
158  * column in Datum/nullflag format, and a source/input tape number that
159  * tracks which tape each heap element/slot belongs to during merging.
160  *
161  * Storing the first key column lets us save heap_getattr or index_getattr
162  * calls during tuple comparisons. We could extract and save all the key
163  * columns not just the first, but this would increase code complexity and
164  * overhead, and wouldn't actually save any comparison cycles in the common
165  * case where the first key determines the comparison result. Note that
166  * for a pass-by-reference datatype, datum1 points into the "tuple" storage.
167  *
168  * There is one special case: when the sort support infrastructure provides an
169  * "abbreviated key" representation, where the key is (typically) a pass by
170  * value proxy for a pass by reference type. In this case, the abbreviated key
171  * is stored in datum1 in place of the actual first key column.
172  *
173  * When sorting single Datums, the data value is represented directly by
174  * datum1/isnull1 for pass by value types (or null values). If the datatype is
175  * pass-by-reference and isnull1 is false, then "tuple" points to a separately
176  * palloc'd data value, otherwise "tuple" is NULL. The value of datum1 is then
177  * either the same pointer as "tuple", or is an abbreviated key value as
178  * described above. Accordingly, "tuple" is always used in preference to
179  * datum1 as the authoritative value for pass-by-reference cases.
180  */
181 typedef struct
182 {
183  void *tuple; /* the tuple itself */
184  Datum datum1; /* value of first key column */
185  bool isnull1; /* is first key column NULL? */
186  int srctape; /* source tape number */
187 } SortTuple;
188 
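/*
 * Editor's illustrative sketch (guarded out): how a single-Datum sort fills
 * a SortTuple following the rules above.  This mirrors what the Datum-case
 * routines do, but the helper itself is invented for the example and assumes
 * the caller has already made any palloc'd copy for pass-by-reference types.
 */
#ifdef TUPLESORT_SORTTUPLE_SKETCH
static void
example_fill_datum_sorttuple(SortTuple *stup, Datum val, bool isnull, bool typbyval)
{
	stup->isnull1 = isnull;
	stup->srctape = 0;			/* only meaningful during merging */

	if (isnull || typbyval)
	{
		/* the value (or NULL) lives entirely in datum1; no separate allocation */
		stup->datum1 = val;
		stup->tuple = NULL;
	}
	else
	{
		/* pass-by-reference, not null: "tuple" points at the palloc'd copy */
		stup->tuple = DatumGetPointer(val);
		stup->datum1 = PointerGetDatum(stup->tuple);	/* or an abbreviated key */
	}
}
#endif
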
189 /*
190  * During merge, we use a pre-allocated set of fixed-size slots to hold
191  * tuples, to avoid palloc/pfree overhead.
192  *
193  * Merge doesn't require a lot of memory, so we can afford to waste some,
194  * by using gratuitously-sized slots. If a tuple is larger than 1 kB, the
195  * palloc() overhead is not significant anymore.
196  *
197  * 'nextfree' is valid when this chunk is in the free list. When in use, the
198  * slot holds a tuple.
199  */
200 #define SLAB_SLOT_SIZE 1024
201 
202 typedef union SlabSlot
203 {
204  union SlabSlot *nextfree;
205  char buffer[SLAB_SLOT_SIZE];
206 } SlabSlot;
207 
208 /*
209  * Possible states of a Tuplesort object. These denote the states that
210  * persist between calls of Tuplesort routines.
211  */
212 typedef enum
213 {
214  TSS_INITIAL, /* Loading tuples; still within memory limit */
215  TSS_BOUNDED, /* Loading tuples into bounded-size heap */
216  TSS_BUILDRUNS, /* Loading tuples; writing to tape */
217  TSS_SORTEDINMEM, /* Sort completed entirely in memory */
218  TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */
219  TSS_FINALMERGE /* Performing final merge on-the-fly */
220 } TupSortStatus;
221 
222 /*
223  * Parameters for calculation of number of tapes to use --- see inittapes()
224  * and tuplesort_merge_order().
225  *
226  * In this calculation we assume that each tape will cost us about one block's
227  * worth of buffer space. This ignores the overhead of all the other data
228  * structures needed for each tape, but it's probably close enough.
229  *
230  * MERGE_BUFFER_SIZE is how much buffer space we'd like to allocate for each
231  * input tape, for pre-reading (see discussion at top of file). This is *in
232  * addition to* the 1 block already included in TAPE_BUFFER_OVERHEAD.
233  */
234 #define MINORDER 6 /* minimum merge order */
235 #define MAXORDER 500 /* maximum merge order */
236 #define TAPE_BUFFER_OVERHEAD BLCKSZ
237 #define MERGE_BUFFER_SIZE (BLCKSZ * 32)
238 
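/*
 * Editor's illustrative sketch (guarded out): the kind of arithmetic these
 * constants feed into.  tuplesort_merge_order() derives the merge order from
 * allowedMem; its exact formula may differ from this simplified version,
 * which just budgets MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD per input tape
 * and clamps the result to [MINORDER, MAXORDER].
 */
#ifdef TUPLESORT_MERGE_ORDER_SKETCH
static int
example_merge_order(int64 allowedMem)
{
	int64		perTape = MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD;
	int64		order = allowedMem / perTape;

	/* e.g. BLCKSZ = 8192, allowedMem = 4MB: 4194304 / (262144 + 8192) = 15 */
	if (order < MINORDER)
		order = MINORDER;
	if (order > MAXORDER)
		order = MAXORDER;
	return (int) order;
}
#endif
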
239 typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b,
240  Tuplesortstate *state);
241 
242 /*
243  * Private state of a Tuplesort operation.
244  */
245 struct Tuplesortstate
246 {
247  TupSortStatus status; /* enumerated value as shown above */
248  int nKeys; /* number of columns in sort key */
249  int sortopt; /* Bitmask of flags used to setup sort */
250  bool bounded; /* did caller specify a maximum number of
251  * tuples to return? */
252  bool boundUsed; /* true if we made use of a bounded heap */
253  int bound; /* if bounded, the maximum number of tuples */
254  bool tuples; /* Can SortTuple.tuple ever be set? */
255  int64 availMem; /* remaining memory available, in bytes */
256  int64 allowedMem; /* total memory allowed, in bytes */
257  int maxTapes; /* max number of input tapes to merge in each
258  * pass */
259  int64 maxSpace; /* maximum amount of space occupied among sort
260  * of groups, either in-memory or on-disk */
261  bool isMaxSpaceDisk; /* true when maxSpace is value for on-disk
262  * space, false when it's value for in-memory
263  * space */
264  TupSortStatus maxSpaceStatus; /* sort status when maxSpace was reached */
265  MemoryContext maincontext; /* memory context for tuple sort metadata that
266  * persists across multiple batches */
267  MemoryContext sortcontext; /* memory context holding most sort data */
268  MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */
269  LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */
270 
271  /*
272  * These function pointers decouple the routines that must know what kind
273  * of tuple we are sorting from the routines that don't need to know it.
274  * They are set up by the tuplesort_begin_xxx routines.
275  *
276  * Function to compare two tuples; result is per qsort() convention, ie:
277  * <0, 0, >0 according as a<b, a=b, a>b. The API must match
278  * qsort_arg_comparator.
279  */
280  SortTupleComparator comparetup;
281 
282  /*
283  * Function to copy a supplied input tuple into palloc'd space and set up
284  * its SortTuple representation (ie, set tuple/datum1/isnull1). Also,
285  * state->availMem must be decreased by the amount of space used for the
286  * tuple copy (note the SortTuple struct itself is not counted).
287  */
288  void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup);
289 
290  /*
291  * Function to write a stored tuple onto tape. The representation of the
292  * tuple on tape need not be the same as it is in memory; requirements on
293  * the tape representation are given below. Unless the slab allocator is
294  * used, after writing the tuple, pfree() the out-of-line data (not the
295  * SortTuple struct!), and increase state->availMem by the amount of
296  * memory space thereby released.
297  */
298  void (*writetup) (Tuplesortstate *state, LogicalTape *tape,
299  SortTuple *stup);
300 
301  /*
302  * Function to read a stored tuple from tape back into memory. 'len' is
303  * the already-read length of the stored tuple. The tuple is allocated
304  * from the slab memory arena, or is palloc'd, see readtup_alloc().
305  */
306  void (*readtup) (Tuplesortstate *state, SortTuple *stup,
307  LogicalTape *tape, unsigned int len);
308 
309  /*
310  * Whether SortTuple's datum1 and isnull1 members are maintained by the
311  * above routines. If not, some sort specializations are disabled.
312  */
313  bool haveDatum1;
314 
315  /*
316  * This array holds the tuples now in sort memory. If we are in state
317  * INITIAL, the tuples are in no particular order; if we are in state
318  * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS
319  * and FINALMERGE, the tuples are organized in "heap" order per Algorithm
320  * H. In state SORTEDONTAPE, the array is not used.
321  */
322  SortTuple *memtuples; /* array of SortTuple structs */
323  int memtupcount; /* number of tuples currently present */
324  int memtupsize; /* allocated length of memtuples array */
325  bool growmemtuples; /* memtuples' growth still underway? */
326 
327  /*
328  * Memory for tuples is sometimes allocated using a simple slab allocator,
329  * rather than with palloc(). Currently, we switch to slab allocation
330  * when we start merging. Merging only needs to keep a small, fixed
331  * number of tuples in memory at any time, so we can avoid the
332  * palloc/pfree overhead by recycling a fixed number of fixed-size slots
333  * to hold the tuples.
334  *
335  * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE
336  * slots. The allocation is sized to have one slot per tape, plus one
337  * additional slot. We need that many slots to hold all the tuples kept
338  * in the heap during merge, plus the one we have last returned from the
339  * sort, with tuplesort_gettuple.
340  *
341  * Initially, all the slots are kept in a linked list of free slots. When
342  * a tuple is read from a tape, it is put to the next available slot, if
343  * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd
344  * instead.
345  *
346  * When we're done processing a tuple, we return the slot back to the free
347  * list, or pfree() if it was palloc'd. We know that a tuple was
348  * allocated from the slab, if its pointer value is between
349  * slabMemoryBegin and -End.
350  *
351  * When the slab allocator is used, the USEMEM/LACKMEM mechanism of
352  * tracking memory usage is not used.
353  */
354  bool slabAllocatorUsed;
355 
356  char *slabMemoryBegin; /* beginning of slab memory arena */
357  char *slabMemoryEnd; /* end of slab memory arena */
358  SlabSlot *slabFreeHead; /* head of free list */
359 
360  /* Memory used for input and output tape buffers. */
362 
363  /*
364  * When we return a tuple to the caller in tuplesort_gettuple_XXX, that
365  * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE
366  * modes), we remember the tuple in 'lastReturnedTuple', so that we can
367  * recycle the memory on next gettuple call.
368  */
369  void *lastReturnedTuple;
370 
371  /*
372  * While building initial runs, this is the current output run number.
373  * Afterwards, it is the number of initial runs we made.
374  */
375  int currentRun;
376 
377  /*
378  * Logical tapes, for merging.
379  *
380  * The initial runs are written in the output tapes. In each merge pass,
381  * the output tapes of the previous pass become the input tapes, and new
382  * output tapes are created as needed. When nInputTapes equals
383  * nInputRuns, there is only one merge pass left.
384  */
385  LogicalTape **inputTapes;
386  int nInputTapes;
387  int nInputRuns;
388 
389  LogicalTape **outputTapes;
390  int nOutputTapes;
391  int nOutputRuns;
392 
393  LogicalTape *destTape; /* current output tape */
394 
395  /*
396  * These variables are used after completion of sorting to keep track of
397  * the next tuple to return. (In the tape case, the tape's current read
398  * position is also critical state.)
399  */
400  LogicalTape *result_tape; /* actual tape of finished output */
401  int current; /* array index (only used if SORTEDINMEM) */
402  bool eof_reached; /* reached EOF (needed for cursors) */
403 
404  /* markpos_xxx holds marked position for mark and restore */
405  long markpos_block; /* tape block# (only used if SORTEDONTAPE) */
406  int markpos_offset; /* saved "current", or offset in tape block */
407  bool markpos_eof; /* saved "eof_reached" */
408 
409  /*
410  * These variables are used during parallel sorting.
411  *
412  * worker is our worker identifier. Follows the general convention that
413  * -1 value relates to a leader tuplesort, and values >= 0 worker
414  * tuplesorts. (-1 can also be a serial tuplesort.)
415  *
416  * shared is mutable shared memory state, which is used to coordinate
417  * parallel sorts.
418  *
419  * nParticipants is the number of worker Tuplesortstates known by the
420  * leader to have actually been launched, which implies that they must
421  * finish a run that the leader needs to merge. Typically includes a
422  * worker state held by the leader process itself. Set in the leader
423  * Tuplesortstate only.
424  */
425  int worker;
426  Sharedsort *shared;
427  int nParticipants;
428 
429  /*
430  * The sortKeys variable is used by every case other than the hash index
431  * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the
432  * MinimalTuple and CLUSTER routines, though.
433  */
434  TupleDesc tupDesc;
435  SortSupport sortKeys; /* array of length nKeys */
436 
437  /*
438  * This variable is shared by the single-key MinimalTuple case and the
439  * Datum case (which both use qsort_ssup()). Otherwise, it's NULL. The
440  * presence of a value in this field is also checked by various sort
441  * specialization functions as an optimization when comparing the leading
442  * key in a tiebreak situation to determine if there are any subsequent
443  * keys to sort on.
444  */
445  SortSupport onlyKey;
446 
447  /*
448  * Additional state for managing "abbreviated key" sortsupport routines
449  * (which currently may be used by all cases except the hash index case).
450  * Tracks the intervals at which the optimization's effectiveness is
451  * tested.
452  */
453  int64 abbrevNext; /* Tuple # at which to next check
454  * applicability */
455 
456  /*
457  * These variables are specific to the CLUSTER case; they are set by
458  * tuplesort_begin_cluster.
459  */
460  IndexInfo *indexInfo; /* info about index being used for reference */
461  EState *estate; /* for evaluating index expressions */
462 
463  /*
464  * These variables are specific to the IndexTuple case; they are set by
465  * tuplesort_begin_index_xxx and used only by the IndexTuple routines.
466  */
467  Relation heapRel; /* table the index is being built on */
468  Relation indexRel; /* index being built */
469 
470  /* These are specific to the index_btree subcase: */
471  bool enforceUnique; /* complain if we find duplicate tuples */
472  bool uniqueNullsNotDistinct; /* unique constraint null treatment */
473 
474  /* These are specific to the index_hash subcase: */
475  uint32 high_mask; /* masks for sortable part of hash code */
476  uint32 low_mask;
477  uint32 max_buckets;
478 
479  /*
480  * These variables are specific to the Datum case; they are set by
481  * tuplesort_begin_datum and used only by the DatumTuple routines.
482  */
483  Oid datumType;
484  /* we need typelen in order to know how to copy the Datums. */
485  int datumTypeLen;
486 
487  /*
488  * Resource snapshot for time of sort start.
489  */
490 #ifdef TRACE_SORT
491  PGRUsage ru_start;
492 #endif
493 };
494 
495 /*
496  * Private mutable state of tuplesort-parallel-operation. This is allocated
497  * in shared memory.
498  */
499 struct Sharedsort
500 {
501  /* mutex protects all fields prior to tapes */
502  slock_t mutex;
503 
504  /*
505  * currentWorker generates ordinal identifier numbers for parallel sort
506  * workers. These start from 0, and are always gapless.
507  *
508  * Workers increment workersFinished to indicate having finished. If this
509  * is equal to state.nParticipants within the leader, leader is ready to
510  * merge worker runs.
511  */
512  int currentWorker;
513  int workersFinished;
514 
515  /* Temporary file space */
516  SharedFileSet fileset;
517 
518  /* Size of tapes flexible array */
519  int nTapes;
520 
521  /*
522  * Tapes array used by workers to report back information needed by the
523  * leader to concatenate all worker tapes into one for merging
524  */
525  TapeShare tapes[FLEXIBLE_ARRAY_MEMBER];
526 };
527 
528 /*
529  * Is the given tuple allocated from the slab memory arena?
530  */
531 #define IS_SLAB_SLOT(state, tuple) \
532  ((char *) (tuple) >= (state)->slabMemoryBegin && \
533  (char *) (tuple) < (state)->slabMemoryEnd)
534 
535 /*
536  * Return the given tuple to the slab memory free list, or free it
537  * if it was palloc'd.
538  */
539 #define RELEASE_SLAB_SLOT(state, tuple) \
540  do { \
541  SlabSlot *buf = (SlabSlot *) tuple; \
542  \
543  if (IS_SLAB_SLOT((state), buf)) \
544  { \
545  buf->nextfree = (state)->slabFreeHead; \
546  (state)->slabFreeHead = buf; \
547  } else \
548  pfree(buf); \
549  } while(0)
550 
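/*
 * Editor's illustrative sketch (guarded out) of the free-list discipline
 * described above.  readtup_alloc() is the real consumer of the slab; this
 * invented helper just shows the pop-from-free-list / fall-back-to-palloc
 * pattern that pairs with RELEASE_SLAB_SLOT().
 */
#ifdef TUPLESORT_SLAB_SKETCH
static void *
example_slab_alloc(Tuplesortstate *state, Size tuplen)
{
	SlabSlot   *buf;

	/* oversized tuples are palloc'd instead of using a slab slot */
	if (tuplen > SLAB_SLOT_SIZE || state->slabFreeHead == NULL)
		return MemoryContextAlloc(state->sortcontext, tuplen);

	buf = state->slabFreeHead;
	state->slabFreeHead = buf->nextfree;	/* pop the head of the free list */
	return buf;
}
#endif
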
551 #define COMPARETUP(state,a,b) ((*(state)->comparetup) (a, b, state))
552 #define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup))
553 #define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup))
554 #define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len))
555 #define LACKMEM(state) ((state)->availMem < 0 && !(state)->slabAllocatorUsed)
556 #define USEMEM(state,amt) ((state)->availMem -= (amt))
557 #define FREEMEM(state,amt) ((state)->availMem += (amt))
558 #define SERIAL(state) ((state)->shared == NULL)
559 #define WORKER(state) ((state)->shared && (state)->worker != -1)
560 #define LEADER(state) ((state)->shared && (state)->worker == -1)
561 
562 /*
563  * NOTES about on-tape representation of tuples:
564  *
565  * We require the first "unsigned int" of a stored tuple to be the total size
566  * on-tape of the tuple, including itself (so it is never zero; an all-zero
567  * unsigned int is used to delimit runs). The remainder of the stored tuple
568  * may or may not match the in-memory representation of the tuple ---
569  * any conversion needed is the job of the writetup and readtup routines.
570  *
571  * If state->sortopt contains TUPLESORT_RANDOMACCESS, then the stored
572  * representation of the tuple must be followed by another "unsigned int" that
573  * is a copy of the length --- so the total tape space used is actually
574  * sizeof(unsigned int) more than the stored length value. This allows
575  * read-backwards. When the random access flag was not specified, the
576  * write/read routines may omit the extra length word.
577  *
578  * writetup is expected to write both length words as well as the tuple
579  * data. When readtup is called, the tape is positioned just after the
580  * front length word; readtup must read the tuple data and advance past
581  * the back length word (if present).
582  *
583  * The write/read routines can make use of the tuple description data
584  * stored in the Tuplesortstate record, if needed. They are also expected
585  * to adjust state->availMem by the amount of memory space (not tape space!)
586  * released or consumed. There is no error return from either writetup
587  * or readtup; they should ereport() on failure.
588  *
589  *
590  * NOTES about memory consumption calculations:
591  *
592  * We count space allocated for tuples against the workMem limit, plus
593  * the space used by the variable-size memtuples array. Fixed-size space
594  * is not counted; it's small enough to not be interesting.
595  *
596  * Note that we count actual space used (as shown by GetMemoryChunkSpace)
597  * rather than the originally-requested size. This is important since
598  * palloc can add substantial overhead. It's not a complete answer since
599  * we won't count any wasted space in palloc allocation blocks, but it's
600  * a lot better than what we were doing before 7.3. As of 9.6, a
601  * separate memory context is used for caller passed tuples. Resetting
602  * it at certain key increments significantly ameliorates fragmentation.
603  * Note that this places a responsibility on copytup routines to use the
604  * correct memory context for these tuples (and to not use the reset
605  * context for anything whose lifetime needs to span multiple external
606  * sort runs). readtup routines use the slab allocator (they cannot use
607  * the reset context because it gets deleted at the point that merging
608  * begins).
609  */
610 
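/*
 * Editor's illustrative sketch (guarded out) of the on-tape layout and
 * accounting rules above: an invented writetup-style routine for an opaque
 * payload.  The real writetup_heap()/writetup_datum() etc. follow the same
 * shape but write tuple-specific representations; the LogicalTapeWrite()
 * call signature used here is assumed.
 */
#ifdef TUPLESORT_WRITETUP_SKETCH
static void
example_writetup(Tuplesortstate *state, LogicalTape *tape,
				 void *payload, unsigned int payloadlen)
{
	/* total on-tape size includes the payload and this leading length word */
	unsigned int tuplen = payloadlen + sizeof(unsigned int);

	LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
	LogicalTapeWrite(tape, payload, payloadlen);
	if (state->sortopt & TUPLESORT_RANDOMACCESS)
		LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));	/* trailing copy for read-backwards */

	/* outside the merge phase, give back the memory charged when the tuple was copied in */
	if (!state->slabAllocatorUsed)
	{
		FREEMEM(state, GetMemoryChunkSpace(payload));
		pfree(payload);
	}
}
#endif
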
611 /* When using this macro, beware of double evaluation of len */
612 #define LogicalTapeReadExact(tape, ptr, len) \
613  do { \
614  if (LogicalTapeRead(tape, ptr, len) != (size_t) (len)) \
615  elog(ERROR, "unexpected end of data"); \
616  } while(0)
617 
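/*
 * Editor's usage note for the macro above (guarded out): because 'len' is
 * evaluated twice, callers should pass a simple variable rather than an
 * expression with side effects.  next_len() below is hypothetical.
 */
#ifdef TUPLESORT_READEXACT_SKETCH
static void
example_read_exact(LogicalTape *tape, void *dst, unsigned int len)
{
	LogicalTapeReadExact(tape, dst, len);	/* fine: 'len' is a plain variable */
	/* LogicalTapeReadExact(tape, dst, next_len());   -- would call next_len() twice */
}
#endif
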
618 
619 static Tuplesortstate *tuplesort_begin_common(int workMem,
620  SortCoordinate coordinate,
621  int sortopt);
622 static void tuplesort_begin_batch(Tuplesortstate *state);
623 static void puttuple_common(Tuplesortstate *state, SortTuple *tuple);
624 static bool consider_abort_common(Tuplesortstate *state);
625 static void inittapes(Tuplesortstate *state, bool mergeruns);
626 static void inittapestate(Tuplesortstate *state, int maxTapes);
627 static void selectnewtape(Tuplesortstate *state);
628 static void init_slab_allocator(Tuplesortstate *state, int numSlots);
629 static void mergeruns(Tuplesortstate *state);
630 static void mergeonerun(Tuplesortstate *state);
631 static void beginmerge(Tuplesortstate *state);
632 static bool mergereadnext(Tuplesortstate *state, LogicalTape *srcTape, SortTuple *stup);
633 static void dumptuples(Tuplesortstate *state, bool alltuples);
634 static void make_bounded_heap(Tuplesortstate *state);
635 static void sort_bounded_heap(Tuplesortstate *state);
636 static void tuplesort_sort_memtuples(Tuplesortstate *state);
637 static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple);
638 static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple);
639 static void tuplesort_heap_delete_top(Tuplesortstate *state);
640 static void reversedirection(Tuplesortstate *state);
641 static unsigned int getlen(LogicalTape *tape, bool eofOK);
642 static void markrunend(LogicalTape *tape);
643 static void *readtup_alloc(Tuplesortstate *state, Size tuplen);
644 static int comparetup_heap(const SortTuple *a, const SortTuple *b,
645  Tuplesortstate *state);
646 static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup);
647 static void writetup_heap(Tuplesortstate *state, LogicalTape *tape,
648  SortTuple *stup);
649 static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
650  LogicalTape *tape, unsigned int len);
651 static int comparetup_cluster(const SortTuple *a, const SortTuple *b,
652  Tuplesortstate *state);
653 static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup);
654 static void writetup_cluster(Tuplesortstate *state, LogicalTape *tape,
655  SortTuple *stup);
656 static void readtup_cluster(Tuplesortstate *state, SortTuple *stup,
657  LogicalTape *tape, unsigned int len);
658 static int comparetup_index_btree(const SortTuple *a, const SortTuple *b,
659  Tuplesortstate *state);
660 static int comparetup_index_hash(const SortTuple *a, const SortTuple *b,
661  Tuplesortstate *state);
662 static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup);
663 static void writetup_index(Tuplesortstate *state, LogicalTape *tape,
664  SortTuple *stup);
665 static void readtup_index(Tuplesortstate *state, SortTuple *stup,
666  LogicalTape *tape, unsigned int len);
667 static int comparetup_datum(const SortTuple *a, const SortTuple *b,
668  Tuplesortstate *state);
669 static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup);
670 static void writetup_datum(Tuplesortstate *state, LogicalTape *tape,
671  SortTuple *stup);
672 static void readtup_datum(Tuplesortstate *state, SortTuple *stup,
673  LogicalTape *tape, unsigned int len);
674 static int worker_get_identifier(Tuplesortstate *state);
675 static void worker_freeze_result_tape(Tuplesortstate *state);
676 static void worker_nomergeruns(Tuplesortstate *state);
677 static void leader_takeover_tapes(Tuplesortstate *state);
678 static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup);
679 static void tuplesort_free(Tuplesortstate *state);
680 static void tuplesort_updatemax(Tuplesortstate *state);
681 
682 /*
683  * Specialized comparators that we can inline into specialized sorts. The goal
684  * is to try to sort two tuples without having to follow the pointers to the
685  * comparator or the tuple.
686  *
687  * XXX: For now, these fall back to comparator functions that will compare the
688  * leading datum a second time.
689  *
690  * XXX: For now, there is no specialization for cases where datum1 is
691  * authoritative and we don't even need to fall back to a callback at all (that
692  * would be true for types like int4/int8/timestamp/date, but not true for
693  * abbreviations of text or multi-key sorts. There could be! Is it worth it?
694  */
695 
696 /* Used if first key's comparator is ssup_datum_unsigned_compare */
697 static pg_attribute_always_inline int
698 qsort_tuple_unsigned_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state)
699 {
700  int compare;
701 
702  compare = ApplyUnsignedSortComparator(a->datum1, a->isnull1,
703  b->datum1, b->isnull1,
704  &state->sortKeys[0]);
705  if (compare != 0)
706  return compare;
707 
708  /*
709  * No need to waste effort calling the tiebreak function when there are no
710  * other keys to sort on.
711  */
712  if (state->onlyKey != NULL)
713  return 0;
714 
715  return state->comparetup(a, b, state);
716 }
717 
718 #if SIZEOF_DATUM >= 8
719 /* Used if first key's comparator is ssup_datum_signed_compare */
720 static pg_attribute_always_inline int
721 qsort_tuple_signed_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state)
722 {
723  int compare;
724 
725  compare = ApplySignedSortComparator(a->datum1, a->isnull1,
726  b->datum1, b->isnull1,
727  &state->sortKeys[0]);
728 
729  if (compare != 0)
730  return compare;
731 
732  /*
733  * No need to waste effort calling the tiebreak function when there are no
734  * other keys to sort on.
735  */
736  if (state->onlyKey != NULL)
737  return 0;
738 
739  return state->comparetup(a, b, state);
740 }
741 #endif
742 
743 /* Used if first key's comparator is ssup_datum_int32_compare */
744 static pg_attribute_always_inline int
745 qsort_tuple_int32_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state)
746 {
747  int compare;
748 
749  compare = ApplyInt32SortComparator(a->datum1, a->isnull1,
750  b->datum1, b->isnull1,
751  &state->sortKeys[0]);
752 
753  if (compare != 0)
754  return compare;
755 
756  /*
757  * No need to waste effort calling the tiebreak function when there are no
758  * other keys to sort on.
759  */
760  if (state->onlyKey != NULL)
761  return 0;
762 
763  return state->comparetup(a, b, state);
764 }
765 
766 /*
767  * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts
768  * any variant of SortTuples, using the appropriate comparetup function.
769  * qsort_ssup() is specialized for the case where the comparetup function
770  * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts
771  * and Datum sorts. qsort_tuple_{unsigned,signed,int32} are specialized for
772  * common comparison functions on pass-by-value leading datums.
773  */
774 
775 #define ST_SORT qsort_tuple_unsigned
776 #define ST_ELEMENT_TYPE SortTuple
777 #define ST_COMPARE(a, b, state) qsort_tuple_unsigned_compare(a, b, state)
778 #define ST_COMPARE_ARG_TYPE Tuplesortstate
779 #define ST_CHECK_FOR_INTERRUPTS
780 #define ST_SCOPE static
781 #define ST_DEFINE
782 #include "lib/sort_template.h"
783 
784 #if SIZEOF_DATUM >= 8
785 #define ST_SORT qsort_tuple_signed
786 #define ST_ELEMENT_TYPE SortTuple
787 #define ST_COMPARE(a, b, state) qsort_tuple_signed_compare(a, b, state)
788 #define ST_COMPARE_ARG_TYPE Tuplesortstate
789 #define ST_CHECK_FOR_INTERRUPTS
790 #define ST_SCOPE static
791 #define ST_DEFINE
792 #include "lib/sort_template.h"
793 #endif
794 
795 #define ST_SORT qsort_tuple_int32
796 #define ST_ELEMENT_TYPE SortTuple
797 #define ST_COMPARE(a, b, state) qsort_tuple_int32_compare(a, b, state)
798 #define ST_COMPARE_ARG_TYPE Tuplesortstate
799 #define ST_CHECK_FOR_INTERRUPTS
800 #define ST_SCOPE static
801 #define ST_DEFINE
802 #include "lib/sort_template.h"
803 
804 #define ST_SORT qsort_tuple
805 #define ST_ELEMENT_TYPE SortTuple
806 #define ST_COMPARE_RUNTIME_POINTER
807 #define ST_COMPARE_ARG_TYPE Tuplesortstate
808 #define ST_CHECK_FOR_INTERRUPTS
809 #define ST_SCOPE static
810 #define ST_DECLARE
811 #define ST_DEFINE
812 #include "lib/sort_template.h"
813 
814 #define ST_SORT qsort_ssup
815 #define ST_ELEMENT_TYPE SortTuple
816 #define ST_COMPARE(a, b, ssup) \
817  ApplySortComparator((a)->datum1, (a)->isnull1, \
818  (b)->datum1, (b)->isnull1, (ssup))
819 #define ST_COMPARE_ARG_TYPE SortSupportData
820 #define ST_CHECK_FOR_INTERRUPTS
821 #define ST_SCOPE static
822 #define ST_DEFINE
823 #include "lib/sort_template.h"
824 
825 /*
826  * tuplesort_begin_xxx
827  *
828  * Initialize for a tuple sort operation.
829  *
830  * After calling tuplesort_begin, the caller should call tuplesort_putXXX
831  * zero or more times, then call tuplesort_performsort when all the tuples
832  * have been supplied. After performsort, retrieve the tuples in sorted
833  * order by calling tuplesort_getXXX until it returns false/NULL. (If random
834  * access was requested, rescan, markpos, and restorepos can also be called.)
835  * Call tuplesort_end to terminate the operation and release memory/disk space.
836  *
837  * Each variant of tuplesort_begin has a workMem parameter specifying the
838  * maximum number of kilobytes of RAM to use before spilling data to disk.
839  * (The normal value of this parameter is work_mem, but some callers use
840  * other values.) Each variant also has a sortopt which is a bitmask of
841  * sort options. See TUPLESORT_* definitions in tuplesort.h
842  */
843 
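/*
 * Editor's illustrative sketch (guarded out) of the call sequence described
 * above, for the Datum case.  The begin/performsort/end calls match this
 * file; the exact tuplesort_putdatum()/tuplesort_getdatum() signatures are
 * assumed here and should be checked against tuplesort.h, and the operator
 * and GUC choices (Int4LessOperator, work_mem) are only examples.
 */
#ifdef TUPLESORT_USAGE_SKETCH
#include "catalog/pg_operator.h"
#include "catalog/pg_type.h"

static void
example_sort_int4_datums(Datum *vals, int nvals)
{
	Tuplesortstate *ts;
	Datum		val;
	Datum		abbrev;
	bool		isnull;

	ts = tuplesort_begin_datum(INT4OID, Int4LessOperator, InvalidOid,
							   false, work_mem, NULL, TUPLESORT_NONE);
	for (int i = 0; i < nvals; i++)
		tuplesort_putdatum(ts, vals[i], false); /* assumed signature */
	tuplesort_performsort(ts);
	while (tuplesort_getdatum(ts, true, &val, &isnull, &abbrev))	/* assumed signature */
		(void) val;				/* consume results in sorted order */
	tuplesort_end(ts);
}
#endif
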
844 static Tuplesortstate *
845 tuplesort_begin_common(int workMem, SortCoordinate coordinate, int sortopt)
846 {
847  Tuplesortstate *state;
848  MemoryContext maincontext;
849  MemoryContext sortcontext;
850  MemoryContext oldcontext;
851 
852  /* See leader_takeover_tapes() remarks on random access support */
853  if (coordinate && (sortopt & TUPLESORT_RANDOMACCESS))
854  elog(ERROR, "random access disallowed under parallel sort");
855 
856  /*
857  * Memory context surviving tuplesort_reset. This memory context holds
858  * data which is useful to keep while sorting multiple similar batches.
859  */
861  "TupleSort main",
863 
864  /*
865  * Create a working memory context for one sort operation. The content of
866  * this context is deleted by tuplesort_reset.
867  */
868  sortcontext = AllocSetContextCreate(maincontext,
869  "TupleSort sort",
871 
872  /*
873  * Additionally a working memory context for tuples is setup in
874  * tuplesort_begin_batch.
875  */
876 
877  /*
878  * Make the Tuplesortstate within the per-sortstate context. This way, we
879  * don't need a separate pfree() operation for it at shutdown.
880  */
881  oldcontext = MemoryContextSwitchTo(maincontext);
882 
883  state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate));
884 
885 #ifdef TRACE_SORT
886  if (trace_sort)
887  pg_rusage_init(&state->ru_start);
888 #endif
889 
890  state->sortopt = sortopt;
891  state->tuples = true;
892 
893  /*
894  * workMem is forced to be at least 64KB, the current minimum valid value
895  * for the work_mem GUC. This is a defense against parallel sort callers
896  * that divide out memory among many workers in a way that leaves each
897  * with very little memory.
898  */
899  state->allowedMem = Max(workMem, 64) * (int64) 1024;
900  state->sortcontext = sortcontext;
901  state->maincontext = maincontext;
902 
903  /*
904  * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
905  * see comments in grow_memtuples().
906  */
907  state->memtupsize = INITIAL_MEMTUPSIZE;
908  state->memtuples = NULL;
909 
910  /*
911  * After all of the other non-parallel-related state, we setup all of the
912  * state needed for each batch.
913  */
914  tuplesort_begin_batch(state);
915 
916  /*
917  * Initialize parallel-related state based on coordination information
918  * from caller
919  */
920  if (!coordinate)
921  {
922  /* Serial sort */
923  state->shared = NULL;
924  state->worker = -1;
925  state->nParticipants = -1;
926  }
927  else if (coordinate->isWorker)
928  {
929  /* Parallel worker produces exactly one final run from all input */
930  state->shared = coordinate->sharedsort;
931  state->worker = worker_get_identifier(state);
932  state->nParticipants = -1;
933  }
934  else
935  {
936  /* Parallel leader state only used for final merge */
937  state->shared = coordinate->sharedsort;
938  state->worker = -1;
939  state->nParticipants = coordinate->nParticipants;
940  Assert(state->nParticipants >= 1);
941  }
942 
943  MemoryContextSwitchTo(oldcontext);
944 
945  return state;
946 }
947 
948 /*
949  * tuplesort_begin_batch
950  *
951  * Set up, or reset, all state needed for processing a new set of tuples with this
952  * sort state. Called both from tuplesort_begin_common (the first time sorting
953  * with this sort state) and tuplesort_reset (for subsequent usages).
954  */
955 static void
956 tuplesort_begin_batch(Tuplesortstate *state)
957 {
958  MemoryContext oldcontext;
959 
960  oldcontext = MemoryContextSwitchTo(state->maincontext);
961 
962  /*
963  * Caller tuple (e.g. IndexTuple) memory context.
964  *
965  * A dedicated child context used exclusively for caller passed tuples
966  * eases memory management. Resetting at key points reduces
967  * fragmentation. Note that the memtuples array of SortTuples is allocated
968  * in the parent context, not this context, because there is no need to
969  * free memtuples early. For bounded sorts, tuples may be pfreed in any
970  * order, so we use a regular aset.c context so that it can make use of
971  * free'd memory. When the sort is not bounded, we make use of a
972  * generation.c context as this keeps allocations more compact with less
973  * wastage. Allocations are also slightly more CPU efficient.
974  */
975  if (state->sortopt & TUPLESORT_ALLOWBOUNDED)
976  state->tuplecontext = AllocSetContextCreate(state->sortcontext,
977  "Caller tuples",
979  else
980  state->tuplecontext = GenerationContextCreate(state->sortcontext,
981  "Caller tuples",
983 
984 
985  state->status = TSS_INITIAL;
986  state->bounded = false;
987  state->boundUsed = false;
988 
989  state->availMem = state->allowedMem;
990 
991  state->tapeset = NULL;
992 
993  state->memtupcount = 0;
994 
995  /*
996  * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
997  * see comments in grow_memtuples().
998  */
999  state->growmemtuples = true;
1000  state->slabAllocatorUsed = false;
1001  if (state->memtuples != NULL && state->memtupsize != INITIAL_MEMTUPSIZE)
1002  {
1003  pfree(state->memtuples);
1004  state->memtuples = NULL;
1005  state->memtupsize = INITIAL_MEMTUPSIZE;
1006  }
1007  if (state->memtuples == NULL)
1008  {
1009  state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple));
1010  USEMEM(state, GetMemoryChunkSpace(state->memtuples));
1011  }
1012 
1013  /* workMem must be large enough for the minimal memtuples array */
1014  if (LACKMEM(state))
1015  elog(ERROR, "insufficient memory allowed for sort");
1016 
1017  state->currentRun = 0;
1018 
1019  /*
1020  * Tape variables (inputTapes, outputTapes, etc.) will be initialized by
1021  * inittapes(), if needed.
1022  */
1023 
1024  state->result_tape = NULL; /* flag that result tape has not been formed */
1025 
1026  MemoryContextSwitchTo(oldcontext);
1027 }
1028 
1029 Tuplesortstate *
1030 tuplesort_begin_heap(TupleDesc tupDesc,
1031  int nkeys, AttrNumber *attNums,
1032  Oid *sortOperators, Oid *sortCollations,
1033  bool *nullsFirstFlags,
1034  int workMem, SortCoordinate coordinate, int sortopt)
1035 {
1036  Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
1037  sortopt);
1038  MemoryContext oldcontext;
1039  int i;
1040 
1041  oldcontext = MemoryContextSwitchTo(state->maincontext);
1042 
1043  AssertArg(nkeys > 0);
1044 
1045 #ifdef TRACE_SORT
1046  if (trace_sort)
1047  elog(LOG,
1048  "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
1049  nkeys, workMem, sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f');
1050 #endif
1051 
1052  state->nKeys = nkeys;
1053 
1054  TRACE_POSTGRESQL_SORT_START(HEAP_SORT,
1055  false, /* no unique check */
1056  nkeys,
1057  workMem,
1058  sortopt & TUPLESORT_RANDOMACCESS,
1059  PARALLEL_SORT(state));
1060 
1061  state->comparetup = comparetup_heap;
1062  state->copytup = copytup_heap;
1063  state->writetup = writetup_heap;
1064  state->readtup = readtup_heap;
1065  state->haveDatum1 = true;
1066 
1067  state->tupDesc = tupDesc; /* assume we need not copy tupDesc */
1068  state->abbrevNext = 10;
1069 
1070  /* Prepare SortSupport data for each column */
1071  state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));
1072 
1073  for (i = 0; i < nkeys; i++)
1074  {
1075  SortSupport sortKey = state->sortKeys + i;
1076 
1077  AssertArg(attNums[i] != 0);
1078  AssertArg(sortOperators[i] != 0);
1079 
1080  sortKey->ssup_cxt = CurrentMemoryContext;
1081  sortKey->ssup_collation = sortCollations[i];
1082  sortKey->ssup_nulls_first = nullsFirstFlags[i];
1083  sortKey->ssup_attno = attNums[i];
1084  /* Convey if abbreviation optimization is applicable in principle */
1085  sortKey->abbreviate = (i == 0 && state->haveDatum1);
1086 
1087  PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
1088  }
1089 
1090  /*
1091  * The "onlyKey" optimization cannot be used with abbreviated keys, since
1092  * tie-breaker comparisons may be required. Typically, the optimization
1093  * is only of value to pass-by-value types anyway, whereas abbreviated
1094  * keys are typically only of value to pass-by-reference types.
1095  */
1096  if (nkeys == 1 && !state->sortKeys->abbrev_converter)
1097  state->onlyKey = state->sortKeys;
1098 
1099  MemoryContextSwitchTo(oldcontext);
1100 
1101  return state;
1102 }
1103 
1104 Tuplesortstate *
1105 tuplesort_begin_cluster(TupleDesc tupDesc,
1106  Relation indexRel,
1107  int workMem,
1108  SortCoordinate coordinate, int sortopt)
1109 {
1110  Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
1111  sortopt);
1112  BTScanInsert indexScanKey;
1113  MemoryContext oldcontext;
1114  int i;
1115 
1116  Assert(indexRel->rd_rel->relam == BTREE_AM_OID);
1117 
1118  oldcontext = MemoryContextSwitchTo(state->maincontext);
1119 
1120 #ifdef TRACE_SORT
1121  if (trace_sort)
1122  elog(LOG,
1123  "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
1124  RelationGetNumberOfAttributes(indexRel),
1125  workMem, sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f');
1126 #endif
1127 
1128  state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel);
1129 
1130  TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT,
1131  false, /* no unique check */
1132  state->nKeys,
1133  workMem,
1134  sortopt & TUPLESORT_RANDOMACCESS,
1135  PARALLEL_SORT(state));
1136 
1137  state->comparetup = comparetup_cluster;
1138  state->copytup = copytup_cluster;
1139  state->writetup = writetup_cluster;
1140  state->readtup = readtup_cluster;
1141  state->abbrevNext = 10;
1142 
1143  state->indexInfo = BuildIndexInfo(indexRel);
1144 
1145  /*
1146  * If we don't have a simple leading attribute, we don't currently
1147  * initialize datum1, so disable optimizations that require it.
1148  */
1149  if (state->indexInfo->ii_IndexAttrNumbers[0] == 0)
1150  state->haveDatum1 = false;
1151  else
1152  state->haveDatum1 = true;
1153 
1154  state->tupDesc = tupDesc; /* assume we need not copy tupDesc */
1155 
1156  indexScanKey = _bt_mkscankey(indexRel, NULL);
1157 
1158  if (state->indexInfo->ii_Expressions != NULL)
1159  {
1160  TupleTableSlot *slot;
1161  ExprContext *econtext;
1162 
1163  /*
1164  * We will need to use FormIndexDatum to evaluate the index
1165  * expressions. To do that, we need an EState, as well as a
1166  * TupleTableSlot to put the table tuples into. The econtext's
1167  * scantuple has to point to that slot, too.
1168  */
1169  state->estate = CreateExecutorState();
1170  slot = MakeSingleTupleTableSlot(tupDesc, &TTSOpsHeapTuple);
1171  econtext = GetPerTupleExprContext(state->estate);
1172  econtext->ecxt_scantuple = slot;
1173  }
1174 
1175  /* Prepare SortSupport data for each column */
1176  state->sortKeys = (SortSupport) palloc0(state->nKeys *
1177  sizeof(SortSupportData));
1178 
1179  for (i = 0; i < state->nKeys; i++)
1180  {
1181  SortSupport sortKey = state->sortKeys + i;
1182  ScanKey scanKey = indexScanKey->scankeys + i;
1183  int16 strategy;
1184 
1185  sortKey->ssup_cxt = CurrentMemoryContext;
1186  sortKey->ssup_collation = scanKey->sk_collation;
1187  sortKey->ssup_nulls_first =
1188  (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1189  sortKey->ssup_attno = scanKey->sk_attno;
1190  /* Convey if abbreviation optimization is applicable in principle */
1191  sortKey->abbreviate = (i == 0 && state->haveDatum1);
1192 
1193  AssertState(sortKey->ssup_attno != 0);
1194 
1195  strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
1196  BTGreaterStrategyNumber : BTLessStrategyNumber;
1197 
1198  PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey);
1199  }
1200 
1201  pfree(indexScanKey);
1202 
1203  MemoryContextSwitchTo(oldcontext);
1204 
1205  return state;
1206 }
1207 
1208 Tuplesortstate *
1209 tuplesort_begin_index_btree(Relation heapRel,
1210  Relation indexRel,
1211  bool enforceUnique,
1212  bool uniqueNullsNotDistinct,
1213  int workMem,
1214  SortCoordinate coordinate,
1215  int sortopt)
1216 {
1217  Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
1218  sortopt);
1219  BTScanInsert indexScanKey;
1220  MemoryContext oldcontext;
1221  int i;
1222 
1223  oldcontext = MemoryContextSwitchTo(state->maincontext);
1224 
1225 #ifdef TRACE_SORT
1226  if (trace_sort)
1227  elog(LOG,
1228  "begin index sort: unique = %c, workMem = %d, randomAccess = %c",
1229  enforceUnique ? 't' : 'f',
1230  workMem, sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f');
1231 #endif
1232 
1233  state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel);
1234 
1235  TRACE_POSTGRESQL_SORT_START(INDEX_SORT,
1236  enforceUnique,
1237  state->nKeys,
1238  workMem,
1239  sortopt & TUPLESORT_RANDOMACCESS,
1240  PARALLEL_SORT(state));
1241 
1242  state->comparetup = comparetup_index_btree;
1243  state->copytup = copytup_index;
1244  state->writetup = writetup_index;
1245  state->readtup = readtup_index;
1246  state->abbrevNext = 10;
1247  state->haveDatum1 = true;
1248 
1249  state->heapRel = heapRel;
1250  state->indexRel = indexRel;
1251  state->enforceUnique = enforceUnique;
1252  state->uniqueNullsNotDistinct = uniqueNullsNotDistinct;
1253 
1254  indexScanKey = _bt_mkscankey(indexRel, NULL);
1255 
1256  /* Prepare SortSupport data for each column */
1257  state->sortKeys = (SortSupport) palloc0(state->nKeys *
1258  sizeof(SortSupportData));
1259 
1260  for (i = 0; i < state->nKeys; i++)
1261  {
1262  SortSupport sortKey = state->sortKeys + i;
1263  ScanKey scanKey = indexScanKey->scankeys + i;
1264  int16 strategy;
1265 
1266  sortKey->ssup_cxt = CurrentMemoryContext;
1267  sortKey->ssup_collation = scanKey->sk_collation;
1268  sortKey->ssup_nulls_first =
1269  (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1270  sortKey->ssup_attno = scanKey->sk_attno;
1271  /* Convey if abbreviation optimization is applicable in principle */
1272  sortKey->abbreviate = (i == 0 && state->haveDatum1);
1273 
1274  AssertState(sortKey->ssup_attno != 0);
1275 
1276  strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
1277  BTGreaterStrategyNumber : BTLessStrategyNumber;
1278 
1279  PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey);
1280  }
1281 
1282  pfree(indexScanKey);
1283 
1284  MemoryContextSwitchTo(oldcontext);
1285 
1286  return state;
1287 }
1288 
1289 Tuplesortstate *
1290 tuplesort_begin_index_hash(Relation heapRel,
1291  Relation indexRel,
1292  uint32 high_mask,
1293  uint32 low_mask,
1294  uint32 max_buckets,
1295  int workMem,
1296  SortCoordinate coordinate,
1297  int sortopt)
1298 {
1299  Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
1300  sortopt);
1301  MemoryContext oldcontext;
1302 
1303  oldcontext = MemoryContextSwitchTo(state->maincontext);
1304 
1305 #ifdef TRACE_SORT
1306  if (trace_sort)
1307  elog(LOG,
1308  "begin index sort: high_mask = 0x%x, low_mask = 0x%x, "
1309  "max_buckets = 0x%x, workMem = %d, randomAccess = %c",
1310  high_mask,
1311  low_mask,
1312  max_buckets,
1313  workMem,
1314  sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f');
1315 #endif
1316 
1317  state->nKeys = 1; /* Only one sort column, the hash code */
1318 
1319  state->comparetup = comparetup_index_hash;
1320  state->copytup = copytup_index;
1321  state->writetup = writetup_index;
1322  state->readtup = readtup_index;
1323  state->haveDatum1 = true;
1324 
1325  state->heapRel = heapRel;
1326  state->indexRel = indexRel;
1327 
1328  state->high_mask = high_mask;
1329  state->low_mask = low_mask;
1330  state->max_buckets = max_buckets;
1331 
1332  MemoryContextSwitchTo(oldcontext);
1333 
1334  return state;
1335 }
1336 
1337 Tuplesortstate *
1338 tuplesort_begin_index_gist(Relation heapRel,
1339  Relation indexRel,
1340  int workMem,
1341  SortCoordinate coordinate,
1342  int sortopt)
1343 {
1344  Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
1345  sortopt);
1346  MemoryContext oldcontext;
1347  int i;
1348 
1349  oldcontext = MemoryContextSwitchTo(state->sortcontext);
1350 
1351 #ifdef TRACE_SORT
1352  if (trace_sort)
1353  elog(LOG,
1354  "begin index sort: workMem = %d, randomAccess = %c",
1355  workMem, sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f');
1356 #endif
1357 
1358  state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel);
1359 
1360  state->comparetup = comparetup_index_btree;
1361  state->copytup = copytup_index;
1362  state->writetup = writetup_index;
1363  state->readtup = readtup_index;
1364  state->haveDatum1 = true;
1365 
1366  state->heapRel = heapRel;
1367  state->indexRel = indexRel;
1368 
1369  /* Prepare SortSupport data for each column */
1370  state->sortKeys = (SortSupport) palloc0(state->nKeys *
1371  sizeof(SortSupportData));
1372 
1373  for (i = 0; i < state->nKeys; i++)
1374  {
1375  SortSupport sortKey = state->sortKeys + i;
1376 
1377  sortKey->ssup_cxt = CurrentMemoryContext;
1378  sortKey->ssup_collation = indexRel->rd_indcollation[i];
1379  sortKey->ssup_nulls_first = false;
1380  sortKey->ssup_attno = i + 1;
1381  /* Convey if abbreviation optimization is applicable in principle */
1382  sortKey->abbreviate = (i == 0 && state->haveDatum1);
1383 
1384  AssertState(sortKey->ssup_attno != 0);
1385 
1386  /* Look for a sort support function */
1387  PrepareSortSupportFromGistIndexRel(indexRel, sortKey);
1388  }
1389 
1390  MemoryContextSwitchTo(oldcontext);
1391 
1392  return state;
1393 }
1394 
1395 Tuplesortstate *
1396 tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation,
1397  bool nullsFirstFlag, int workMem,
1398  SortCoordinate coordinate, int sortopt)
1399 {
1400  Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
1401  sortopt);
1402  MemoryContext oldcontext;
1403  int16 typlen;
1404  bool typbyval;
1405 
1406  oldcontext = MemoryContextSwitchTo(state->maincontext);
1407 
1408 #ifdef TRACE_SORT
1409  if (trace_sort)
1410  elog(LOG,
1411  "begin datum sort: workMem = %d, randomAccess = %c",
1412  workMem, sortopt & TUPLESORT_RANDOMACCESS ? 't' : 'f');
1413 #endif
1414 
1415  state->nKeys = 1; /* always a one-column sort */
1416 
1417  TRACE_POSTGRESQL_SORT_START(DATUM_SORT,
1418  false, /* no unique check */
1419  1,
1420  workMem,
1421  sortopt & TUPLESORT_RANDOMACCESS,
1422  PARALLEL_SORT(state));
1423 
1424  state->comparetup = comparetup_datum;
1425  state->copytup = copytup_datum;
1426  state->writetup = writetup_datum;
1427  state->readtup = readtup_datum;
1428  state->abbrevNext = 10;
1429  state->haveDatum1 = true;
1430 
1431  state->datumType = datumType;
1432 
1433  /* lookup necessary attributes of the datum type */
1434  get_typlenbyval(datumType, &typlen, &typbyval);
1435  state->datumTypeLen = typlen;
1436  state->tuples = !typbyval;
1437 
1438  /* Prepare SortSupport data */
1439  state->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData));
1440 
1441  state->sortKeys->ssup_cxt = CurrentMemoryContext;
1442  state->sortKeys->ssup_collation = sortCollation;
1443  state->sortKeys->ssup_nulls_first = nullsFirstFlag;
1444 
1445  /*
1446  * Abbreviation is possible here only for by-reference types. In theory,
1447  * a pass-by-value datatype could have an abbreviated form that is cheaper
1448  * to compare. In a tuple sort, we could support that, because we can
1449  * always extract the original datum from the tuple as needed. Here, we
1450  * can't, because a datum sort only stores a single copy of the datum; the
1451  * "tuple" field of each SortTuple is NULL.
1452  */
1453  state->sortKeys->abbreviate = !typbyval;
1454 
1455  PrepareSortSupportFromOrderingOp(sortOperator, state->sortKeys);
1456 
1457  /*
1458  * The "onlyKey" optimization cannot be used with abbreviated keys, since
1459  * tie-breaker comparisons may be required. Typically, the optimization
1460  * is only of value to pass-by-value types anyway, whereas abbreviated
1461  * keys are typically only of value to pass-by-reference types.
1462  */
1463  if (!state->sortKeys->abbrev_converter)
1464  state->onlyKey = state->sortKeys;
1465 
1466  MemoryContextSwitchTo(oldcontext);
1467 
1468  return state;
1469 }
1470 
1471 /*
1472  * tuplesort_set_bound
1473  *
1474  * Advise tuplesort that at most the first N result tuples are required.
1475  *
1476  * Must be called before inserting any tuples. (Actually, we could allow it
1477  * as long as the sort hasn't spilled to disk, but there seems no need for
1478  * delayed calls at the moment.)
1479  *
1480  * This is a hint only. The tuplesort may still return more tuples than
1481  * requested. Parallel leader tuplesorts will always ignore the hint.
1482  */
1483 void
1484 tuplesort_set_bound(Tuplesortstate *state, int64 bound)
1485 {
1486  /* Assert we're called before loading any tuples */
1487  Assert(state->status == TSS_INITIAL && state->memtupcount == 0);
1488  /* Assert we allow bounded sorts */
1489  Assert(state->sortopt & TUPLESORT_ALLOWBOUNDED);
1490  /* Can't set the bound twice, either */
1491  Assert(!state->bounded);
1492  /* Also, this shouldn't be called in a parallel worker */
1493  Assert(!WORKER(state));
1494 
1495  /* Parallel leader allows but ignores hint */
1496  if (LEADER(state))
1497  return;
1498 
1499 #ifdef DEBUG_BOUNDED_SORT
1500  /* Honor GUC setting that disables the feature (for easy testing) */
1501  if (!optimize_bounded_sort)
1502  return;
1503 #endif
1504 
1505  /* We want to be able to compute bound * 2, so limit the setting */
1506  if (bound > (int64) (INT_MAX / 2))
1507  return;
1508 
1509  state->bounded = true;
1510  state->bound = (int) bound;
1511 
1512  /*
1513  * Bounded sorts are not an effective target for abbreviated key
1514  * optimization. Disable by setting state to be consistent with no
1515  * abbreviation support.
1516  */
1517  state->sortKeys->abbrev_converter = NULL;
1518  if (state->sortKeys->abbrev_full_comparator)
1519  state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
1520 
1521  /* Not strictly necessary, but be tidy */
1522  state->sortKeys->abbrev_abort = NULL;
1523  state->sortKeys->abbrev_full_comparator = NULL;
1524 }
1525 
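/*
 * Editor's illustrative sketch (guarded out) of the bounded-sort hint
 * described above: a caller that needs only the first 'limit' results passes
 * TUPLESORT_ALLOWBOUNDED at begin time and calls tuplesort_set_bound()
 * before loading any tuples.  The begin_datum arguments are examples only.
 */
#ifdef TUPLESORT_BOUND_SKETCH
#include "catalog/pg_operator.h"
#include "catalog/pg_type.h"

static Tuplesortstate *
example_begin_bounded_int4_sort(int64 limit)
{
	Tuplesortstate *ts;

	ts = tuplesort_begin_datum(INT4OID, Int4LessOperator, InvalidOid,
							   false, work_mem, NULL, TUPLESORT_ALLOWBOUNDED);
	tuplesort_set_bound(ts, limit); /* hint: only 'limit' tuples are needed */
	return ts;
}
#endif
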
1526 /*
1527  * tuplesort_used_bound
1528  *
1529  * Allow callers to find out if the sort state was able to use a bound.
1530  */
1531 bool
1532 tuplesort_used_bound(Tuplesortstate *state)
1533 {
1534  return state->boundUsed;
1535 }
1536 
1537 /*
1538  * tuplesort_free
1539  *
1540  * Internal routine for freeing resources of tuplesort.
1541  */
1542 static void
1543 tuplesort_free(Tuplesortstate *state)
1544 {
1545  /* context swap probably not needed, but let's be safe */
1546  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1547 
1548 #ifdef TRACE_SORT
1549  long spaceUsed;
1550 
1551  if (state->tapeset)
1552  spaceUsed = LogicalTapeSetBlocks(state->tapeset);
1553  else
1554  spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024;
1555 #endif
1556 
1557  /*
1558  * Delete temporary "tape" files, if any.
1559  *
1560  * Note: want to include this in reported total cost of sort, hence need
1561  * for two #ifdef TRACE_SORT sections.
1562  *
1563  * We don't bother to destroy the individual tapes here. They will go away
1564  * with the sortcontext. (In TSS_FINALMERGE state, we have closed
1565  * finished tapes already.)
1566  */
1567  if (state->tapeset)
1568  LogicalTapeSetClose(state->tapeset);
1569 
1570 #ifdef TRACE_SORT
1571  if (trace_sort)
1572  {
1573  if (state->tapeset)
1574  elog(LOG, "%s of worker %d ended, %ld disk blocks used: %s",
1575  SERIAL(state) ? "external sort" : "parallel external sort",
1576  state->worker, spaceUsed, pg_rusage_show(&state->ru_start));
1577  else
1578  elog(LOG, "%s of worker %d ended, %ld KB used: %s",
1579  SERIAL(state) ? "internal sort" : "unperformed parallel sort",
1580  state->worker, spaceUsed, pg_rusage_show(&state->ru_start));
1581  }
1582 
1583  TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed);
1584 #else
1585 
1586  /*
1587  * If you disabled TRACE_SORT, you can still probe sort__done, but you
1588  * ain't getting space-used stats.
1589  */
1590  TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L);
1591 #endif
1592 
1593  /* Free any execution state created for CLUSTER case */
1594  if (state->estate != NULL)
1595  {
1596  ExprContext *econtext = GetPerTupleExprContext(state->estate);
1597 
1598  ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple);
1599  FreeExecutorState(state->estate);
1600  }
1601 
1602  MemoryContextSwitchTo(oldcontext);
1603 
1604  /*
1605  * Free the per-sort memory context, thereby releasing all working memory.
1606  */
1607  MemoryContextReset(state->sortcontext);
1608 }
1609 
1610 /*
1611  * tuplesort_end
1612  *
1613  * Release resources and clean up.
1614  *
1615  * NOTE: after calling this, any pointers returned by tuplesort_getXXX are
1616  * pointing to garbage. Be careful not to attempt to use or free such
1617  * pointers afterwards!
1618  */
1619 void
1620 tuplesort_end(Tuplesortstate *state)
1621 {
1622  tuplesort_free(state);
1623 
1624  /*
1625  * Free the main memory context, including the Tuplesortstate struct
1626  * itself.
1627  */
1628  MemoryContextDelete(state->maincontext);
1629 }
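To show where tuplesort_end() falls in the overall call sequence, a minimal end-to-end sketch of a Datum sort (illustrative only; the helper is hypothetical, the header names for INT4OID and Int4LessOperator are assumptions, and tuplesort_begin_datum()'s signature is assumed from the PostgreSQL 15-era API):

#include "postgres.h"
#include "catalog/pg_operator_d.h"   /* Int4LessOperator (assumed header) */
#include "catalog/pg_type_d.h"       /* INT4OID (assumed header) */
#include "utils/tuplesort.h"

/* Hypothetical helper: sort an array of int4 Datums in place. */
static void
sort_int4_datums(Datum *values, int nvalues, int workMem)
{
    Tuplesortstate *ts;
    int         i;

    ts = tuplesort_begin_datum(INT4OID, Int4LessOperator, InvalidOid,
                               false, workMem, NULL, TUPLESORT_NONE);

    for (i = 0; i < nvalues; i++)
        tuplesort_putdatum(ts, values[i], false);

    tuplesort_performsort(ts);

    for (i = 0; i < nvalues; i++)
    {
        Datum       val;
        bool        isnull;

        if (!tuplesort_getdatum(ts, true, &val, &isnull, NULL))
            break;
        values[i] = val;
    }

    /* Releases all memory and temp files; previously returned pointers die. */
    tuplesort_end(ts);
}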
1630 
1631 /*
1632  * tuplesort_updatemax
1633  *
1634  * Update maximum resource usage statistics.
1635  */
1636 static void
1637 tuplesort_updatemax(Tuplesortstate *state)
1638 {
1639  int64 spaceUsed;
1640  bool isSpaceDisk;
1641 
1642  /*
1643  * Note: it might seem we should provide both memory and disk usage for a
1644  * disk-based sort. However, the current code doesn't track memory space
1645  * accurately once we have begun to return tuples to the caller (since we
1646  * don't account for pfree's the caller is expected to do), so we cannot
1647  * rely on availMem in a disk sort. This does not seem worth the overhead
1648  * to fix. Is it worth creating an API for the memory context code to
1649  * tell us how much is actually used in sortcontext?
1650  */
1651  if (state->tapeset)
1652  {
1653  isSpaceDisk = true;
1654  spaceUsed = LogicalTapeSetBlocks(state->tapeset) * BLCKSZ;
1655  }
1656  else
1657  {
1658  isSpaceDisk = false;
1659  spaceUsed = state->allowedMem - state->availMem;
1660  }
1661 
1662  /*
1663  * A sort spills data to disk only when that data did not fit in main
1664  * memory, so we treat disk space used as more significant than memory
1665  * space used when tracking resource usage. Note that the amount of space
1666  * occupied by a set of tuples on disk might be less than the amount
1667  * occupied by the same tuples in memory, because the on-disk
1668  * representation is more compact.
1669  */
1670  if ((isSpaceDisk && !state->isMaxSpaceDisk) ||
1671  (isSpaceDisk == state->isMaxSpaceDisk && spaceUsed > state->maxSpace))
1672  {
1673  state->maxSpace = spaceUsed;
1674  state->isMaxSpaceDisk = isSpaceDisk;
1675  state->maxSpaceStatus = state->status;
1676  }
1677 }
1678 
1679 /*
1680  * tuplesort_reset
1681  *
1682  * Reset the tuplesort. All of the sort's data is released, but the
1683  * meta-information is kept, so that after tuplesort_reset the tuplesort
1684  * is ready to start a new sort. This avoids recreating tuplesort states
1685  * (and so saves resources) when sorting multiple small batches.
1686  */
1687 void
1688 tuplesort_reset(Tuplesortstate *state)
1689 {
1690  tuplesort_updatemax(state);
1691  tuplesort_free(state);
1692 
1693  /*
1694  * After we've freed up per-batch memory, re-setup all of the state common
1695  * to both the first batch and any subsequent batch.
1696  */
1697  tuplesort_begin_batch(state);
1698 
1699  state->lastReturnedTuple = NULL;
1700  state->slabMemoryBegin = NULL;
1701  state->slabMemoryEnd = NULL;
1702  state->slabFreeHead = NULL;
1703 }
1704 
1705 /*
1706  * Grow the memtuples[] array, if possible within our memory constraint. We
1707  * must not exceed INT_MAX tuples in memory or the caller-provided memory
1708  * limit. Return true if we were able to enlarge the array, false if not.
1709  *
1710  * Normally, at each increment we double the size of the array. When doing
1711  * that would exceed a limit, we attempt one last, smaller increase (and then
1712  * clear the growmemtuples flag so we don't try any more). That allows us to
1713  * use memory as fully as permitted; sticking to the pure doubling rule could
1714  * result in almost half going unused. Because availMem moves around with
1715  * tuple addition/removal, we need some rule to prevent making repeated small
1716  * increases in memtupsize, which would just be useless thrashing. The
1717  * growmemtuples flag accomplishes that and also prevents useless
1718  * recalculations in this function.
1719  */
1720 static bool
1721 grow_memtuples(Tuplesortstate *state)
1722 {
1723  int newmemtupsize;
1724  int memtupsize = state->memtupsize;
1725  int64 memNowUsed = state->allowedMem - state->availMem;
1726 
1727  /* Forget it if we've already maxed out memtuples, per comment above */
1728  if (!state->growmemtuples)
1729  return false;
1730 
1731  /* Select new value of memtupsize */
1732  if (memNowUsed <= state->availMem)
1733  {
1734  /*
1735  * We've used no more than half of allowedMem; double our usage,
1736  * clamping at INT_MAX tuples.
1737  */
1738  if (memtupsize < INT_MAX / 2)
1739  newmemtupsize = memtupsize * 2;
1740  else
1741  {
1742  newmemtupsize = INT_MAX;
1743  state->growmemtuples = false;
1744  }
1745  }
1746  else
1747  {
1748  /*
1749  * This will be the last increment of memtupsize. Abandon doubling
1750  * strategy and instead increase as much as we safely can.
1751  *
1752  * To stay within allowedMem, we can't increase memtupsize by more
1753  * than availMem / sizeof(SortTuple) elements. In practice, we want
1754  * to increase it by considerably less, because we need to leave some
1755  * space for the tuples to which the new array slots will refer. We
1756  * assume the new tuples will be about the same size as the tuples
1757  * we've already seen, and thus we can extrapolate from the space
1758  * consumption so far to estimate an appropriate new size for the
1759  * memtuples array. The optimal value might be higher or lower than
1760  * this estimate, but it's hard to know that in advance. We again
1761  * clamp at INT_MAX tuples.
1762  *
1763  * This calculation is safe against enlarging the array so much that
1764  * LACKMEM becomes true, because the memory currently used includes
1765  * the present array; thus, there would be enough allowedMem for the
1766  * new array elements even if no other memory were currently used.
1767  *
1768  * We do the arithmetic in float8, because otherwise the product of
1769  * memtupsize and allowedMem could overflow. Any inaccuracy in the
1770  * result should be insignificant; but even if we computed a
1771  * completely insane result, the checks below will prevent anything
1772  * really bad from happening.
1773  */
1774  double grow_ratio;
1775 
1776  grow_ratio = (double) state->allowedMem / (double) memNowUsed;
1777  if (memtupsize * grow_ratio < INT_MAX)
1778  newmemtupsize = (int) (memtupsize * grow_ratio);
1779  else
1780  newmemtupsize = INT_MAX;
1781 
1782  /* We won't make any further enlargement attempts */
1783  state->growmemtuples = false;
1784  }
1785 
1786  /* Must enlarge array by at least one element, else report failure */
1787  if (newmemtupsize <= memtupsize)
1788  goto noalloc;
1789 
1790  /*
1791  * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize. Clamp
1792  * to ensure our request won't be rejected. Note that we can easily
1793  * exhaust address space before facing this outcome. (This is presently
1794  * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but
1795  * don't rely on that at this distance.)
1796  */
1797  if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple))
1798  {
1799  newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple));
1800  state->growmemtuples = false; /* can't grow any more */
1801  }
1802 
1803  /*
1804  * We need to be sure that we do not cause LACKMEM to become true, else
1805  * the space management algorithm will go nuts. The code above should
1806  * never generate a dangerous request, but to be safe, check explicitly
1807  * that the array growth fits within availMem. (We could still cause
1808  * LACKMEM if the memory chunk overhead associated with the memtuples
1809  * array were to increase. That shouldn't happen because we chose the
1810  * initial array size large enough to ensure that palloc will be treating
1811  * both old and new arrays as separate chunks. But we'll check LACKMEM
1812  * explicitly below just in case.)
1813  */
1814  if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple)))
1815  goto noalloc;
1816 
1817  /* OK, do it */
1818  FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
1819  state->memtupsize = newmemtupsize;
1820  state->memtuples = (SortTuple *)
1821  repalloc_huge(state->memtuples,
1822  state->memtupsize * sizeof(SortTuple));
1823  USEMEM(state, GetMemoryChunkSpace(state->memtuples));
1824  if (LACKMEM(state))
1825  elog(ERROR, "unexpected out-of-memory situation in tuplesort");
1826  return true;
1827 
1828 noalloc:
1829  /* If for any reason we didn't realloc, shut off future attempts */
1830  state->growmemtuples = false;
1831  return false;
1832 }
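As a concrete illustration of the final-increment arithmetic above, a small standalone sketch with made-up numbers (it mirrors the grow_ratio computation; the values are not from a real run):

#include <stdio.h>

/* Illustrative only: mirrors grow_memtuples()'s final-increment estimate. */
int
main(void)
{
    long long   allowedMem = 4LL * 1024 * 1024;  /* 4 MB work_mem */
    long long   memNowUsed = 3LL * 1024 * 1024;  /* > half used: no more doubling */
    int         memtupsize = 32768;              /* current array slots */
    double      grow_ratio = (double) allowedMem / (double) memNowUsed;
    int         newmemtupsize = (int) (memtupsize * grow_ratio);

    /* Extrapolates from space used so far: roughly 33% more slots, not 2x. */
    printf("grow_ratio = %.3f, new memtupsize = %d\n", grow_ratio, newmemtupsize);
    return 0;
}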
1833 
1834 /*
1835  * Accept one tuple while collecting input data for sort.
1836  *
1837  * Note that the input data is always copied; the caller need not save it.
1838  */
1839 void
1840 tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot)
1841 {
1842  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1843  SortTuple stup;
1844 
1845  /*
1846  * Copy the given tuple into memory we control, and decrease availMem.
1847  * Then call the common code.
1848  */
1849  COPYTUP(state, &stup, (void *) slot);
1850 
1851  puttuple_common(state, &stup);
1852 
1853  MemoryContextSwitchTo(oldcontext);
1854 }
1855 
1856 /*
1857  * Accept one tuple while collecting input data for sort.
1858  *
1859  * Note that the input data is always copied; the caller need not save it.
1860  */
1861 void
1862 tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
1863 {
1864  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1865  SortTuple stup;
1866 
1867  /*
1868  * Copy the given tuple into memory we control, and decrease availMem.
1869  * Then call the common code.
1870  */
1871  COPYTUP(state, &stup, (void *) tup);
1872 
1873  puttuple_common(state, &stup);
1874 
1875  MemoryContextSwitchTo(oldcontext);
1876 }
1877 
1878 /*
1879  * Collect one index tuple while collecting input data for sort, building
1880  * it from caller-supplied values.
1881  */
1882 void
1883 tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel,
1884  ItemPointer self, Datum *values,
1885  bool *isnull)
1886 {
1887  MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
1888  SortTuple stup;
1889  Datum original;
1890  IndexTuple tuple;
1891 
1892  stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull);
1893  tuple = ((IndexTuple) stup.tuple);
1894  tuple->t_tid = *self;
1895  USEMEM(state, GetMemoryChunkSpace(stup.tuple));
1896  /* set up first-column key value */
1897  original = index_getattr(tuple,
1898  1,
1899  RelationGetDescr(state->indexRel),
1900  &stup.isnull1);
1901 
1902  MemoryContextSwitchTo(state->sortcontext);
1903 
1904  if (!state->sortKeys || !state->sortKeys->abbrev_converter || stup.isnull1)
1905  {
1906  /*
1907  * Store ordinary Datum representation, or NULL value. If there is a
1908  * converter it won't expect NULL values, and the cost model is not
1909  * required to account for NULLs, so in that case we avoid calling the
1910  * converter and just set datum1 to a zeroed representation (to be
1911  * consistent, and to support cheap inequality tests for NULL
1912  * abbreviated keys).
1913  */
1914  stup.datum1 = original;
1915  }
1916  else if (!consider_abort_common(state))
1917  {
1918  /* Store abbreviated key representation */
1919  stup.datum1 = state->sortKeys->abbrev_converter(original,
1920  state->sortKeys);
1921  }
1922  else
1923  {
1924  /* Abort abbreviation */
1925  int i;
1926 
1927  stup.datum1 = original;
1928 
1929  /*
1930  * Set state to be consistent with never trying abbreviation.
1931  *
1932  * Alter datum1 representation in already-copied tuples, so as to
1933  * ensure a consistent representation (current tuple was just
1934  * handled). It does not matter if some dumped tuples are already
1935  * sorted on tape, since serialized tuples lack abbreviated keys
1936  * (TSS_BUILDRUNS state prevents control reaching here in any case).
1937  */
1938  for (i = 0; i < state->memtupcount; i++)
1939  {
1940  SortTuple *mtup = &state->memtuples[i];
1941 
1942  tuple = mtup->tuple;
1943  mtup->datum1 = index_getattr(tuple,
1944  1,
1945  RelationGetDescr(state->indexRel),
1946  &mtup->isnull1);
1947  }
1948  }
1949 
1950  puttuple_common(state, &stup);
1951 
1952  MemoryContextSwitchTo(oldcontext);
1953 }
1954 
1955 /*
1956  * Accept one Datum while collecting input data for sort.
1957  *
1958  * If the Datum is pass-by-ref type, the value will be copied.
1959  */
1960 void
1961 tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull)
1962 {
1963  MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
1964  SortTuple stup;
1965 
1966  /*
1967  * Pass-by-value types or null values are just stored directly in
1968  * stup.datum1 (and stup.tuple is not used and set to NULL).
1969  *
1970  * Non-null pass-by-reference values need to be copied into memory we
1971  * control, and possibly abbreviated. The copied value is pointed to by
1972  * stup.tuple and is treated as the canonical copy (e.g. to return via
1973  * tuplesort_getdatum or when writing to tape); stup.datum1 gets the
1974  * abbreviated value if abbreviation is happening, otherwise it's
1975  * identical to stup.tuple.
1976  */
1977 
1978  if (isNull || !state->tuples)
1979  {
1980  /*
1981  * Set datum1 to zeroed representation for NULLs (to be consistent,
1982  * and to support cheap inequality tests for NULL abbreviated keys).
1983  */
1984  stup.datum1 = !isNull ? val : (Datum) 0;
1985  stup.isnull1 = isNull;
1986  stup.tuple = NULL; /* no separate storage */
1987  MemoryContextSwitchTo(state->sortcontext);
1988  }
1989  else
1990  {
1991  Datum original = datumCopy(val, false, state->datumTypeLen);
1992 
1993  stup.isnull1 = false;
1994  stup.tuple = DatumGetPointer(original);
1995  USEMEM(state, GetMemoryChunkSpace(stup.tuple));
1996  MemoryContextSwitchTo(state->sortcontext);
1997 
1998  if (!state->sortKeys->abbrev_converter)
1999  {
2000  stup.datum1 = original;
2001  }
2002  else if (!consider_abort_common(state))
2003  {
2004  /* Store abbreviated key representation */
2005  stup.datum1 = state->sortKeys->abbrev_converter(original,
2006  state->sortKeys);
2007  }
2008  else
2009  {
2010  /* Abort abbreviation */
2011  int i;
2012 
2013  stup.datum1 = original;
2014 
2015  /*
2016  * Set state to be consistent with never trying abbreviation.
2017  *
2018  * Alter datum1 representation in already-copied tuples, so as to
2019  * ensure a consistent representation (current tuple was just
2020  * handled). It does not matter if some dumped tuples are already
2021  * sorted on tape, since serialized tuples lack abbreviated keys
2022  * (TSS_BUILDRUNS state prevents control reaching here in any
2023  * case).
2024  */
2025  for (i = 0; i < state->memtupcount; i++)
2026  {
2027  SortTuple *mtup = &state->memtuples[i];
2028 
2029  mtup->datum1 = PointerGetDatum(mtup->tuple);
2030  }
2031  }
2032  }
2033 
2034  puttuple_common(state, &stup);
2035 
2036  MemoryContextSwitchTo(oldcontext);
2037 }
2038 
2039 /*
2040  * Shared code for tuple and datum cases.
2041  */
2042 static void
2043 puttuple_common(Tuplesortstate *state, SortTuple *tuple)
2044 {
2045  Assert(!LEADER(state));
2046 
2047  switch (state->status)
2048  {
2049  case TSS_INITIAL:
2050 
2051  /*
2052  * Save the tuple into the unsorted array. First, grow the array
2053  * as needed. Note that we try to grow the array when there is
2054  * still one free slot remaining --- if we fail, there'll still be
2055  * room to store the incoming tuple, and then we'll switch to
2056  * tape-based operation.
2057  */
2058  if (state->memtupcount >= state->memtupsize - 1)
2059  {
2060  (void) grow_memtuples(state);
2061  Assert(state->memtupcount < state->memtupsize);
2062  }
2063  state->memtuples[state->memtupcount++] = *tuple;
2064 
2065  /*
2066  * Check if it's time to switch over to a bounded heapsort. We do
2067  * so if the input tuple count exceeds twice the desired tuple
2068  * count (this is a heuristic for where heapsort becomes cheaper
2069  * than a quicksort), or if we've just filled workMem and have
2070  * enough tuples to meet the bound.
2071  *
2072  * Note that once we enter TSS_BOUNDED state we will always try to
2073  * complete the sort that way. In the worst case, if later input
2074  * tuples are larger than earlier ones, this might cause us to
2075  * exceed workMem significantly.
2076  */
2077  if (state->bounded &&
2078  (state->memtupcount > state->bound * 2 ||
2079  (state->memtupcount > state->bound && LACKMEM(state))))
2080  {
2081 #ifdef TRACE_SORT
2082  if (trace_sort)
2083  elog(LOG, "switching to bounded heapsort at %d tuples: %s",
2084  state->memtupcount,
2085  pg_rusage_show(&state->ru_start));
2086 #endif
2087  make_bounded_heap(state);
2088  return;
2089  }
2090 
2091  /*
2092  * Done if we still fit in available memory and have array slots.
2093  */
2094  if (state->memtupcount < state->memtupsize && !LACKMEM(state))
2095  return;
2096 
2097  /*
2098  * Nope; time to switch to tape-based operation.
2099  */
2100  inittapes(state, true);
2101 
2102  /*
2103  * Dump all tuples.
2104  */
2105  dumptuples(state, false);
2106  break;
2107 
2108  case TSS_BOUNDED:
2109 
2110  /*
2111  * We don't want to grow the array here, so check whether the new
2112  * tuple can be discarded before putting it in. This should be a
2113  * good speed optimization, too, since when there are many more
2114  * input tuples than the bound, most input tuples can be discarded
2115  * with just this one comparison. Note that because we currently
2116  * have the sort direction reversed, we must check for <= not >=.
2117  */
2118  if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0)
2119  {
2120  /* new tuple <= top of the heap, so we can discard it */
2121  free_sort_tuple(state, tuple);
2122  CHECK_FOR_INTERRUPTS();
2123  }
2124  else
2125  {
2126  /* discard top of heap, replacing it with the new tuple */
2127  free_sort_tuple(state, &state->memtuples[0]);
2128  tuplesort_heap_replace_top(state, tuple);
2129  }
2130  break;
2131 
2132  case TSS_BUILDRUNS:
2133 
2134  /*
2135  * Save the tuple into the unsorted array (there must be space)
2136  */
2137  state->memtuples[state->memtupcount++] = *tuple;
2138 
2139  /*
2140  * If we are over the memory limit, dump all tuples.
2141  */
2142  dumptuples(state, false);
2143  break;
2144 
2145  default:
2146  elog(ERROR, "invalid tuplesort state");
2147  break;
2148  }
2149 }
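To put concrete numbers on the TSS_INITIAL switch-over heuristic above (illustrative only): with a bound of 100, the sort switches to the bounded heap as soon as the 201st input tuple arrives, or earlier if workMem fills up while more than 100 tuples are already in memory; without a bound, filling workMem instead triggers inittapes() and the first dumptuples() call.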
2150 
2151 static bool
2152 consider_abort_common(Tuplesortstate *state)
2153 {
2154  Assert(state->sortKeys[0].abbrev_converter != NULL);
2155  Assert(state->sortKeys[0].abbrev_abort != NULL);
2156  Assert(state->sortKeys[0].abbrev_full_comparator != NULL);
2157 
2158  /*
2159  * Check effectiveness of abbreviation optimization. Consider aborting
2160  * when still within memory limit.
2161  */
2162  if (state->status == TSS_INITIAL &&
2163  state->memtupcount >= state->abbrevNext)
2164  {
2165  state->abbrevNext *= 2;
2166 
2167  /*
2168  * Check opclass-supplied abbreviation abort routine. It may indicate
2169  * that abbreviation should not proceed.
2170  */
2171  if (!state->sortKeys->abbrev_abort(state->memtupcount,
2172  state->sortKeys))
2173  return false;
2174 
2175  /*
2176  * Finally, restore authoritative comparator, and indicate that
2177  * abbreviation is not in play by setting abbrev_converter to NULL
2178  */
2179  state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator;
2180  state->sortKeys[0].abbrev_converter = NULL;
2181  /* Not strictly necessary, but be tidy */
2182  state->sortKeys[0].abbrev_abort = NULL;
2183  state->sortKeys[0].abbrev_full_comparator = NULL;
2184 
2185  /* Give up - expect original pass-by-value representation */
2186  return true;
2187  }
2188 
2189  return false;
2190 }
2191 
2192 /*
2193  * All tuples have been provided; finish the sort.
2194  */
2195 void
2196 tuplesort_performsort(Tuplesortstate *state)
2197 {
2198  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2199 
2200 #ifdef TRACE_SORT
2201  if (trace_sort)
2202  elog(LOG, "performsort of worker %d starting: %s",
2203  state->worker, pg_rusage_show(&state->ru_start));
2204 #endif
2205 
2206  switch (state->status)
2207  {
2208  case TSS_INITIAL:
2209 
2210  /*
2211  * We were able to accumulate all the tuples within the allowed
2212  * amount of memory, or we are the leader, about to take over the worker tapes.
2213  */
2214  if (SERIAL(state))
2215  {
2216  /* Just qsort 'em and we're done */
2217  tuplesort_sort_memtuples(state);
2218  state->status = TSS_SORTEDINMEM;
2219  }
2220  else if (WORKER(state))
2221  {
2222  /*
2223  * Parallel workers must still dump out tuples to tape. No
2224  * merge is required to produce a single output run, though.
2225  */
2226  inittapes(state, false);
2227  dumptuples(state, true);
2228  worker_nomergeruns(state);
2229  state->status = TSS_SORTEDONTAPE;
2230  }
2231  else
2232  {
2233  /*
2234  * Leader will take over worker tapes and merge worker runs.
2235  * Note that mergeruns sets the correct state->status.
2236  */
2237  leader_takeover_tapes(state);
2238  mergeruns(state);
2239  }
2240  state->current = 0;
2241  state->eof_reached = false;
2242  state->markpos_block = 0L;
2243  state->markpos_offset = 0;
2244  state->markpos_eof = false;
2245  break;
2246 
2247  case TSS_BOUNDED:
2248 
2249  /*
2250  * We were able to accumulate all the tuples required for output
2251  * in memory, using a heap to eliminate excess tuples. Now we
2252  * have to transform the heap to a properly-sorted array.
2253  */
2254  sort_bounded_heap(state);
2255  state->current = 0;
2256  state->eof_reached = false;
2257  state->markpos_offset = 0;
2258  state->markpos_eof = false;
2259  state->status = TSS_SORTEDINMEM;
2260  break;
2261 
2262  case TSS_BUILDRUNS:
2263 
2264  /*
2265  * Finish tape-based sort. First, flush all tuples remaining in
2266  * memory out to tape; then merge until we have a single remaining
2267  * run (or, if !randomAccess and !WORKER(), one run per tape).
2268  * Note that mergeruns sets the correct state->status.
2269  */
2270  dumptuples(state, true);
2271  mergeruns(state);
2272  state->eof_reached = false;
2273  state->markpos_block = 0L;
2274  state->markpos_offset = 0;
2275  state->markpos_eof = false;
2276  break;
2277 
2278  default:
2279  elog(ERROR, "invalid tuplesort state");
2280  break;
2281  }
2282 
2283 #ifdef TRACE_SORT
2284  if (trace_sort)
2285  {
2286  if (state->status == TSS_FINALMERGE)
2287  elog(LOG, "performsort of worker %d done (except %d-way final merge): %s",
2288  state->worker, state->nInputTapes,
2289  pg_rusage_show(&state->ru_start));
2290  else
2291  elog(LOG, "performsort of worker %d done: %s",
2292  state->worker, pg_rusage_show(&state->ru_start));
2293  }
2294 #endif
2295 
2296  MemoryContextSwitchTo(oldcontext);
2297 }
2298 
2299 /*
2300  * Internal routine to fetch the next tuple in either forward or back
2301  * direction into *stup. Returns false if no more tuples.
2302  * Returned tuple belongs to tuplesort memory context, and must not be freed
2303  * by caller. Note that fetched tuple is stored in memory that may be
2304  * recycled by any future fetch.
2305  */
2306 static bool
2307 tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
2308  SortTuple *stup)
2309 {
2310  unsigned int tuplen;
2311  size_t nmoved;
2312 
2313  Assert(!WORKER(state));
2314 
2315  switch (state->status)
2316  {
2317  case TSS_SORTEDINMEM:
2318  Assert(forward || state->sortopt & TUPLESORT_RANDOMACCESS);
2319  Assert(!state->slabAllocatorUsed);
2320  if (forward)
2321  {
2322  if (state->current < state->memtupcount)
2323  {
2324  *stup = state->memtuples[state->current++];
2325  return true;
2326  }
2327  state->eof_reached = true;
2328 
2329  /*
2330  * Complain if caller tries to retrieve more tuples than
2331  * originally asked for in a bounded sort. This is because
2332  * returning EOF here might be the wrong thing.
2333  */
2334  if (state->bounded && state->current >= state->bound)
2335  elog(ERROR, "retrieved too many tuples in a bounded sort");
2336 
2337  return false;
2338  }
2339  else
2340  {
2341  if (state->current <= 0)
2342  return false;
2343 
2344  /*
2345  * If all tuples have already been fetched, return the last
2346  * tuple; otherwise return the tuple before the last one returned.
2347  */
2348  if (state->eof_reached)
2349  state->eof_reached = false;
2350  else
2351  {
2352  state->current--; /* last returned tuple */
2353  if (state->current <= 0)
2354  return false;
2355  }
2356  *stup = state->memtuples[state->current - 1];
2357  return true;
2358  }
2359  break;
2360 
2361  case TSS_SORTEDONTAPE:
2362  Assert(forward || state->sortopt & TUPLESORT_RANDOMACCESS);
2363  Assert(state->slabAllocatorUsed);
2364 
2365  /*
2366  * The slot that held the tuple that we returned in previous
2367  * gettuple call can now be reused.
2368  */
2369  if (state->lastReturnedTuple)
2370  {
2371  RELEASE_SLAB_SLOT(state, state->lastReturnedTuple);
2372  state->lastReturnedTuple = NULL;
2373  }
2374 
2375  if (forward)
2376  {
2377  if (state->eof_reached)
2378  return false;
2379 
2380  if ((tuplen = getlen(state->result_tape, true)) != 0)
2381  {
2382  READTUP(state, stup, state->result_tape, tuplen);
2383 
2384  /*
2385  * Remember the tuple we return, so that we can recycle
2386  * its memory on next call. (This can be NULL, in the
2387  * !state->tuples case).
2388  */
2389  state->lastReturnedTuple = stup->tuple;
2390 
2391  return true;
2392  }
2393  else
2394  {
2395  state->eof_reached = true;
2396  return false;
2397  }
2398  }
2399 
2400  /*
2401  * Backward.
2402  *
2403  * If all tuples have already been fetched, return the last tuple;
2404  * otherwise return the tuple before the last one returned.
2405  */
2406  if (state->eof_reached)
2407  {
2408  /*
2409  * Seek position is pointing just past the zero tuplen at the
2410  * end of file; back up to fetch last tuple's ending length
2411  * word. If seek fails we must have a completely empty file.
2412  */
2413  nmoved = LogicalTapeBackspace(state->result_tape,
2414  2 * sizeof(unsigned int));
2415  if (nmoved == 0)
2416  return false;
2417  else if (nmoved != 2 * sizeof(unsigned int))
2418  elog(ERROR, "unexpected tape position");
2419  state->eof_reached = false;
2420  }
2421  else
2422  {
2423  /*
2424  * Back up and fetch previously-returned tuple's ending length
2425  * word. If seek fails, assume we are at start of file.
2426  */
2427  nmoved = LogicalTapeBackspace(state->result_tape,
2428  sizeof(unsigned int));
2429  if (nmoved == 0)
2430  return false;
2431  else if (nmoved != sizeof(unsigned int))
2432  elog(ERROR, "unexpected tape position");
2433  tuplen = getlen(state->result_tape, false);
2434 
2435  /*
2436  * Back up to get ending length word of tuple before it.
2437  */
2438  nmoved = LogicalTapeBackspace(state->result_tape,
2439  tuplen + 2 * sizeof(unsigned int));
2440  if (nmoved == tuplen + sizeof(unsigned int))
2441  {
2442  /*
2443  * We backed up over the previous tuple, but there was no
2444  * ending length word before it. That means that the previous
2445  * tuple is the first tuple in the file. It is now the next
2446  * one to read in the forward direction (not obviously right,
2447  * but that is what the in-memory case does).
2448  */
2449  return false;
2450  }
2451  else if (nmoved != tuplen + 2 * sizeof(unsigned int))
2452  elog(ERROR, "bogus tuple length in backward scan");
2453  }
2454 
2455  tuplen = getlen(state->result_tape, false);
2456 
2457  /*
2458  * Now we have the length of the prior tuple, back up and read it.
2459  * Note: READTUP expects we are positioned after the initial
2460  * length word of the tuple, so back up to that point.
2461  */
2462  nmoved = LogicalTapeBackspace(state->result_tape,
2463  tuplen);
2464  if (nmoved != tuplen)
2465  elog(ERROR, "bogus tuple length in backward scan");
2466  READTUP(state, stup, state->result_tape, tuplen);
2467 
2468  /*
2469  * Remember the tuple we return, so that we can recycle its memory
2470  * on next call. (This can be NULL, in the Datum case).
2471  */
2472  state->lastReturnedTuple = stup->tuple;
2473 
2474  return true;
2475 
2476  case TSS_FINALMERGE:
2477  Assert(forward);
2478  /* We are managing memory ourselves, with the slab allocator. */
2479  Assert(state->slabAllocatorUsed);
2480 
2481  /*
2482  * The slab slot holding the tuple that we returned in previous
2483  * gettuple call can now be reused.
2484  */
2485  if (state->lastReturnedTuple)
2486  {
2487  RELEASE_SLAB_SLOT(state, state->lastReturnedTuple);
2488  state->lastReturnedTuple = NULL;
2489  }
2490 
2491  /*
2492  * This code should match the inner loop of mergeonerun().
2493  */
2494  if (state->memtupcount > 0)
2495  {
2496  int srcTapeIndex = state->memtuples[0].srctape;
2497  LogicalTape *srcTape = state->inputTapes[srcTapeIndex];
2498  SortTuple newtup;
2499 
2500  *stup = state->memtuples[0];
2501 
2502  /*
2503  * Remember the tuple we return, so that we can recycle its
2504  * memory on next call. (This can be NULL, in the Datum case).
2505  */
2506  state->lastReturnedTuple = stup->tuple;
2507 
2508  /*
2509  * Pull next tuple from tape, and replace the returned tuple
2510  * at top of the heap with it.
2511  */
2512  if (!mergereadnext(state, srcTape, &newtup))
2513  {
2514  /*
2515  * If no more data, we've reached end of run on this tape.
2516  * Remove the top node from the heap.
2517  */
2518  tuplesort_heap_delete_top(state);
2519  state->nInputRuns--;
2520 
2521  /*
2522  * Close the tape. It'd go away at the end of the sort
2523  * anyway, but better to release the memory early.
2524  */
2525  LogicalTapeClose(srcTape);
2526  return true;
2527  }
2528  newtup.srctape = srcTapeIndex;
2529  tuplesort_heap_replace_top(state, &newtup);
2530  return true;
2531  }
2532  return false;
2533 
2534  default:
2535  elog(ERROR, "invalid tuplesort state");
2536  return false; /* keep compiler quiet */
2537  }
2538 }
2539 
2540 /*
2541  * Fetch the next tuple in either forward or back direction.
2542  * If successful, put tuple in slot and return true; else, clear the slot
2543  * and return false.
2544  *
2545  * Caller may optionally be passed back abbreviated value (on true return
2546  * value) when abbreviation was used, which can be used to cheaply avoid
2547  * equality checks that might otherwise be required. Caller can safely make a
2548  * determination of "non-equal tuple" based on simple binary inequality. A
2549  * NULL value in leading attribute will set abbreviated value to zeroed
2550  * representation, which caller may rely on in abbreviated inequality check.
2551  *
2552  * If copy is true, the slot receives a tuple that's been copied into the
2553  * caller's memory context, so that it will stay valid regardless of future
2554  * manipulations of the tuplesort's state (up to and including deleting the
2555  * tuplesort). If copy is false, the slot will just receive a pointer to a
2556  * tuple held within the tuplesort, which is more efficient, but only safe for
2557  * callers that are prepared to have any subsequent manipulation of the
2558  * tuplesort's state invalidate slot contents.
2559  */
2560 bool
2561 tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy,
2562  TupleTableSlot *slot, Datum *abbrev)
2563 {
2564  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2565  SortTuple stup;
2566 
2567  if (!tuplesort_gettuple_common(state, forward, &stup))
2568  stup.tuple = NULL;
2569 
2570  MemoryContextSwitchTo(oldcontext);
2571 
2572  if (stup.tuple)
2573  {
2574  /* Record abbreviated key for caller */
2575  if (state->sortKeys->abbrev_converter && abbrev)
2576  *abbrev = stup.datum1;
2577 
2578  if (copy)
2579  stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple);
2580 
2581  ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, copy);
2582  return true;
2583  }
2584  else
2585  {
2586  ExecClearTuple(slot);
2587  return false;
2588  }
2589 }
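A minimal consumer-side sketch of this entry point (hypothetical helper; the slot is assumed to have been set up by the caller with a compatible tuple descriptor):

#include "postgres.h"
#include "utils/tuplesort.h"

/*
 * Hypothetical read-out loop.  copy = false, so each tuple stored in 'slot'
 * is only valid until the next tuplesort call; no abbreviated key is needed.
 */
static void
drain_sort(Tuplesortstate *ts, TupleTableSlot *slot)
{
    while (tuplesort_gettupleslot(ts, true /* forward */, false /* copy */,
                                  slot, NULL))
    {
        /* ... process the tuple currently stored in 'slot' ... */
    }
    /* On the false return the slot has been cleared. */
}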
2590 
2591 /*
2592  * Fetch the next tuple in either forward or back direction.
2593  * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory
2594  * context, and must not be freed by caller. Caller may not rely on tuple
2595  * remaining valid after any further manipulation of tuplesort.
2596  */
2597 HeapTuple
2598 tuplesort_getheaptuple(Tuplesortstate *state, bool forward)
2599 {
2600  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2601  SortTuple stup;
2602 
2603  if (!tuplesort_gettuple_common(state, forward, &stup))
2604  stup.tuple = NULL;
2605 
2606  MemoryContextSwitchTo(oldcontext);
2607 
2608  return stup.tuple;
2609 }
2610 
2611 /*
2612  * Fetch the next index tuple in either forward or back direction.
2613  * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory
2614  * context, and must not be freed by caller. Caller may not rely on tuple
2615  * remaining valid after any further manipulation of tuplesort.
2616  */
2617 IndexTuple
2618 tuplesort_getindextuple(Tuplesortstate *state, bool forward)
2619 {
2620  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2621  SortTuple stup;
2622 
2623  if (!tuplesort_gettuple_common(state, forward, &stup))
2624  stup.tuple = NULL;
2625 
2626  MemoryContextSwitchTo(oldcontext);
2627 
2628  return (IndexTuple) stup.tuple;
2629 }
2630 
2631 /*
2632  * Fetch the next Datum in either forward or back direction.
2633  * Returns false if no more datums.
2634  *
2635  * If the Datum is pass-by-ref type, the returned value is freshly palloc'd
2636  * in caller's context, and is now owned by the caller (this differs from
2637  * similar routines for other types of tuplesorts).
2638  *
2639  * Caller may optionally be passed back abbreviated value (on true return
2640  * value) when abbreviation was used, which can be used to cheaply avoid
2641  * equality checks that might otherwise be required. Caller can safely make a
2642  * determination of "non-equal tuple" based on simple binary inequality. A
2643  * NULL value will have a zeroed abbreviated value representation, which caller
2644  * may rely on in abbreviated inequality check.
2645  */
2646 bool
2647 tuplesort_getdatum(Tuplesortstate *state, bool forward,
2648  Datum *val, bool *isNull, Datum *abbrev)
2649 {
2650  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2651  SortTuple stup;
2652 
2653  if (!tuplesort_gettuple_common(state, forward, &stup))
2654  {
2655  MemoryContextSwitchTo(oldcontext);
2656  return false;
2657  }
2658 
2659  /* Ensure we copy into caller's memory context */
2660  MemoryContextSwitchTo(oldcontext);
2661 
2662  /* Record abbreviated key for caller */
2663  if (state->sortKeys->abbrev_converter && abbrev)
2664  *abbrev = stup.datum1;
2665 
2666  if (stup.isnull1 || !state->tuples)
2667  {
2668  *val = stup.datum1;
2669  *isNull = stup.isnull1;
2670  }
2671  else
2672  {
2673  /* use stup.tuple because stup.datum1 may be an abbreviation */
2674  *val = datumCopy(PointerGetDatum(stup.tuple), false, state->datumTypeLen);
2675  *isNull = false;
2676  }
2677 
2678  return true;
2679 }
2680 
2681 /*
2682  * Advance over N tuples in either forward or back direction,
2683  * without returning any data. N==0 is a no-op.
2684  * Returns true if successful, false if ran out of tuples.
2685  */
2686 bool
2687 tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward)
2688 {
2689  MemoryContext oldcontext;
2690 
2691  /*
2692  * We don't actually support backwards skip yet, because no callers need
2693  * it. The API is designed to allow for that later, though.
2694  */
2695  Assert(forward);
2696  Assert(ntuples >= 0);
2697  Assert(!WORKER(state));
2698 
2699  switch (state->status)
2700  {
2701  case TSS_SORTEDINMEM:
2702  if (state->memtupcount - state->current >= ntuples)
2703  {
2704  state->current += ntuples;
2705  return true;
2706  }
2707  state->current = state->memtupcount;
2708  state->eof_reached = true;
2709 
2710  /*
2711  * Complain if caller tries to retrieve more tuples than
2712  * originally asked for in a bounded sort. This is because
2713  * returning EOF here might be the wrong thing.
2714  */
2715  if (state->bounded && state->current >= state->bound)
2716  elog(ERROR, "retrieved too many tuples in a bounded sort");
2717 
2718  return false;
2719 
2720  case TSS_SORTEDONTAPE:
2721  case TSS_FINALMERGE:
2722 
2723  /*
2724  * We could probably optimize these cases better, but for now it's
2725  * not worth the trouble.
2726  */
2727  oldcontext = MemoryContextSwitchTo(state->sortcontext);
2728  while (ntuples-- > 0)
2729  {
2730  SortTuple stup;
2731 
2732  if (!tuplesort_gettuple_common(state, forward, &stup))
2733  {
2734  MemoryContextSwitchTo(oldcontext);
2735  return false;
2736  }
2737  CHECK_FOR_INTERRUPTS();
2738  }
2739  MemoryContextSwitchTo(oldcontext);
2740  return true;
2741 
2742  default:
2743  elog(ERROR, "invalid tuplesort state");
2744  return false; /* keep compiler quiet */
2745  }
2746 }
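A short sketch of an OFFSET-style consumer built on this routine (hypothetical helper; 'offset' is a caller-supplied count):

#include "postgres.h"
#include "utils/tuplesort.h"

/* Skip the first 'offset' result tuples, then read out the remainder. */
static void
drain_after_offset(Tuplesortstate *ts, TupleTableSlot *slot, int64 offset)
{
    if (!tuplesort_skiptuples(ts, offset, true))
        return;                 /* fewer than 'offset' tuples in the result */

    while (tuplesort_gettupleslot(ts, true, false, slot, NULL))
    {
        /* ... emit the tuple ... */
    }
}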
2747 
2748 /*
2749  * tuplesort_merge_order - report merge order we'll use for given memory
2750  * (note: "merge order" just means the number of input tapes in the merge).
2751  *
2752  * This is exported for use by the planner. allowedMem is in bytes.
2753  */
2754 int
2755 tuplesort_merge_order(int64 allowedMem)
2756 {
2757  int mOrder;
2758 
2759  /*----------
2760  * In the merge phase, we need buffer space for each input and output tape.
2761  * Each pass in the balanced merge algorithm reads from M input tapes, and
2762  * writes to N output tapes. Each tape consumes TAPE_BUFFER_OVERHEAD bytes
2763  * of memory. In addition to that, we want MERGE_BUFFER_SIZE workspace per
2764  * input tape.
2765  *
2766  * totalMem = M * (TAPE_BUFFER_OVERHEAD + MERGE_BUFFER_SIZE) +
2767  * N * TAPE_BUFFER_OVERHEAD
2768  *
2769  * Except for the last and next-to-last merge passes, where there can be
2770  * fewer tapes left to process, M = N. We choose M so that we have the
2771  * desired amount of memory available for the input buffers
2772  * (TAPE_BUFFER_OVERHEAD + MERGE_BUFFER_SIZE), given the total memory
2773  * available for the tape buffers (allowedMem).
2774  *
2775  * Note: you might be thinking we need to account for the memtuples[]
2776  * array in this calculation, but we effectively treat that as part of the
2777  * MERGE_BUFFER_SIZE workspace.
2778  *----------
2779  */
2780  mOrder = allowedMem /
2781  (2 * TAPE_BUFFER_OVERHEAD + MERGE_BUFFER_SIZE);
2782 
2783  /*
2784  * Even in minimum memory, use at least a MINORDER merge. On the other
2785  * hand, even when we have lots of memory, do not use more than a MAXORDER
2786  * merge. Tapes are pretty cheap, but they're not entirely free. Each
2787  * additional tape reduces the amount of memory available to build runs,
2788  * which in turn can cause the same sort to need more runs, which makes
2789  * merging slower even if it can still be done in a single pass. Also,
2790  * high order merges are quite slow due to CPU cache effects; it can be
2791  * faster to pay the I/O cost of a multi-pass merge than to perform a
2792  * single merge pass across many hundreds of tapes.
2793  */
2794  mOrder = Max(mOrder, MINORDER);
2795  mOrder = Min(mOrder, MAXORDER);
2796 
2797  return mOrder;
2798 }
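A worked example of the formula (standalone and illustrative only; it assumes TAPE_BUFFER_OVERHEAD = BLCKSZ = 8192 bytes and MERGE_BUFFER_SIZE = 32 * BLCKSZ per the definitions earlier in this file, and MINORDER = 6 / MAXORDER = 500):

#include <stdio.h>

/* Illustrative only: evaluates the tuplesort_merge_order() formula. */
int
main(void)
{
    const long long tape_buffer_overhead = 8192;        /* assumed: BLCKSZ */
    const long long merge_buffer_size = 32 * 8192;      /* assumed: 32 * BLCKSZ */
    long long   allowedMem = 64LL * 1024 * 1024;        /* 64 MB */
    long long   mOrder;

    mOrder = allowedMem / (2 * tape_buffer_overhead + merge_buffer_size);
    if (mOrder < 6)             /* MINORDER */
        mOrder = 6;
    if (mOrder > 500)           /* MAXORDER */
        mOrder = 500;

    printf("merge order for 64 MB: %lld\n", mOrder);    /* prints 240 */
    return 0;
}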
2799 
2800 /*
2801  * Helper function to calculate how much memory to allocate for the read buffer
2802  * of each input tape in a merge pass.
2803  *
2804  * 'avail_mem' is the amount of memory available for the buffers of all the
2805  * tapes, both input and output.
2806  * 'nInputTapes' and 'nInputRuns' are the number of input tapes and runs.
2807  * 'maxOutputTapes' is the max. number of output tapes we should produce.
2808  */
2809 static int64
2810 merge_read_buffer_size(int64 avail_mem, int nInputTapes, int nInputRuns,
2811  int maxOutputTapes)
2812 {
2813  int nOutputRuns;
2814  int nOutputTapes;
2815 
2816  /*
2817  * How many output tapes will we produce in this pass?
2818  *
2819  * This is nInputRuns / nInputTapes, rounded up.
2820  */
2821  nOutputRuns = (nInputRuns + nInputTapes - 1) / nInputTapes;
2822 
2823  nOutputTapes = Min(nOutputRuns, maxOutputTapes);
2824 
2825  /*
2826  * Each output tape consumes TAPE_BUFFER_OVERHEAD bytes of memory. All
2827  * remaining memory is divided evenly between the input tapes.
2828  *
2829  * This also follows from the formula in tuplesort_merge_order, but here
2830  * we derive the input buffer size from the amount of memory available,
2831  * and M and N.
2832  */
2833  return Max((avail_mem - TAPE_BUFFER_OVERHEAD * nOutputTapes) / nInputTapes, 0);
2834 }
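Worked example (illustrative numbers, again assuming TAPE_BUFFER_OVERHEAD = BLCKSZ = 8192 bytes): with avail_mem = 32 MB, nInputTapes = 8, nInputRuns = 20 and maxOutputTapes = 8, the pass produces nOutputRuns = ceil(20 / 8) = 3 output runs on nOutputTapes = 3 tapes, so each input tape gets (33554432 - 3 * 8192) / 8, roughly 4.0 MB, of read buffer.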
2835 
2836 /*
2837  * inittapes - initialize for tape sorting.
2838  *
2839  * This is called only if we have found we won't sort in memory.
2840  */
2841 static void
2842 inittapes(Tuplesortstate *state, bool mergeruns)
2843 {
2844  Assert(!LEADER(state));
2845 
2846  if (mergeruns)
2847  {
2848  /* Compute number of input tapes to use when merging */
2849  state->maxTapes = tuplesort_merge_order(state->allowedMem);
2850  }
2851  else
2852  {
2853  /* Workers can sometimes produce a single run that needs no merging */
2854  Assert(WORKER(state));
2855  state->maxTapes = MINORDER;
2856  }
2857 
2858 #ifdef TRACE_SORT
2859  if (trace_sort)
2860  elog(LOG, "worker %d switching to external sort with %d tapes: %s",
2861  state->worker, state->maxTapes, pg_rusage_show(&state->ru_start));
2862 #endif
2863 
2864  /* Create the tape set */
2865  inittapestate(state, state->maxTapes);
2866  state->tapeset =
2867  LogicalTapeSetCreate(false,
2868  state->shared ? &state->shared->fileset : NULL,
2869  state->worker);
2870 
2871  state->currentRun = 0;
2872 
2873  /*
2874  * Initialize logical tape arrays.
2875  */
2876  state->inputTapes = NULL;
2877  state->nInputTapes = 0;
2878  state->nInputRuns = 0;
2879 
2880  state->outputTapes = palloc0(state->maxTapes * sizeof(LogicalTape *));
2881  state->nOutputTapes = 0;
2882  state->nOutputRuns = 0;
2883 
2884  state->status = TSS_BUILDRUNS;
2885 
2886  selectnewtape(state);
2887 }
2888 
2889 /*
2890  * inittapestate - initialize generic tape management state
2891  */
2892 static void
2893 inittapestate(Tuplesortstate *state, int maxTapes)
2894 {
2895  int64 tapeSpace;
2896 
2897  /*
2898  * Decrease availMem to reflect the space needed for tape buffers; but
2899  * don't decrease it to the point that we have no room for tuples. (That
2900  * case is only likely to occur if sorting pass-by-value Datums; in all
2901  * other scenarios the memtuples[] array is unlikely to occupy more than
2902  * half of allowedMem. In the pass-by-value case it's not important to
2903  * account for tuple space, so we don't care if LACKMEM becomes
2904  * inaccurate.)
2905  */
2906  tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD;
2907 
2908  if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem)
2909  USEMEM(state, tapeSpace);
2910 
2911  /*
2912  * Make sure that the temp file(s) underlying the tape set are created in
2913  * suitable temp tablespaces. For parallel sorts, this should have been
2914  * called already, but it doesn't matter if it is called a second time.
2915  */
2916  PrepareTempTablespaces();
2917 }
2918 
2919 /*
2920  * selectnewtape -- select next tape to output to.
2921  *
2922  * This is called after finishing a run when we know another run
2923  * must be started. This is used both when building the initial
2924  * runs, and during merge passes.
2925  */
2926 static void
2927 selectnewtape(Tuplesortstate *state)
2928 {
2929  /*
2930  * At the beginning of each merge pass, nOutputTapes and nOutputRuns are
2931  * both zero. On each call, we create a new output tape to hold the next
2932  * run, until maxTapes is reached. After that, we assign new runs to the
2933  * existing tapes in a round robin fashion.
2934  */
2935  if (state->nOutputTapes < state->maxTapes)
2936  {
2937  /* Create a new tape to hold the next run */
2938  Assert(state->outputTapes[state->nOutputRuns] == NULL);
2939  Assert(state->nOutputRuns == state->nOutputTapes);
2940  state->destTape = LogicalTapeCreate(state->tapeset);
2941  state->outputTapes[state->nOutputTapes] = state->destTape;
2942  state->nOutputTapes++;
2943  state->nOutputRuns++;
2944  }
2945  else
2946  {
2947  /*
2948  * We have reached the max number of tapes. Append to an existing
2949  * tape.
2950  */
2951  state->destTape = state->outputTapes[state->nOutputRuns % state->nOutputTapes];
2952  state->nOutputRuns++;
2953  }
2954 }
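For example (illustrative numbers): with maxTapes = 6, runs 1 through 6 each get a freshly created output tape; when run 7 starts, nOutputRuns is 6 and nOutputTapes is 6, so 6 % 6 = 0 selects tape 0 again, run 8 is appended to tape 1, and so on round robin.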
2955 
2956 /*
2957  * Initialize the slab allocation arena, for the given number of slots.
2958  */
2959 static void
2960 init_slab_allocator(Tuplesortstate *state, int numSlots)
2961 {
2962  if (numSlots > 0)
2963  {
2964  char *p;
2965  int i;
2966 
2967  state->slabMemoryBegin = palloc(numSlots * SLAB_SLOT_SIZE);
2968  state->slabMemoryEnd = state->slabMemoryBegin +
2969  numSlots * SLAB_SLOT_SIZE;
2970  state->slabFreeHead = (SlabSlot *) state->slabMemoryBegin;
2971  USEMEM(state, numSlots * SLAB_SLOT_SIZE);
2972 
2973  p = state->slabMemoryBegin;
2974  for (i = 0; i < numSlots - 1; i++)
2975  {
2976  ((SlabSlot *) p)->nextfree = (SlabSlot *) (p + SLAB_SLOT_SIZE);
2977  p += SLAB_SLOT_SIZE;
2978  }
2979  ((SlabSlot *) p)->nextfree = NULL;
2980  }
2981  else
2982  {
2983  state->slabMemoryBegin = state->slabMemoryEnd = NULL;
2984  state->slabFreeHead = NULL;
2985  }
2986  state->slabAllocatorUsed = true;
2987 }
2988 
2989 /*
2990  * mergeruns -- merge all the completed initial runs.
2991  *
2992  * This implements the Balanced k-Way Merge Algorithm. All input data has
2993  * already been written to initial runs on tape (see dumptuples).
2994  */
2995 static void
2996 mergeruns(Tuplesortstate *state)
2997 {
2998  int tapenum;
2999 
3000  Assert(state->status == TSS_BUILDRUNS);
3001  Assert(state->memtupcount == 0);
3002 
3003  if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL)
3004  {
3005  /*
3006  * If there are multiple runs to be merged, when we go to read back
3007  * tuples from disk, abbreviated keys will not have been stored, and
3008  * we don't care to regenerate them. Disable abbreviation from this
3009  * point on.
3010  */
3011  state->sortKeys->abbrev_converter = NULL;
3012  state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
3013 
3014  /* Not strictly necessary, but be tidy */
3015  state->sortKeys->abbrev_abort = NULL;
3016  state->sortKeys->abbrev_full_comparator = NULL;
3017  }
3018 
3019  /*
3020  * Reset tuple memory. We've freed all the tuples that we previously
3021  * allocated. We will use the slab allocator from now on.
3022  */
3023  MemoryContextResetOnly(state->tuplecontext);
3024 
3025  /*
3026  * We no longer need a large memtuples array. (We will allocate a smaller
3027  * one for the heap later.)
3028  */
3029  FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
3030  pfree(state->memtuples);
3031  state->memtuples = NULL;
3032 
3033  /*
3034  * Initialize the slab allocator. We need one slab slot per input tape,
3035  * for the tuples in the heap, plus one to hold the tuple last returned
3036  * from tuplesort_gettuple. (If we're sorting pass-by-val Datums,
3037  * however, we don't need to allocate anything.)
3038  *
3039  * In a multi-pass merge, we could shrink this allocation for the last
3040  * merge pass, if it has fewer tapes than previous passes, but we don't
3041  * bother.
3042  *
3043  * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism
3044  * to track memory usage of individual tuples.
3045  */
3046  if (state->tuples)
3047  init_slab_allocator(state, state->nOutputTapes + 1);
3048  else
3049  init_slab_allocator(state, 0);
3050 
3051  /*
3052  * Allocate a new 'memtuples' array, for the heap. It will hold one tuple
3053  * from each input tape.
3054  *
3055  * We could shrink this, too, between passes in a multi-pass merge, but we
3056  * don't bother. (The initial input tapes are still in outputTapes. The
3057  * number of input tapes will not increase between passes.)
3058  */
3059  state->memtupsize = state->nOutputTapes;
3060  state->memtuples = (SortTuple *) MemoryContextAlloc(state->maincontext,
3061  state->nOutputTapes * sizeof(SortTuple));
3062  USEMEM(state, GetMemoryChunkSpace(state->memtuples));
3063 
3064  /*
3065  * Use all the remaining memory we have available for tape buffers among
3066  * all the input tapes. At the beginning of each merge pass, we will
3067  * divide this memory between the input and output tapes in the pass.
3068  */
3069  state->tape_buffer_mem = state->availMem;
3070  USEMEM(state, state->tape_buffer_mem);
3071 #ifdef TRACE_SORT
3072  if (trace_sort)
3073  elog(LOG, "worker %d using %zu KB of memory for tape buffers",
3074  state->worker, state->tape_buffer_mem / 1024);
3075 #endif
3076 
3077  for (;;)
3078  {
3079  /*
3080  * On the first iteration, or if we have read all the runs from the
3081  * input tapes in a multi-pass merge, it's time to start a new pass.
3082  * Rewind all the output tapes, and make them inputs for the next
3083  * pass.
3084  */
3085  if (state->nInputRuns == 0)
3086  {
3087  int64 input_buffer_size;
3088 
3089  /* Close the old, emptied, input tapes */
3090  if (state->nInputTapes > 0)
3091  {
3092  for (tapenum = 0; tapenum < state->nInputTapes; tapenum++)
3093  LogicalTapeClose(state->inputTapes[tapenum]);
3094  pfree(state->inputTapes);
3095  }
3096 
3097  /* Previous pass's outputs become next pass's inputs. */
3098  state->inputTapes = state->outputTapes;
3099  state->nInputTapes = state->nOutputTapes;
3100  state->nInputRuns = state->nOutputRuns;
3101 
3102  /*
3103  * Reset output tape variables. The actual LogicalTapes will be
3104  * created as needed, here we only allocate the array to hold
3105  * them.
3106  */
3107  state->outputTapes = palloc0(state->nInputTapes * sizeof(LogicalTape *));
3108  state->nOutputTapes = 0;
3109  state->nOutputRuns = 0;
3110 
3111  /*
3112  * Redistribute the memory allocated for tape buffers, among the
3113  * new input and output tapes.
3114  */
3115  input_buffer_size = merge_read_buffer_size(state->tape_buffer_mem,
3116  state->nInputTapes,
3117  state->nInputRuns,
3118  state->maxTapes);
3119 
3120 #ifdef TRACE_SORT
3121  if (trace_sort)
3122  elog(LOG, "starting merge pass of %d input runs on %d tapes, " INT64_FORMAT " KB of memory for each input tape: %s",
3123  state->nInputRuns, state->nInputTapes, input_buffer_size / 1024,
3124  pg_rusage_show(&state->ru_start));
3125 #endif
3126 
3127  /* Prepare the new input tapes for merge pass. */
3128  for (tapenum = 0; tapenum < state->nInputTapes; tapenum++)
3129  LogicalTapeRewindForRead(state->inputTapes[tapenum], input_buffer_size);
3130 
3131  /*
3132  * If there's just one run left on each input tape, then only one
3133  * merge pass remains. If we don't have to produce a materialized
3134  * sorted tape, we can stop at this point and do the final merge
3135  * on-the-fly.
3136  */
3137  if ((state->sortopt & TUPLESORT_RANDOMACCESS) == 0
3138  && state->nInputRuns <= state->nInputTapes
3139  && !WORKER(state))
3140  {
3141  /* Tell logtape.c we won't be writing anymore */
3142  LogicalTapeSetForgetFreeSpace(state->tapeset);
3143  /* Initialize for the final merge pass */
3144  beginmerge(state);
3145  state->status = TSS_FINALMERGE;
3146  return;
3147  }
3148  }
3149 
3150  /* Select an output tape */
3151  selectnewtape(state);
3152 
3153  /* Merge one run from each input tape. */
3154  mergeonerun(state);
3155 
3156  /*
3157  * If the input tapes are empty, and we output only one output run,
3158  * we're done. The current output tape contains the final result.
3159  */
3160  if (state->nInputRuns == 0 && state->nOutputRuns <= 1)
3161  break;
3162  }
3163 
3164  /*
3165  * Done. The result is on a single run on a single tape.
3166  */
3167  state->result_tape = state->outputTapes[0];
3168  if (!WORKER(state))
3169  LogicalTapeFreeze(state->result_tape, NULL);
3170  else
3171  worker_freeze_result_tape(state);
3172  state->status = TSS_SORTEDONTAPE;
3173 
3174  /* Close all the now-empty input tapes, to release their read buffers. */
3175  for (tapenum = 0; tapenum < state->nInputTapes; tapenum++)
3176  LogicalTapeClose(state->inputTapes[tapenum]);
3177 }
3178 
3179 /*
3180  * Merge one run from each input tape.
3181  */
3182 static void
3183 mergeonerun(Tuplesortstate *state)
3184 {
3185  int srcTapeIndex;
3186  LogicalTape *srcTape;
3187 
3188  /*
3189  * Start the merge by loading one tuple from each active source tape into
3190  * the heap.
3191  */
3192  beginmerge(state);
3193 
3194  /*
3195  * Execute merge by repeatedly extracting lowest tuple in heap, writing it
3196  * out, and replacing it with next tuple from same tape (if there is
3197  * another one).
3198  */
3199  while (state->memtupcount > 0)
3200  {
3201  SortTuple stup;
3202 
3203  /* write the tuple to destTape */
3204  srcTapeIndex = state->memtuples[0].srctape;
3205  srcTape = state->inputTapes[srcTapeIndex];
3206  WRITETUP(state, state->destTape, &state->memtuples[0]);
3207 
3208  /* recycle the slot of the tuple we just wrote out, for the next read */
3209  if (state->memtuples[0].tuple)
3210  RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple);
3211 
3212  /*
3213  * pull next tuple from the tape, and replace the written-out tuple in
3214  * the heap with it.
3215  */
3216  if (mergereadnext(state, srcTape, &stup))
3217  {
3218  stup.srctape = srcTapeIndex;
3219  tuplesort_heap_replace_top(state, &stup);
3220  }
3221  else
3222  {
3223  tuplesort_heap_delete_top(state);
3224  state->nInputRuns--;
3225  }
3226  }
3227 
3228  /*
3229  * When the heap empties, we're done. Write an end-of-run marker on the
3230  * output tape.
3231  */
3232  markrunend(state->destTape);
3233 }
3234 
3235 /*
3236  * beginmerge - initialize for a merge pass
3237  *
3238  * Fill the merge heap with the first tuple from each input tape.
3239  */
3240 static void
3241 beginmerge(Tuplesortstate *state)
3242 {
3243  int activeTapes;
3244  int srcTapeIndex;
3245 
3246  /* Heap should be empty here */
3247  Assert(state->memtupcount == 0);
3248 
3249  activeTapes = Min(state->nInputTapes, state->nInputRuns);
3250 
3251  for (srcTapeIndex = 0; srcTapeIndex < activeTapes; srcTapeIndex++)
3252  {
3253  SortTuple tup;
3254 
3255  if (mergereadnext(state, state->inputTapes[srcTapeIndex], &tup))
3256  {
3257  tup.srctape = srcTapeIndex;
3258  tuplesort_heap_insert(state, &tup);
3259  }
3260  }
3261 }
3262 
3263 /*
3264  * mergereadnext - read next tuple from one merge input tape
3265  *
3266  * Returns false on EOF.
3267  */
3268 static bool
3269 mergereadnext(Tuplesortstate *state, LogicalTape *srcTape, SortTuple *stup)
3270 {
3271  unsigned int tuplen;
3272 
3273  /* read next tuple, if any */
3274  if ((tuplen = getlen(srcTape, true)) == 0)
3275  return false;
3276  READTUP(state, stup, srcTape, tuplen);
3277 
3278  return true;
3279 }
3280 
3281 /*
3282  * dumptuples - remove tuples from memtuples and write initial run to tape
3283  *
3284  * When alltuples = true, dump everything currently in memory. (This case is
3285  * only used at end of input data.)
3286  */
3287 static void
3288 dumptuples(Tuplesortstate *state, bool alltuples)
3289 {
3290  int memtupwrite;
3291  int i;
3292 
3293  /*
3294  * Nothing to do if we still fit in available memory and have array slots,
3295  * unless this is the final call during initial run generation.
3296  */
3297  if (state->memtupcount < state->memtupsize && !LACKMEM(state) &&
3298  !alltuples)
3299  return;
3300 
3301  /*
3302  * Final call might require no sorting, in rare cases where we just so
3303  * happen to have previously LACKMEM()'d at the point where exactly all
3304  * remaining tuples are loaded into memory, just before input was
3305  * exhausted. In general, short final runs are quite possible, but avoid
3306  * creating a completely empty run. In a worker, though, we must produce
3307  * at least one tape, even if it's empty.
3308  */
3309  if (state->memtupcount == 0 && state->currentRun > 0)
3310  return;
3311 
3312  Assert(state->status == TSS_BUILDRUNS);
3313 
3314  /*
3315  * It seems unlikely that this limit will ever be exceeded, but take no
3316  * chances
3317  */
3318  if (state->currentRun == INT_MAX)
3319  ereport(ERROR,
3320  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
3321  errmsg("cannot have more than %d runs for an external sort",
3322  INT_MAX)));
3323 
3324  if (state->currentRun > 0)
3325  selectnewtape(state);
3326 
3327  state->currentRun++;
3328 
3329 #ifdef TRACE_SORT
3330  if (trace_sort)
3331  elog(LOG, "worker %d starting quicksort of run %d: %s",
3332  state->worker, state->currentRun,
3333  pg_rusage_show(&state->ru_start));
3334 #endif
3335 
3336  /*
3337  * Sort all tuples accumulated within the allowed amount of memory for
3338  * this run using quicksort
3339  */
3340  tuplesort_sort_memtuples(state);
3341 
3342 #ifdef TRACE_SORT
3343  if (trace_sort)
3344  elog(LOG, "worker %d finished quicksort of run %d: %s",
3345  state->worker, state->currentRun,
3346  pg_rusage_show(&state->ru_start));
3347 #endif
3348 
3349  memtupwrite = state->memtupcount;
3350  for (i = 0; i < memtupwrite; i++)
3351  {
3352  WRITETUP(state, state->destTape, &state->memtuples[i]);
3353  state->memtupcount--;
3354  }
3355 
3356  /*
3357  * Reset tuple memory. We've freed all of the tuples that we previously
3358  * allocated. It's important to avoid fragmentation when there is a stark
3359  * change in the sizes of incoming tuples. Fragmentation due to
3360  * AllocSetFree's bucketing by size class might be particularly bad if
3361  * this step wasn't taken.
3362  */
3363  MemoryContextReset(state->tuplecontext);
3364 
3365  markrunend(state->destTape);
3366 
3367 #ifdef TRACE_SORT
3368  if (trace_sort)
3369  elog(LOG, "worker %d finished writing run %d to tape %d: %s",
3370  state->worker, state->currentRun, (state->currentRun - 1) % state->nOutputTapes + 1,
3371  pg_rusage_show(&state->ru_start));
3372 #endif
3373 }
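/*
 * dumptuples() in brief: once memtuples is full (or input is exhausted),
 * the array is quicksorted with tuplesort_sort_memtuples(), every entry is
 * written to state->destTape with WRITETUP(), the per-tuple memory context
 * is reset in one shot rather than pfree'ing tuples individually, and the
 * run is terminated with markrunend().  For every run after the first,
 * selectnewtape() chooses the output tape beforehand.
 */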
3374 
3375 /*
3376  * tuplesort_rescan - rewind and replay the scan
3377  */
3378 void
3379 tuplesort_rescan(Tuplesortstate *state)
3380 {
3381  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
3382 
3383  Assert(state->sortopt & TUPLESORT_RANDOMACCESS);
3384 
3385  switch (state->status)
3386  {
3387  case TSS_SORTEDINMEM:
3388  state->current = 0;
3389  state->eof_reached = false;
3390  state->markpos_offset = 0;
3391  state->markpos_eof = false;
3392  break;
3393  case TSS_SORTEDONTAPE:
3394  LogicalTapeRewindForRead(state->result_tape, 0);
3395  state->eof_reached = false;
3396  state->markpos_block = 0L;
3397  state->markpos_offset = 0;
3398  state->markpos_eof = false;
3399  break;
3400  default:
3401  elog(ERROR, "invalid tuplesort state");
3402  break;
3403  }
3404 
3405  MemoryContextSwitchTo(oldcontext);
3406 }
3407 
3408 /*
3409  * tuplesort_markpos - saves current position in the merged sort file
3410  */
3411 void
3412 tuplesort_markpos(Tuplesortstate *state)
3413 {
3414  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
3415 
3416  Assert(state->sortopt & TUPLESORT_RANDOMACCESS);
3417 
3418  switch (state->status)
3419  {
3420  case TSS_SORTEDINMEM:
3421  state->markpos_offset = state->current;
3422  state->markpos_eof = state->eof_reached;
3423  break;
3424  case TSS_SORTEDONTAPE:
3425  LogicalTapeTell(state->result_tape,
3426  &state->markpos_block,
3427  &state->markpos_offset);
3428  state->markpos_eof = state->eof_reached;
3429  break;
3430  default:
3431  elog(ERROR, "invalid tuplesort state");
3432  break;
3433  }
3434 
3435  MemoryContextSwitchTo(oldcontext);
3436 }
3437 
3438 /*
3439  * tuplesort_restorepos - restores current position in merged sort file to
3440  * last saved position
3441  */
3442 void
3443 tuplesort_restorepos(Tuplesortstate *state)
3444 {
3445  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
3446 
3447  Assert(state->sortopt & TUPLESORT_RANDOMACCESS);
3448 
3449  switch (state->status)
3450  {
3451  case TSS_SORTEDINMEM:
3452  state->current = state->markpos_offset;
3453  state->eof_reached = state->markpos_eof;
3454  break;
3455  case TSS_SORTEDONTAPE:
3456  LogicalTapeSeek(state->result_tape,
3457  state->markpos_block,
3458  state->markpos_offset);
3459  state->eof_reached = state->markpos_eof;
3460  break;
3461  default:
3462  elog(ERROR, "invalid tuplesort state");
3463  break;
3464  }
3465 
3466  MemoryContextSwitchTo(oldcontext);
3467 }
3468 
3469 /*
3470  * tuplesort_get_stats - extract summary statistics
3471  *
3472  * This can be called after tuplesort_performsort() finishes to obtain
3473  * printable summary information about how the sort was performed.
3474  */
3475 void
3476 tuplesort_get_stats(Tuplesortstate *state,
3477  TuplesortInstrumentation *stats)
3478 {
3479  /*
3480  * Note: it might seem we should provide both memory and disk usage for a
3481  * disk-based sort. However, the current code doesn't track memory space
3482  * accurately once we have begun to return tuples to the caller (since we
3483  * don't account for pfree's the caller is expected to do), so we cannot
3484  * rely on availMem in a disk sort. This does not seem worth the overhead
3485  * to fix. Is it worth creating an API for the memory context code to
3486  * tell us how much is actually used in sortcontext?
3487  */
3488  tuplesort_updatemax(state);
3489 
3490  if (state->isMaxSpaceDisk)
3491  stats->spaceType = SORT_SPACE_TYPE_DISK;
3492  else
3493  stats->spaceType = SORT_SPACE_TYPE_MEMORY;
3494  stats->spaceUsed = (state->maxSpace + 1023) / 1024;
3495 
3496  switch (state->maxSpaceStatus)
3497  {
3498  case TSS_SORTEDINMEM:
3499  if (state->boundUsed)
3500  stats->sortMethod = SORT_TYPE_TOP_N_HEAPSORT;
3501  else
3502  stats->sortMethod = SORT_TYPE_QUICKSORT;
3503  break;
3504  case TSS_SORTEDONTAPE:
3505  stats->sortMethod = SORT_TYPE_EXTERNAL_SORT;
3506  break;
3507  case TSS_FINALMERGE:
3508  stats->sortMethod = SORT_TYPE_EXTERNAL_MERGE;
3509  break;
3510  default:
3511  stats->sortMethod = SORT_TYPE_STILL_IN_PROGRESS;
3512  break;
3513  }
3514 }
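/*
 * Example of where this ends up: EXPLAIN (ANALYZE) reports these fields for
 * a sort node as e.g. "Sort Method: quicksort  Memory: 25kB" or
 * "Sort Method: external merge  Disk: 102400kB" -- sortMethod supplies the
 * method string (see tuplesort_method_name() below) and spaceType/spaceUsed
 * supply the Memory/Disk label and the kilobyte figure.
 */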
3515 
3516 /*
3517  * Convert TuplesortMethod to a string.
3518  */
3519 const char *
3520 tuplesort_method_name(TuplesortMethod m)
3521 {
3522  switch (m)
3523  {
3524  case SORT_TYPE_STILL_IN_PROGRESS:
3525  return "still in progress";
3526  case SORT_TYPE_TOP_N_HEAPSORT:
3527  return "top-N heapsort";
3528  case SORT_TYPE_QUICKSORT:
3529  return "quicksort";
3530  case SORT_TYPE_EXTERNAL_SORT:
3531  return "external sort";
3532  case SORT_TYPE_EXTERNAL_MERGE:
3533  return "external merge";
3534  }
3535 
3536  return "unknown";
3537 }
3538 
3539 /*
3540  * Convert TuplesortSpaceType to a string.
3541  */
3542 const char *
3543 tuplesort_space_type_name(TuplesortSpaceType t)
3544 {
3545  AssertArg(t == SORT_SPACE_TYPE_DISK || t == SORT_SPACE_TYPE_MEMORY);
3546  return t == SORT_SPACE_TYPE_DISK ? "Disk" : "Memory";
3547 }
3548 
3549 
3550 /*
3551  * Heap manipulation routines, per Knuth's Algorithm 5.2.3H.
3552  */
3553 
3554 /*
3555  * Convert the existing unordered array of SortTuples to a bounded heap,
3556  * discarding all but the smallest "state->bound" tuples.
3557  *
3558  * When working with a bounded heap, we want to keep the largest entry
3559  * at the root (array entry zero), instead of the smallest as in the normal
3560  * sort case. This allows us to discard the largest entry cheaply.
3561  * Therefore, we temporarily reverse the sort direction.
3562  */
3563 static void
3564 make_bounded_heap(Tuplesortstate *state)
3565 {
3566  int tupcount = state->memtupcount;
3567  int i;
3568 
3569  Assert(state->status == TSS_INITIAL);
3570  Assert(state->bounded);
3571  Assert(tupcount >= state->bound);
3572  Assert(SERIAL(state));
3573 
3574  /* Reverse sort direction so largest entry will be at root */
3575  reversedirection(state);
3576 
3577  state->memtupcount = 0; /* make the heap empty */
3578  for (i = 0; i < tupcount; i++)
3579  {
3580  if (state->memtupcount < state->bound)
3581  {
3582  /* Insert next tuple into heap */
3583  /* Must copy source tuple to avoid possible overwrite */
3584  SortTuple stup = state->memtuples[i];
3585 
3586  tuplesort_heap_insert(state, &stup);
3587  }
3588  else
3589  {
3590  /*
3591  * The heap is full. Replace the largest entry with the new
3592  * tuple, or just discard it, if it's larger than anything already
3593  * in the heap.
3594  */
3595  if (COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0)
3596  {
3597  free_sort_tuple(state, &state->memtuples[i]);
3598  CHECK_FOR_INTERRUPTS();
3599  }
3600  else
3601  tuplesort_heap_replace_top(state, &state->memtuples[i]);
3602  }
3603  }
3604 
3605  Assert(state->memtupcount == state->bound);
3606  state->status = TSS_BOUNDED;
3607 }
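/*
 * Worked example (ascending sort, bound = 3, input 5 2 8 1 9): 5, 2 and 8
 * are heapified with 8 at the root, since the reversed direction keeps the
 * largest retained value on top.  1 is smaller than the root, so it replaces
 * 8, leaving {5, 2, 1} with 5 at the root; 9 is not smaller than 5, so it is
 * discarded.  The heap ends up holding the three smallest values, as a
 * top-3 bound requires.
 */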
3608 
3609 /*
3610  * Convert the bounded heap to a properly-sorted array
3611  */
3612 static void
3613 sort_bounded_heap(Tuplesortstate *state)
3614 {
3615  int tupcount = state->memtupcount;
3616 
3617  Assert(state->status == TSS_BOUNDED);
3618  Assert(state->bounded);
3619  Assert(tupcount == state->bound);
3620  Assert(SERIAL(state));
3621 
3622  /*
3623  * We can unheapify in place because each delete-top call will remove the
3624  * largest entry, which we can promptly store in the newly freed slot at
3625  * the end. Once we're down to a single-entry heap, we're done.
3626  */
3627  while (state->memtupcount > 1)
3628  {
3629  SortTuple stup = state->memtuples[0];
3630 
3631  /* this sifts-up the next-largest entry and decreases memtupcount */
3632  tuplesort_heap_delete_top(state);
3633  state->memtuples[state->memtupcount] = stup;
3634  }
3635  state->memtupcount = tupcount;
3636 
3637  /*
3638  * Reverse sort direction back to the original state. This is not
3639  * actually necessary but seems like a good idea for tidiness.
3640  */
3641  reversedirection(state);
3642 
3643  state->status = TSS_SORTEDINMEM;
3644  state->boundUsed = true;
3645 }
3646 
3647 /*
3648  * Sort all memtuples using specialized qsort() routines.
3649  *
3650  * Quicksort is used for small in-memory sorts, and external sort runs.
3651  */
3652 static void
3653 tuplesort_sort_memtuples(Tuplesortstate *state)
3654 {
3655  Assert(!LEADER(state));
3656 
3657  if (state->memtupcount > 1)
3658  {
3659  /*
3660  * Do we have the leading column's value or abbreviation in datum1,
3661  * and is there a specialization for its comparator?
3662  */
3663  if (state->haveDatum1 && state->sortKeys)
3664  {
3665  if (state->sortKeys[0].comparator == ssup_datum_unsigned_cmp)
3666  {
3667  qsort_tuple_unsigned(state->memtuples,
3668  state->memtupcount,
3669  state);
3670  return;
3671  }
3672 #if SIZEOF_DATUM >= 8
3673  else if (state->sortKeys[0].comparator == ssup_datum_signed_cmp)
3674  {
3675  qsort_tuple_signed(state->memtuples,
3676  state->memtupcount,
3677  state);
3678  return;
3679  }
3680 #endif
3681  else if (state->sortKeys[0].comparator == ssup_datum_int32_cmp)
3682  {
3683  qsort_tuple_int32(state->memtuples,
3684  state->memtupcount,
3685  state);
3686  return;
3687  }
3688  }
3689 
3690  /* Can we use the single-key sort function? */
3691  if (state->onlyKey != NULL)
3692  {
3693  qsort_ssup(state->memtuples, state->memtupcount,
3694  state->onlyKey);
3695  }
3696  else
3697  {
3698  qsort_tuple(state->memtuples,
3699  state->memtupcount,
3700  state->comparetup,
3701  state);
3702  }
3703  }
3704 }
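/*
 * For instance, a sort whose leading key's sort-support comparator is
 * ssup_datum_int32_cmp (typical for built-in 32-bit integer keys) takes the
 * qsort_tuple_int32() specialization above, while a multi-key sort with no
 * specialized leading comparator falls through to the generic qsort_tuple()
 * call driven by state->comparetup.
 */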
3705 
3706 /*
3707  * Insert a new tuple into an empty or existing heap, maintaining the
3708  * heap invariant. Caller is responsible for ensuring there's room.
3709  *
3710  * Note: For some callers, tuple points to a memtuples[] entry above the
3711  * end of the heap. This is safe as long as it's not immediately adjacent
3712  * to the end of the heap (ie, in the [memtupcount] array entry) --- if it
3713  * is, it might get overwritten before being moved into the heap!
3714  */
3715 static void
3716 tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple)
3717 {
3718  SortTuple *memtuples;
3719  int j;
3720 
3721  memtuples = state->memtuples;
3722  Assert(state->memtupcount < state->memtupsize);
3723 
3725 
3726  /*
3727  * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is
3728  * using 1-based array indexes, not 0-based.
3729  */
3730  j = state->memtupcount++;
3731  while (j > 0)
3732  {
3733  int i = (j - 1) >> 1;
3734 
3735  if (COMPARETUP(state, tuple, &memtuples[i]) >= 0)
3736  break;
3737  memtuples[j] = memtuples[i];
3738  j = i;
3739  }
3740  memtuples[j] = *tuple;
3741 }
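/*
 * Worked example of the sift-up: inserting key 0 into the min-heap
 * [1, 3, 2] starts with the hole at slot 3.  Its parent, slot 1 (value 3),
 * is larger than 0, so 3 moves down into slot 3; the parent of slot 1 is
 * slot 0 (value 1), also larger, so 1 moves down into slot 1; the hole is
 * now the root and 0 is stored there, giving [0, 1, 2, 3].
 */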
3742 
3743 /*
3744  * Remove the tuple at state->memtuples[0] from the heap. Decrement
3745  * memtupcount, and sift up to maintain the heap invariant.
3746  *
3747  * The caller has already free'd the tuple the top node points to,
3748  * if necessary.
3749  */
3750 static void
3751 tuplesort_heap_delete_top(Tuplesortstate *state)
3752 {
3753  SortTuple *memtuples = state->memtuples;
3754  SortTuple *tuple;
3755 
3756  if (--state->memtupcount <= 0)
3757  return;
3758 
3759  /*
3760  * Remove the last tuple in the heap, and re-insert it, by replacing the
3761  * current top node with it.
3762  */
3763  tuple = &memtuples[state->memtupcount];
3764  tuplesort_heap_replace_top(state, tuple);
3765 }
3766 
3767 /*
3768  * Replace the tuple at state->memtuples[0] with a new tuple. Sift up to
3769  * maintain the heap invariant.
3770  *
3771  * This corresponds to Knuth's "sift-up" algorithm (Algorithm 5.2.3H,
3772  * Heapsort, steps H3-H8).
3773  */
3774 static void
3775 tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple)
3776 {
3777  SortTuple *memtuples = state->memtuples;
3778  unsigned int i,
3779  n;
3780 
3781  Assert(state->memtupcount >= 1);
3782 
3783  CHECK_FOR_INTERRUPTS();
3784 
3785  /*
3786  * state->memtupcount is "int", but we use "unsigned int" for i, j, n.
3787  * This prevents overflow in the "2 * i + 1" calculation, since at the top
3788  * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2.
3789  */
3790  n = state->memtupcount;
3791  i = 0; /* i is where the "hole" is */
3792  for (;;)
3793  {
3794  unsigned int j = 2 * i + 1;
3795 
3796  if (j >= n)
3797  break;
3798  if (j + 1 < n &&
3799  COMPARETUP(state, &memtuples[j], &memtuples[j + 1]) > 0)
3800  j++;
3801  if (COMPARETUP(state, tuple, &memtuples[j]) <= 0)
3802  break;
3803  memtuples[i] = memtuples[j];
3804  i = j;
3805  }
3806  memtuples[i] = *tuple;
3807 }
3808 
3809 /*
3810  * Function to reverse the sort direction from its current state
3811  *
3812  * It is not safe to call this when performing hash tuplesorts
3813  */
3814 static void
3815 reversedirection(Tuplesortstate *state)
3816 {
3817  SortSupport sortKey = state->sortKeys;
3818  int nkey;
3819 
3820  for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++)
3821  {
3822  sortKey->ssup_reverse = !sortKey->ssup_reverse;
3823  sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first;
3824  }
3825 }
3826 
3827 
3828 /*
3829  * Tape interface routines
3830  */
3831 
3832 static unsigned int
3833 getlen(LogicalTape *tape, bool eofOK)
3834 {
3835  unsigned int len;
3836 
3837  if (LogicalTapeRead(tape,
3838  &len, sizeof(len)) != sizeof(len))
3839  elog(ERROR, "unexpected end of tape");
3840  if (len == 0 && !eofOK)
3841  elog(ERROR, "unexpected end of data");
3842  return len;
3843 }
3844 
3845 static void
3846 markrunend(LogicalTape *tape)
3847 {
3848  unsigned int len = 0;
3849 
3850  LogicalTapeWrite(tape, (void *) &len, sizeof(len));
3851 }
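/*
 * Together, getlen() and markrunend() define the on-tape layout used by the
 * WRITETUP/READTUP routines below: each tuple is preceded by an unsigned
 * int length word (and followed by a copy of it when TUPLESORT_RANDOMACCESS
 * is set, so the tuple can also be re-read when scanning backwards), and a
 * run is terminated by a zero length word, which getlen() reports as EOF.
 */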
3852 
3853 /*
3854  * Get memory for tuple from within READTUP() routine.
3855  *
3856  * We use next free slot from the slab allocator, or palloc() if the tuple
3857  * is too large for that.
3858  */
3859 static void *
3860 readtup_alloc(Tuplesortstate *state, Size tuplen)
3861 {
3862  SlabSlot *buf;
3863 
3864  /*
3865  * We pre-allocate enough slots in the slab arena that we should never run
3866  * out.
3867  */
3868  Assert(state->slabFreeHead);
3869 
3870  if (tuplen > SLAB_SLOT_SIZE || !state->slabFreeHead)
3871  return MemoryContextAlloc(state->sortcontext, tuplen);
3872  else
3873  {
3874  buf = state->slabFreeHead;
3875  /* Reuse this slot */
3876  state->slabFreeHead = buf->nextfree;
3877 
3878  return buf;
3879  }
3880 }
3881 
3882 
3883 /*
3884  * Routines specialized for HeapTuple (actually MinimalTuple) case
3885  */
3886 
3887 static int
3888 comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
3889 {
3890  SortSupport sortKey = state->sortKeys;
3891  HeapTupleData ltup;
3892  HeapTupleData rtup;
3893  TupleDesc tupDesc;
3894  int nkey;
3895  int32 compare;
3896  AttrNumber attno;
3897  Datum datum1,
3898  datum2;
3899  bool isnull1,
3900  isnull2;
3901 
3902 
3903  /* Compare the leading sort key */
3904  compare = ApplySortComparator(a->datum1, a->isnull1,
3905  b->datum1, b->isnull1,
3906  sortKey);
3907  if (compare != 0)
3908  return compare;
3909 
3910  /* Compare additional sort keys */
3911  ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
3912  ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET);
3913  rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
3914  rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET);
3915  tupDesc = state->tupDesc;
3916 
3917  if (sortKey->abbrev_converter)
3918  {
3919  attno = sortKey->ssup_attno;
3920 
3921  datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
3922  datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
3923 
3924  compare = ApplySortAbbrevFullComparator(datum1, isnull1,
3925  datum2, isnull2,
3926  sortKey);
3927  if (compare != 0)
3928  return compare;
3929  }
3930 
3931  sortKey++;
3932  for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++)
3933  {
3934  attno = sortKey->ssup_attno;
3935 
3936  datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
3937  datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
3938 
3939  compare = ApplySortComparator(datum1, isnull1,
3940  datum2, isnull2,
3941  sortKey);
3942  if (compare != 0)
3943  return compare;
3944  }
3945 
3946  return 0;
3947 }
3948 
3949 static void
3950 copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup)
3951 {
3952  /*
3953  * We expect the passed "tup" to be a TupleTableSlot, and form a
3954  * MinimalTuple using the exported interface for that.
3955  */
3956  TupleTableSlot *slot = (TupleTableSlot *) tup;
3957  Datum original;
3958  MinimalTuple tuple;
3959  HeapTupleData htup;
3960  MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
3961 
3962  /* copy the tuple into sort storage */
3963  tuple = ExecCopySlotMinimalTuple(slot);
3964  stup->tuple = (void *) tuple;
3965  USEMEM(state, GetMemoryChunkSpace(tuple));
3966  /* set up first-column key value */
3967  htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
3968  htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
3969  original = heap_getattr(&htup,
3970  state->sortKeys[0].ssup_attno,
3971  state->tupDesc,
3972  &stup->isnull1);
3973 
3974  MemoryContextSwitchTo(oldcontext);
3975 
3976  if (!state->sortKeys->abbrev_converter || stup->isnull1)
3977  {
3978  /*
3979  * Store ordinary Datum representation, or NULL value. If there is a
3980  * converter it won't expect NULL values, and cost model is not
3981  * required to account for NULL, so in that case we avoid calling
3982  * converter and just set datum1 to zeroed representation (to be
3983  * consistent, and to support cheap inequality tests for NULL
3984  * abbreviated keys).
3985  */
3986  stup->datum1 = original;
3987  }
3988  else if (!consider_abort_common(state))
3989  {
3990  /* Store abbreviated key representation */
3991  stup->datum1 = state->sortKeys->abbrev_converter(original,
3992  state->sortKeys);
3993  }
3994  else
3995  {
3996  /* Abort abbreviation */
3997  int i;
3998 
3999  stup->datum1 = original;
4000 
4001  /*
4002  * Set state to be consistent with never trying abbreviation.
4003  *
4004  * Alter datum1 representation in already-copied tuples, so as to
4005  * ensure a consistent representation (current tuple was just
4006  * handled). It does not matter if some dumped tuples are already
4007  * sorted on tape, since serialized tuples lack abbreviated keys
4008  * (TSS_BUILDRUNS state prevents control reaching here in any case).
4009  */
4010  for (i = 0; i < state->memtupcount; i++)
4011  {
4012  SortTuple *mtup = &state->memtuples[i];
4013 
4014  htup.t_len = ((MinimalTuple) mtup->tuple)->t_len +
4015  MINIMAL_TUPLE_OFFSET;
4016  htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple -
4017  MINIMAL_TUPLE_OFFSET);
4018 
4019  mtup->datum1 = heap_getattr(&htup,
4020  state->sortKeys[0].ssup_attno,
4021  state->tupDesc,
4022  &mtup->isnull1);
4023  }
4024  }
4025 }
4026 
4027 static void
4028 writetup_heap(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
4029 {
4030  MinimalTuple tuple = (MinimalTuple) stup->tuple;
4031 
4032  /* the part of the MinimalTuple we'll write: */
4033  char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
4034  unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET;
4035 
4036  /* total on-disk footprint: */
4037  unsigned int tuplen = tupbodylen + sizeof(int);
4038 
4039  LogicalTapeWrite(tape, (void *) &tuplen, sizeof(tuplen));
4040  LogicalTapeWrite(tape, (void *) tupbody, tupbodylen);
4041  if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length
4042  * word? */
4043  LogicalTapeWrite(tape, (void *) &tuplen, sizeof(tuplen));
4044 
4045  if (!state->slabAllocatorUsed)
4046  {
4047  FREEMEM(state, GetMemoryChunkSpace(tuple));
4048  heap_free_minimal_tuple(tuple);
4049  }
4050 }
4051 
4052 static void
4053 readtup_heap(Tuplesortstate *state, SortTuple *stup,
4054  LogicalTape *tape, unsigned int len)
4055 {
4056  unsigned int tupbodylen = len - sizeof(int);
4057  unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET;
4058  MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tuplen);
4059  char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
4060  HeapTupleData htup;
4061 
4062  /* read in the tuple proper */
4063  tuple->t_len = tuplen;
4064  LogicalTapeReadExact(tape, tupbody, tupbodylen);
4065  if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length
4066  * word? */
4067  LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen));
4068  stup->tuple = (void *) tuple;
4069  /* set up first-column key value */
4070  htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
4071  htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
4072  stup->datum1 = heap_getattr(&htup,
4073  state->sortKeys[0].ssup_attno,
4074  state->tupDesc,
4075  &stup->isnull1);
4076 }
4077 
4078 /*
4079  * Routines specialized for the CLUSTER case (HeapTuple data, with
4080  * comparisons per a btree index definition)
4081  */
4082 
4083 static int
4084 comparetup_cluster(const SortTuple *a, const SortTuple *b,
4085  Tuplesortstate *state)
4086 {
4087  SortSupport sortKey = state->sortKeys;
4088  HeapTuple ltup;
4089  HeapTuple rtup;
4090  TupleDesc tupDesc;
4091  int nkey;
4092  int32 compare;
4093  Datum datum1,
4094  datum2;
4095  bool isnull1,
4096  isnull2;
4097 
4098  /* Be prepared to compare additional sort keys */
4099  ltup = (HeapTuple) a->tuple;
4100  rtup = (HeapTuple) b->tuple;
4101  tupDesc = state->tupDesc;
4102 
4103  /* Compare the leading sort key, if it's simple */
4104  if (state->haveDatum1)
4105  {
4106  compare = ApplySortComparator(a->datum1, a->isnull1,
4107  b->datum1, b->isnull1,
4108  sortKey);
4109  if (compare != 0)
4110  return compare;
4111 
4112  if (sortKey->abbrev_converter)
4113  {
4114  AttrNumber leading = state->indexInfo->ii_IndexAttrNumbers[0];
4115 
4116  datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1);
4117  datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2);
4118 
4119  compare = ApplySortAbbrevFullComparator(datum1, isnull1,
4120  datum2, isnull2,
4121  sortKey);
4122  }
4123  if (compare != 0 || state->nKeys == 1)
4124  return compare;
4125  /* Compare additional columns the hard way */
4126  sortKey++;
4127  nkey = 1;
4128  }
4129  else
4130  {
4131  /* Must compare all keys the hard way */
4132  nkey = 0;
4133  }
4134 
4135  if (state->indexInfo->ii_Expressions == NULL)
4136  {
4137  /* If not expression index, just compare the proper heap attrs */
4138 
4139  for (; nkey < state->nKeys; nkey++, sortKey++)
4140  {
4141  AttrNumber attno = state->indexInfo->ii_IndexAttrNumbers[nkey];
4142 
4143  datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1);
4144  datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2);
4145 
4146  compare = ApplySortComparator(datum1, isnull1,
4147  datum2, isnull2,
4148  sortKey);
4149  if (compare != 0)
4150  return compare;
4151  }
4152  }
4153  else
4154  {
4155  /*
4156  * In the expression index case, compute the whole index tuple and
4157  * then compare values. It would perhaps be faster to compute only as
4158  * many columns as we need to compare, but that would require
4159  * duplicating all the logic in FormIndexDatum.
4160  */
4161  Datum l_index_values[INDEX_MAX_KEYS];
4162  bool l_index_isnull[INDEX_MAX_KEYS];
4163  Datum r_index_values[INDEX_MAX_KEYS];
4164  bool r_index_isnull[INDEX_MAX_KEYS];
4165  TupleTableSlot *ecxt_scantuple;
4166 
4167  /* Reset context each time to prevent memory leakage */
4168  ResetPerTupleExprContext(state->estate);
4169 
4170  ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple;
4171 
4172  ExecStoreHeapTuple(ltup, ecxt_scantuple, false);
4173  FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
4174  l_index_values, l_index_isnull);
4175 
4176  ExecStoreHeapTuple(rtup, ecxt_scantuple, false);
4177  FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
4178  r_index_values, r_index_isnull);
4179 
4180  for (; nkey < state->nKeys; nkey++, sortKey++)
4181  {
4182  compare = ApplySortComparator(l_index_values[nkey],
4183  l_index_isnull[nkey],
4184  r_index_values[nkey],
4185  r_index_isnull[nkey],
4186  sortKey);
4187  if (compare != 0)
4188  return compare;
4189  }
4190  }
4191 
4192  return 0;
4193 }
4194 
4195 static void
4196 copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup)
4197 {
4198  HeapTuple tuple = (HeapTuple) tup;
4199  Datum original;
4200  MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
4201 
4202  /* copy the tuple into sort storage */
4203  tuple = heap_copytuple(tuple);
4204  stup->tuple = (void *) tuple;
4205  USEMEM(state, GetMemoryChunkSpace(tuple));
4206 
4207  MemoryContextSwitchTo(oldcontext);
4208 
4209  /*
4210  * set up first-column key value, and potentially abbreviate, if it's a
4211  * simple column
4212  */
4213  if (!state->haveDatum1)
4214  return;
4215 
4216  original = heap_getattr(tuple,
4217  state->indexInfo->ii_IndexAttrNumbers[0],
4218  state->tupDesc,
4219  &stup->isnull1);
4220 
4221  if (!state->sortKeys->abbrev_converter || stup->isnull1)
4222  {
4223  /*
4224  * Store ordinary Datum representation, or NULL value. If there is a
4225  * converter it won't expect NULL values, and cost model is not
4226  * required to account for NULL, so in that case we avoid calling
4227  * converter and just set datum1 to zeroed representation (to be
4228  * consistent, and to support cheap inequality tests for NULL
4229  * abbreviated keys).
4230  */
4231  stup->datum1 = original;
4232  }
4233  else if (!consider_abort_common(state))
4234  {
4235  /* Store abbreviated key representation */
4236  stup->datum1 = state->sortKeys->abbrev_converter(original,
4237  state->sortKeys);
4238  }
4239  else
4240  {
4241  /* Abort abbreviation */
4242  int i;
4243 
4244  stup->datum1 = original;
4245 
4246  /*
4247  * Set state to be consistent with never trying abbreviation.
4248  *
4249  * Alter datum1 representation in already-copied tuples, so as to
4250  * ensure a consistent representation (current tuple was just
4251  * handled). It does not matter if some dumped tuples are already
4252  * sorted on tape, since serialized tuples lack abbreviated keys
4253  * (TSS_BUILDRUNS state prevents control reaching here in any case).
4254  */
4255  for (i = 0; i < state->memtupcount; i++)
4256  {
4257  SortTuple *mtup = &state->memtuples[i];
4258 
4259  tuple = (HeapTuple) mtup->tuple;
4260  mtup->datum1 = heap_getattr(tuple,
4261  state->indexInfo->ii_IndexAttrNumbers[0],
4262  state->tupDesc,
4263  &mtup->isnull1);
4264  }
4265  }
4266 }
4267 
4268 static void
4269 writetup_cluster(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
4270 {
4271  HeapTuple tuple = (HeapTuple) stup->tuple;
4272  unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int);
4273 
4274  /* We need to store t_self, but not other fields of HeapTupleData */
4275  LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
4276  LogicalTapeWrite(tape, &tuple->t_self, sizeof(ItemPointerData));
4277  LogicalTapeWrite(tape, tuple->t_data, tuple->t_len);
4278  if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length
4279  * word? */
4280  LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
4281 
4282  if (!state->slabAllocatorUsed)
4283  {
4284  FREEMEM(state, GetMemoryChunkSpace(tuple));
4285  heap_freetuple(tuple);
4286  }
4287 }
4288 
4289 static void
4290 readtup_cluster(Tuplesortstate *state, SortTuple *stup,
4291  LogicalTape *tape, unsigned int tuplen)
4292 {
4293  unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int);
4294  HeapTuple tuple = (HeapTuple) readtup_alloc(state,
4295  t_len + HEAPTUPLESIZE);
4296 
4297  /* Reconstruct the HeapTupleData header */
4298  tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
4299  tuple->t_len = t_len;
4300  LogicalTapeReadExact(tape, &tuple->t_self, sizeof(ItemPointerData));
4301  /* We don't currently bother to reconstruct t_tableOid */
4302  tuple->t_tableOid = InvalidOid;
4303  /* Read in the tuple body */
4304  LogicalTapeReadExact(tape, tuple->t_data, tuple->t_len);
4305  if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length
4306  * word? */
4307  LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen));
4308  stup->tuple = (void *) tuple;
4309  /* set up first-column key value, if it's a simple column */
4310  if (state->haveDatum1)
4311  stup->datum1 = heap_getattr(tuple,
4312  state->indexInfo->ii_IndexAttrNumbers[0],
4313  state->tupDesc,
4314  &stup->isnull1);
4315 }
4316 
4317 /*
4318  * Routines specialized for IndexTuple case
4319  *
4320  * The btree and hash cases require separate comparison functions, but the
4321  * IndexTuple representation is the same so the copy/write/read support
4322  * functions can be shared.
4323  */
4324 
4325 static int
4326 comparetup_index_btree(const SortTuple *a, const SortTuple *b,
4327  Tuplesortstate *state)
4328 {
4329  /*
4330  * This is similar to comparetup_heap(), but expects index tuples. There
4331  * is also special handling for enforcing uniqueness, and special
4332  * treatment for equal keys at the end.
4333  */
4334  SortSupport sortKey = state->sortKeys;
4335  IndexTuple tuple1;
4336  IndexTuple tuple2;
4337  int keysz;
4338  TupleDesc tupDes;
4339  bool equal_hasnull = false;
4340  int nkey;
4341  int32 compare;
4342  Datum datum1,
4343  datum2;
4344  bool isnull1,
4345  isnull2;
4346 
4347 
4348  /* Compare the leading sort key */
4349  compare = ApplySortComparator(a->datum1, a->isnull1,
4350  b->datum1, b->isnull1,
4351  sortKey);
4352  if (compare != 0)
4353  return compare;
4354 
4355  /* Compare additional sort keys */
4356  tuple1 = (IndexTuple) a->tuple;
4357  tuple2 = (IndexTuple) b->tuple;
4358  keysz = state->nKeys;
4359  tupDes = RelationGetDescr(state->indexRel);
4360 
4361  if (sortKey->abbrev_converter)
4362  {
4363  datum1 = index_getattr(tuple1, 1, tupDes, &isnull1);
4364  datum2 = index_getattr(tuple2, 1, tupDes, &isnull2);
4365 
4366  compare = ApplySortAbbrevFullComparator(datum1, isnull1,
4367  datum2, isnull2,
4368  sortKey);
4369  if (compare != 0)
4370  return compare;
4371  }
4372 
4373  /* they are equal, so we only need to examine one null flag */
4374  if (a->isnull1)
4375  equal_hasnull = true;
4376 
4377  sortKey++;
4378  for (nkey = 2; nkey <= keysz; nkey++, sortKey++)
4379  {
4380  datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1);
4381  datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2);
4382 
4383  compare = ApplySortComparator(datum1, isnull1,
4384  datum2, isnull2,
4385  sortKey);
4386  if (compare != 0)
4387  return compare; /* done when we find unequal attributes */
4388 
4389  /* they are equal, so we only need to examine one null flag */
4390  if (isnull1)
4391  equal_hasnull = true;
4392  }
4393 
4394  /*
4395  * If btree has asked us to enforce uniqueness, complain if two equal
4396  * tuples are detected (unless there was at least one NULL field and NULLS
4397  * NOT DISTINCT was not set).
4398  *
4399  * It is sufficient to make the test here, because if two tuples are equal
4400  * they *must* get compared at some stage of the sort --- otherwise the
4401  * sort algorithm wouldn't have checked whether one must appear before the
4402  * other.
4403  */
4404  if (state->enforceUnique && !(!state->uniqueNullsNotDistinct && equal_hasnull))
4405  {
4406  Datum values[INDEX_MAX_KEYS];
4407  bool isnull[INDEX_MAX_KEYS];
4408  char *key_desc;
4409 
4410  /*
4411  * Some rather brain-dead implementations of qsort (such as the one in
4412  * QNX 4) will sometimes call the comparison routine to compare a
4413  * value to itself, but we always use our own implementation, which
4414  * does not.
4415  */
4416  Assert(tuple1 != tuple2);
4417 
4418  index_deform_tuple(tuple1, tupDes, values, isnull);
4419 
4420  key_desc = BuildIndexValueDescription(state->indexRel, values, isnull);
4421 
4422  ereport(ERROR,
4423  (errcode(ERRCODE_UNIQUE_VIOLATION),
4424  errmsg("could not create unique index \"%s\"",
4425  RelationGetRelationName(state->indexRel)),
4426  key_desc ? errdetail("Key %s is duplicated.", key_desc) :
4427  errdetail("Duplicate keys exist."),
4428  errtableconstraint(state->heapRel,
4429  RelationGetRelationName(state->indexRel))));
4430  }
4431 
4432  /*
4433  * If key values are equal, we sort on ItemPointer. This is required for
4434  * btree indexes, since heap TID is treated as an implicit last key
4435  * attribute in order to ensure that all keys in the index are physically
4436  * unique.
4437  */
4438  {
4439  BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
4440  BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
4441 
4442  if (blk1 != blk2)
4443  return (blk1 < blk2) ? -1 : 1;
4444  }
4445  {
4446  OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
4447  OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);
4448 
4449  if (pos1 != pos2)
4450  return (pos1 < pos2) ? -1 : 1;
4451  }
4452 
4453  /* ItemPointer values should never be equal */
4454  Assert(false);
4455 
4456  return 0;
4457 }
4458 
4459 static int
4460 comparetup_index_hash(const SortTuple *a, const SortTuple *b,
4461  Tuplesortstate *state)
4462 {
4463  Bucket bucket1;
4464  Bucket bucket2;
4465  IndexTuple tuple1;
4466  IndexTuple tuple2;
4467 
4468  /*
4469  * Fetch hash keys and mask off bits we don't want to sort by. We know
4470  * that the first column of the index tuple is the hash key.
4471  */
4472  Assert(!a->isnull1);
4473  bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1),
4474  state->max_buckets, state->high_mask,
4475  state->low_mask);
4476  Assert(!b->isnull1);
4477  bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1),
4478  state->max_buckets, state->high_mask,
4479  state->low_mask);
4480  if (bucket1 > bucket2)
4481  return 1;
4482  else if (bucket1 < bucket2)
4483  return -1;
4484 
4485  /*
4486  * If hash values are equal, we sort on ItemPointer. This does not affect
4487  * validity of the finished index, but it may be useful to have index
4488  * scans in physical order.
4489  */
4490  tuple1 = (IndexTuple) a->tuple;
4491  tuple2 = (IndexTuple) b->tuple;
4492 
4493  {
4494  BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
4495  BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
4496 
4497  if (blk1 != blk2)
4498  return (blk1 < blk2) ? -1 : 1;
4499  }
4500  {
4501  OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
4502  OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);
4503 
4504  if (pos1 != pos2)
4505  return (pos1 < pos2) ? -1 : 1;
4506  }
4507 
4508  /* ItemPointer values should never be equal */
4509  Assert(false);
4510 
4511  return 0;
4512 }
4513 
4514 static void
4515 copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup)
4516 {
4517  /* Not currently needed */
4518  elog(ERROR, "copytup_index() should not be called");
4519 }
4520 
4521 static void
4522 writetup_index(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
4523 {
4524  IndexTuple tuple = (IndexTuple) stup->tuple;
4525  unsigned int tuplen;
4526 
4527  tuplen = IndexTupleSize(tuple) + sizeof(tuplen);
4528  LogicalTapeWrite(tape, (void *) &tuplen, sizeof(tuplen));
4529  LogicalTapeWrite(tape, (void *) tuple, IndexTupleSize(tuple));
4530  if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length
4531  * word? */
4532  LogicalTapeWrite(tape, (void *) &tuplen, sizeof(tuplen));
4533 
4534  if (!state->slabAllocatorUsed)
4535  {
4536  FREEMEM(state, GetMemoryChunkSpace(tuple));
4537  pfree(tuple);
4538  }
4539 }
4540 
4541 static void
4542 readtup_index(Tuplesortstate *state, SortTuple *stup,
4543  LogicalTape *tape, unsigned int len)
4544 {
4545  unsigned int tuplen = len - sizeof(unsigned int);
4546  IndexTuple tuple = (IndexTuple) readtup_alloc(state, tuplen);
4547 
4548  LogicalTapeReadExact(tape, tuple, tuplen);
4549  if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length
4550  * word? */
4551  LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen));
4552  stup->tuple = (void *) tuple;
4553  /* set up first-column key value */
4554  stup->datum1 = index_getattr(tuple,
4555  1,
4556  RelationGetDescr(state->indexRel),
4557  &stup->isnull1);
4558 }
4559 
4560 /*
4561  * Routines specialized for DatumTuple case
4562  */
4563 
4564 static int
4565 comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
4566 {
4567  int compare;
4568 
4569  compare = ApplySortComparator(a->datum1, a->isnull1,
4570  b->datum1, b->isnull1,
4571  state->sortKeys);
4572  if (compare != 0)
4573  return compare;
4574 
4575  /* if we have abbreviations, then "tuple" has the original value */
4576 
4577  if (state->sortKeys->abbrev_converter)
4578  compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1,
4579  PointerGetDatum(b->tuple), b->isnull1,
4580  state->sortKeys);
4581 
4582  return compare;
4583 }
4584 
4585 static void
4586 copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup)
4587 {
4588  /* Not currently needed */
4589  elog(ERROR, "copytup_datum() should not be called");
4590 }
4591 
4592 static void
4593 writetup_datum(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
4594 {
4595  void *waddr;
4596  unsigned int tuplen;
4597  unsigned int writtenlen;
4598 
4599  if (stup->isnull1)
4600  {
4601  waddr = NULL;
4602  tuplen = 0;
4603  }
4604  else if (!state->tuples)
4605  {
4606  waddr = &stup->datum1;
4607  tuplen = sizeof(Datum);
4608  }
4609  else
4610  {
4611  waddr = stup->tuple;
4612  tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, state->datumTypeLen);
4613  Assert(tuplen != 0);
4614  }
4615 
4616  writtenlen = tuplen + sizeof(unsigned int);
4617 
4618  LogicalTapeWrite(tape, (void *) &writtenlen, sizeof(writtenlen));
4619  LogicalTapeWrite(tape, waddr, tuplen);
4620  if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length
4621  * word? */
4622  LogicalTapeWrite(tape, (void *) &writtenlen, sizeof(writtenlen));
4623 
4624  if (!state->slabAllocatorUsed && stup->tuple)
4625  {
4626  FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
4627  pfree(stup->tuple);
4628  }
4629 }
4630 
4631 static void
4632 readtup_datum(Tuplesortstate *state, SortTuple *stup,
4633  LogicalTape *tape, unsigned int len)
4634 {
4635  unsigned int tuplen = len - sizeof(unsigned int);
4636 
4637  if (tuplen == 0)
4638  {
4639  /* it's NULL */
4640  stup->datum1 = (Datum) 0;
4641  stup->isnull1 = true;
4642  stup->tuple = NULL;
4643  }
4644  else if (!state->tuples)
4645  {
4646  Assert(tuplen == sizeof(Datum));
4647  LogicalTapeReadExact(tape, &stup->datum1, tuplen);
4648  stup->isnull1 = false;
4649  stup->tuple = NULL;
4650  }
4651  else
4652  {
4653  void *raddr = readtup_alloc(state, tuplen);
4654 
4655  LogicalTapeReadExact(tape, raddr, tuplen);
4656  stup->datum1 = PointerGetDatum(raddr);
4657  stup->isnull1 = false;
4658  stup->tuple = raddr;
4659  }
4660 
4661  if (state->sortopt & TUPLESORT_RANDOMACCESS) /* need trailing length
4662  * word? */
4663  LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen));
4664 }
4665 
4666 /*
4667  * Parallel sort routines
4668  */
4669 
4670 /*
4671  * tuplesort_estimate_shared - estimate required shared memory allocation
4672  *
4673  * nWorkers is an estimate of the number of workers (it's the number that
4674  * will be requested).
4675  */
4676 Size
4677 tuplesort_estimate_shared(int nWorkers)
4678 {
4679  Size tapesSize;
4680 
4681  Assert(nWorkers > 0);
4682 
4683  /* Make sure that BufFile shared state is MAXALIGN'd */
4684  tapesSize = mul_size(sizeof(TapeShare), nWorkers);
4685  tapesSize = MAXALIGN(add_size(tapesSize, offsetof(Sharedsort, tapes)));
4686 
4687  return tapesSize;
4688 }
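/*
 * Example of the size computed above: with nWorkers = 4 the request is
 * MAXALIGN(offsetof(Sharedsort, tapes) + 4 * sizeof(TapeShare)), i.e. the
 * fixed Sharedsort header plus one TapeShare slot per planned worker,
 * rounded up to maximum alignment.
 */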
4689 
4690 /*
4691  * tuplesort_initialize_shared - initialize shared tuplesort state
4692  *
4693  * Must be called from leader process before workers are launched, to
4694  * establish state needed up-front for worker tuplesortstates. nWorkers
4695  * should match the argument passed to tuplesort_estimate_shared().
4696  */
4697 void
4698 tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg)
4699 {
4700  int i;
4701 
4702  Assert(nWorkers > 0);
4703 
4704  SpinLockInit(&shared->mutex);
4705  shared->currentWorker = 0;
4706  shared->workersFinished = 0;
4707  SharedFileSetInit(&shared->fileset, seg);
4708  shared->nTapes = nWorkers;
4709  for (i = 0; i < nWorkers; i++)
4710  {
4711  shared->tapes[i].firstblocknumber = 0L;
4712  }
4713 }
4714 
4715 /*
4716  * tuplesort_attach_shared - attach to shared tuplesort state
4717  *
4718  * Must be called by all worker processes.
4719  */
4720 void
4721 tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg)
4722 {
4723  /* Attach to SharedFileSet */
4724  SharedFileSetAttach(&shared->fileset, seg);
4725 }
4726 
4727 /*
4728  * worker_get_identifier - Assign and return ordinal identifier for worker
4729  *
4730  * The order in which these are assigned is not well defined, and should not
4731  * matter; worker numbers across parallel sort participants need only be
4732  * distinct and gapless. logtape.c requires this.
4733  *
4734  * Note that the identifiers assigned from here have no relation to
4735  * ParallelWorkerNumber number, to avoid making any assumption about
4736  * caller's requirements. However, we do follow the ParallelWorkerNumber
4737  * convention of representing a non-worker with worker number -1. This
4738  * includes the leader, as well as serial Tuplesort processes.
4739  */
4740 static int
4741 worker_get_identifier(Tuplesortstate *state)
4742 {
4743  Sharedsort *shared = state->shared;
4744  int worker;
4745 
4746  Assert(WORKER(state));
4747 
4748  SpinLockAcquire(&shared->mutex);
4749  worker = shared->currentWorker++;
4750  SpinLockRelease(&shared->mutex);
4751 
4752  return worker;
4753 }
4754 
4755 /*
4756  * worker_freeze_result_tape - freeze worker's result tape for leader
4757  *
4758  * This is called by workers just after the result tape has been determined,
4759  * instead of calling LogicalTapeFreeze() directly. They do so because
4760  * workers require a few additional steps over similar serial
4761  * TSS_SORTEDONTAPE external sort cases, which also happen here. The extra
4762  * steps are around freeing now unneeded resources, and representing to
4763  * leader that worker's input run is available for its merge.
4764  *
4765  * There should only be one final output run for each worker, which consists
4766  * of all tuples that were originally input into worker.
4767  */
4768 static void
4769 worker_freeze_result_tape(Tuplesortstate *state)
4770 {
4771  Sharedsort *shared = state->shared;
4772  TapeShare output;
4773 
4774  Assert(WORKER(state));
4775  Assert(state->result_tape != NULL);
4776  Assert(state->memtupcount == 0);
4777 
4778  /*
4779  * Free most remaining memory, in case caller is sensitive to our holding
4780  * on to it. memtuples may not be a tiny merge heap at this point.
4781  */
4782  pfree(state->memtuples);
4783  /* Be tidy */
4784  state->memtuples = NULL;
4785  state->memtupsize = 0;
4786 
4787  /*
4788  * Parallel worker requires result tape metadata, which is to be stored in
4789  * shared memory for leader
4790  */
4791  LogicalTapeFreeze(state->result_tape, &output);
4792 
4793  /* Store properties of output tape, and update finished worker count */
4794  SpinLockAcquire(&shared->mutex);
4795  shared->tapes[state->worker] = output;
4796  shared->workersFinished++;
4797  SpinLockRelease(&shared->mutex);
4798 }
4799 
4800 /*
4801  * worker_nomergeruns - dump memtuples in worker, without merging
4802  *
4803  * This called as an alternative to mergeruns() with a worker when no
4804  * merging is required.
4805  */
4806 static void
4807 worker_nomergeruns(Tuplesortstate *state)
4808 {
4809  Assert(WORKER(state));
4810  Assert(state->result_tape == NULL);
4811  Assert(state->nOutputRuns == 1);
4812 
4813  state->result_tape = state->destTape;
4814  worker_freeze_result_tape(state);
4815 }
4816 
4817 /*
4818  * leader_takeover_tapes - create tapeset for leader from worker tapes
4819  *
4820  * So far, leader Tuplesortstate has performed no actual sorting. By now, all
4821  * sorting has occurred in workers, all of which must have already returned
4822  * from tuplesort_performsort().
4823  *
4824  * When this returns, leader process is left in a state that is virtually
4825  * indistinguishable from it having generated runs as a serial external sort
4826  * might have.
4827  */
4828 static void
4829 leader_takeover_tapes(Tuplesortstate *state)
4830 {
4831  Sharedsort *shared = state->shared;
4832  int nParticipants = state->nParticipants;
4833  int workersFinished;
4834  int j;
4835 
4836  Assert(LEADER(state));
4837  Assert(nParticipants >= 1);
4838 
4839  SpinLockAcquire(&shared->mutex);
4840  workersFinished = shared->workersFinished;
4841  SpinLockRelease(&shared->mutex);
4842 
4843  if (nParticipants != workersFinished)
4844  elog(ERROR, "cannot take over tapes before all workers finish");
4845 
4846  /*
4847  * Create the tapeset from worker tapes, including a leader-owned tape at
4848  * the end. Parallel workers are far more expensive than logical tapes,
4849  * so the number of tapes allocated here should never be excessive.
4850  */
4851  inittapestate(state, nParticipants);
4852  state->tapeset = LogicalTapeSetCreate(false, &shared->fileset, -1);
4853 
4854  /*
4855  * Set currentRun to reflect the number of runs we will merge (it's not
4856  * used for anything, this is just pro forma)
4857  */
4858  state->currentRun = nParticipants;
4859 
4860  /*
4861  * Initialize the state to look the same as after building the initial
4862  * runs.
4863  *
4864  * There will always be exactly 1 run per worker, and exactly one input
4865  * tape per run, because workers always output exactly 1 run, even when
4866  * there were no input tuples for workers to sort.
4867  */
4868  state->inputTapes = NULL;
4869  state->nInputTapes = 0;
4870  state->nInputRuns = 0;
4871 
4872  state->outputTapes = palloc0(nParticipants * sizeof(LogicalTape *));
4873  state->nOutputTapes = nParticipants;
4874  state->nOutputRuns = nParticipants;
4875 
4876  for (j = 0; j < nParticipants; j++)
4877  {
4878  state->outputTapes[j] = LogicalTapeImport(state->tapeset, j, &shared->tapes[j]);
4879  }
4880 
4881  state->status = TSS_BUILDRUNS;
4882 }
4883 
4884 /*
4885  * Convenience routine to free a tuple previously loaded into sort memory
4886  */
4887 static void
4888 free_sort_tuple(Tuplesortstate *state, SortTuple *stup)
4889 {
4890  if (stup->tuple)
4891  {
4892  FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
4893  pfree(stup->tuple);
4894  stup->tuple = NULL;
4895  }
4896 }
4897 
4898 int
4899 ssup_datum_unsigned_cmp(Datum x, Datum y, SortSupport ssup)
4900 {
4901  if (x < y)
4902  return -1;
4903  else if (x > y)
4904  return 1;
4905  else
4906  return 0;
4907 }
4908 
4909 #if SIZEOF_DATUM >= 8
4910 int
4911 ssup_datum_signed_cmp(Datum x, Datum y, SortSupport ssup)
4912 {
4913  int64 xx = DatumGetInt64(x);
4914  int64 yy = DatumGetInt64(y);
4915 
4916  if (xx < yy)
4917  return -1;
4918  else if (xx > yy)
4919  return 1;
4920  else
4921  return 0;
4922 }
4923 #endif
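/*
 * These comparators (matched against sortKeys[0].comparator in
 * tuplesort_sort_memtuples() above) return -1/0/+1 explicitly rather than
 * computing "xx - yy": for the signed 32- and 64-bit cases the subtraction
 * could overflow, and the difference need not fit in the int return type.
 */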
4924 
4925 int
4926 ssup_datum_int32_cmp(Datum x, Datum y, SortSupport ssup)
4927 {
4928  int32 xx = DatumGetInt32(x);
4929  int32 yy = DatumGetInt32(y);
4930 
4931  if (xx < yy)
4932  return -1;
4933  else if (xx > yy)
4934  return 1;
4935  else
4936  return 0;
4937 }
Definition: tuplesort.c:1209
static void sort_bounded_heap(Tuplesortstate *state)
Definition: tuplesort.c:3613
TupSortStatus
Definition: tuplesort.c:213
@ TSS_SORTEDONTAPE
Definition: tuplesort.c:218
@ TSS_SORTEDINMEM
Definition: tuplesort.c:217
@ TSS_INITIAL
Definition: tuplesort.c:214
@ TSS_FINALMERGE
Definition: tuplesort.c:219
@ TSS_BUILDRUNS
Definition: tuplesort.c:216
@ TSS_BOUNDED
Definition: tuplesort.c:215
static int worker_get_identifier(Tuplesortstate *state)
Definition: tuplesort.c:4741
static void mergeonerun(Tuplesortstate *state)
Definition: tuplesort.c:3183
#define FREEMEM(state, amt)
Definition: tuplesort.c:557
const char * tuplesort_space_type_name(TuplesortSpaceType t)
Definition: tuplesort.c:3543
#define PARALLEL_SORT(state)
Definition: tuplesort.c:129
#define MAXORDER
Definition: tuplesort.c:235
static void readtup_datum(Tuplesortstate *state, SortTuple *stup, LogicalTape *tape, unsigned int len)
Definition: tuplesort.c:4632
static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup)
Definition: tuplesort.c:4515
#define INDEX_SORT
Definition: tuplesort.c:124
static void inittapestate(Tuplesortstate *state, int maxTapes)
Definition: tuplesort.c:2893
static void readtup_heap(Tuplesortstate *state, SortTuple *stup, LogicalTape *tape, unsigned int len)
Definition: tuplesort.c:4053
#define SLAB_SLOT_SIZE
Definition: tuplesort.c:200
static void writetup_datum(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
Definition: tuplesort.c:4593
static void leader_takeover_tapes(Tuplesortstate *state)
Definition: tuplesort.c:4829
Tuplesortstate * tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, bool nullsFirstFlag, int workMem, SortCoordinate coordinate, int sortopt)
Definition: tuplesort.c:1396
#define CLUSTER_SORT
Definition: tuplesort.c:126
Size tuplesort_estimate_shared(int nWorkers)
Definition: tuplesort.c:4677
static Tuplesortstate * tuplesort_begin_common(int workMem, SortCoordinate coordinate, int sortopt)
Definition: tuplesort.c:845
void tuplesort_get_stats(Tuplesortstate *state, TuplesortInstrumentation *stats)
Definition: tuplesort.c:3476
static int comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
Definition: tuplesort.c:3888
bool tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy, TupleTableSlot *slot, Datum *abbrev)
Definition: tuplesort.c:2561
static void tuplesort_sort_memtuples(Tuplesortstate *state)
Definition: tuplesort.c:3653
void tuplesort_end(Tuplesortstate *state)
Definition: tuplesort.c:1620
static void inittapes(Tuplesortstate *state, bool mergeruns)
Definition: tuplesort.c:2842
#define DATUM_SORT
Definition: tuplesort.c:125
void tuplesort_markpos(Tuplesortstate *state)
Definition: tuplesort.c:3412
#define MERGE_BUFFER_SIZE
Definition: tuplesort.c:237
static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
Definition: tuplesort.c:4460
#define READTUP(state, stup, tape, len)
Definition: tuplesort.c:554
int ssup_datum_int32_cmp(Datum x, Datum y, SortSupport ssup)
Definition: tuplesort.c:4926
#define LEADER(state)
Definition: tuplesort.c:560
#define WORKER(state)
Definition: tuplesort.c:559
static int64 merge_read_buffer_size(int64 avail_mem, int nInputTapes, int nInputRuns, int maxOutputTapes)
Definition: tuplesort.c:2810
static bool mergereadnext(Tuplesortstate *state, LogicalTape *srcTape, SortTuple *stup)
Definition: tuplesort.c:3269
union SlabSlot SlabSlot
static void tuplesort_updatemax(Tuplesortstate *state)
Definition: tuplesort.c:1637
static void worker_freeze_result_tape(Tuplesortstate *state)
Definition: tuplesort.c:4769
bool trace_sort
Definition: tuplesort.c:144
#define RELEASE_SLAB_SLOT(state, tuple)
Definition: tuplesort.c:539
Tuplesortstate * tuplesort_begin_cluster(TupleDesc tupDesc, Relation indexRel, int workMem, SortCoordinate coordinate, int sortopt)
Definition: tuplesort.c:1105
void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg)
Definition: tuplesort.c:4721
bool tuplesort_getdatum(Tuplesortstate *state, bool forward, Datum *val, bool *isNull, Datum *abbrev)
Definition: tuplesort.c:2647
#define LogicalTapeReadExact(tape, ptr, len)
Definition: tuplesort.c:612
static void worker_nomergeruns(Tuplesortstate *state)
Definition: tuplesort.c:4807
static void writetup_heap(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
Definition: tuplesort.c:4028
static pg_attribute_always_inline int qsort_tuple_unsigned_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state)
Definition: tuplesort.c:698
static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple)
Definition: tuplesort.c:3775
static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup)
Definition: tuplesort.c:4586
static void writetup_index(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
Definition: tuplesort.c:4522
void tuplesort_restorepos(Tuplesortstate *state)
Definition: tuplesort.c:3443
static pg_attribute_always_inline int qsort_tuple_int32_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state)
Definition: tuplesort.c:745
static void mergeruns(Tuplesortstate *state)
Definition: tuplesort.c:2996
static void readtup_cluster(Tuplesortstate *state, SortTuple *stup, LogicalTape *tape, unsigned int len)
Definition: tuplesort.c:4290
int(* SortTupleComparator)(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
Definition: tuplesort.c:239
static void * readtup_alloc(Tuplesortstate *state, Size tuplen)
Definition: tuplesort.c:3860
#define MINORDER
Definition: tuplesort.c:234
void tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
Definition: tuplesort.c:1862
static int comparetup_cluster(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
Definition: tuplesort.c:4084
static void tuplesort_begin_batch(Tuplesortstate *state)
Definition: tuplesort.c:956
void tuplesort_set_bound(Tuplesortstate *state, int64 bound)
Definition: tuplesort.c:1484
static void writetup_cluster(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
Definition: tuplesort.c:4269
void tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, ItemPointer self, Datum *values, bool *isnull)
Definition: tuplesort.c:1883
static void init_slab_allocator(Tuplesortstate *state, int numSlots)
Definition: tuplesort.c:2960
#define COPYTUP(state, stup, tup)
Definition: tuplesort.c:552
const char * tuplesort_method_name(TuplesortMethod m)
Definition: tuplesort.c:3520
static bool consider_abort_common(Tuplesortstate *state)
Definition: tuplesort.c:2152
static void tuplesort_free(Tuplesortstate *state)
Definition: tuplesort.c:1543
#define HEAP_SORT
Definition: tuplesort.c:123
static void dumptuples(Tuplesortstate *state, bool alltuples)
Definition: tuplesort.c:3288
Tuplesortstate * tuplesort_begin_heap(TupleDesc tupDesc, int nkeys, AttrNumber *attNums, Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags, int workMem, SortCoordinate coordinate, int sortopt)
Definition: tuplesort.c:1030
#define TUPLESORT_RANDOMACCESS
Definition: tuplesort.h:93
#define TUPLESORT_ALLOWBOUNDED
Definition: tuplesort.h:96
TuplesortSpaceType
Definition: tuplesort.h:84
@ SORT_SPACE_TYPE_DISK
Definition: tuplesort.h:85
@ SORT_SPACE_TYPE_MEMORY
Definition: tuplesort.h:86
TuplesortMethod
Definition: tuplesort.h:73
@ SORT_TYPE_EXTERNAL_SORT
Definition: tuplesort.h:77
@ SORT_TYPE_TOP_N_HEAPSORT
Definition: tuplesort.h:75
@ SORT_TYPE_QUICKSORT
Definition: tuplesort.h:76
@ SORT_TYPE_STILL_IN_PROGRESS
Definition: tuplesort.h:74
@ SORT_TYPE_EXTERNAL_MERGE
Definition: tuplesort.h:78
static MinimalTuple ExecCopySlotMinimalTuple(TupleTableSlot *slot)
Definition: tuptable.h:463
static TupleTableSlot * ExecClearTuple(TupleTableSlot *slot)
Definition: tuptable.h:425
char buffer[SLAB_SLOT_SIZE]
Definition: tuplesort.c:205
union SlabSlot * nextfree
Definition: tuplesort.c:204