1 /*-------------------------------------------------------------------------
2  *
3  * tuplesort.c
4  * Generalized tuple sorting routines.
5  *
6  * This module handles sorting of heap tuples, index tuples, or single
7  * Datums (and could easily support other kinds of sortable objects,
8  * if necessary). It works efficiently for both small and large amounts
9  * of data. Small amounts are sorted in-memory using qsort(). Large
10  * amounts are sorted using temporary files and a standard external sort
11  * algorithm.
12  *
13  * See Knuth, volume 3, for more than you want to know about external
14  * sorting algorithms. The algorithm we use is a balanced k-way merge.
15  * Before PostgreSQL 15, we used the polyphase merge algorithm (Knuth's
16  * Algorithm 5.4.2D), but with modern hardware, a straightforward balanced
17  * merge is better. Knuth is assuming that tape drives are expensive
18  * beasts, and in particular that there will always be many more runs than
19  * tape drives. The polyphase merge algorithm was good at keeping all the
20  * tape drives busy, but in our implementation a "tape drive" doesn't cost
21  * much more than a few Kb of memory buffers, so we can afford to have
22  * lots of them. In particular, if we can have as many tape drives as
23  * sorted runs, we can eliminate any repeated I/O at all.
24  *
25  * Historically, we divided the input into sorted runs using replacement
26  * selection, in the form of a priority tree implemented as a heap
27  * (essentially Knuth's Algorithm 5.2.3H), but now we always use quicksort
28  * for run generation.
29  *
30  * The approximate amount of memory allowed for any one sort operation
31  * is specified in kilobytes by the caller (most pass work_mem). Initially,
32  * we absorb tuples and simply store them in an unsorted array as long as
33  * we haven't exceeded workMem. If we reach the end of the input without
34  * exceeding workMem, we sort the array using qsort() and subsequently return
35  * tuples just by scanning the tuple array sequentially. If we do exceed
36  * workMem, we begin to emit tuples into sorted runs in temporary tapes.
37  * When tuples are dumped in batch after quicksorting, we begin a new run
38  * with a new output tape. If we reach the max number of tapes, we write
39  * subsequent runs on the existing tapes in a round-robin fashion. We will
40  * need multiple merge passes to finish the merge in that case. After the
41  * end of the input is reached, we dump out remaining tuples in memory into
42  * a final run, then merge the runs.
43  *
44  * When merging runs, we use a heap containing just the frontmost tuple from
45  * each source run; we repeatedly output the smallest tuple and replace it
46  * with the next tuple from its source tape (if any). When the heap empties,
47  * the merge is complete. The basic merge algorithm thus needs very little
48  * memory --- only M tuples for an M-way merge, and M is constrained to a
49  * small number. However, we can still make good use of our full workMem
50  * allocation by pre-reading additional blocks from each source tape. Without
51  * prereading, our access pattern to the temporary file would be very erratic;
52  * on average we'd read one block from each of M source tapes during the same
53  * time that we're writing M blocks to the output tape, so there is no
54  * sequentiality of access at all, defeating the read-ahead methods used by
55  * most Unix kernels. Worse, the output tape gets written into a very random
56  * sequence of blocks of the temp file, ensuring that things will be even
57  * worse when it comes time to read that tape. A straightforward merge pass
58  * thus ends up doing a lot of waiting for disk seeks. We can improve matters
59  * by prereading from each source tape sequentially, loading about workMem/M
60  * bytes from each tape in turn, and making the sequential blocks immediately
61  * available for reuse. This approach helps to localize both read and write
62  * accesses. The pre-reading is handled by logtape.c; we just tell it how
63  * much memory to use for the buffers.
64  *
65  * In the current code we determine the number of input tapes M on the basis
66  * of workMem: we want workMem/M to be large enough that we read a fair
67  * amount of data each time we read from a tape, so as to maintain the
68  * locality of access described above. Nonetheless, with large workMem we
69  * can have many tapes. The logical "tapes" are implemented by logtape.c,
70  * which avoids space wastage by recycling disk space as soon as each block
71  * is read from its "tape".
72  *
73  * When the caller requests random access to the sort result, we form
74  * the final sorted run on a logical tape which is then "frozen", so
75  * that we can access it randomly. When the caller does not need random
76  * access, we return from tuplesort_performsort() as soon as we are down
77  * to one run per logical tape. The final merge is then performed
78  * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this
79  * saves one cycle of writing all the data out to disk and reading it in.
80  *
81  * This module supports parallel sorting. Parallel sorts involve coordination
82  * among one or more worker processes, and a leader process, each with its own
83  * tuplesort state. The leader process (or, more accurately, the
84  * Tuplesortstate associated with a leader process) creates a full tapeset
85  * consisting of worker tapes with one run to merge; a run for every
86  * worker process. This is then merged. Worker processes are guaranteed to
87  * produce exactly one output run from their partial input.
88  *
89  *
90  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
91  * Portions Copyright (c) 1994, Regents of the University of California
92  *
93  * IDENTIFICATION
94  * src/backend/utils/sort/tuplesort.c
95  *
96  *-------------------------------------------------------------------------
97  */
98 
99 #include "postgres.h"
100 
101 #include <limits.h>
102 
103 #include "access/hash.h"
104 #include "access/htup_details.h"
105 #include "access/nbtree.h"
106 #include "catalog/index.h"
107 #include "catalog/pg_am.h"
108 #include "commands/tablespace.h"
109 #include "executor/executor.h"
110 #include "miscadmin.h"
111 #include "pg_trace.h"
112 #include "utils/datum.h"
113 #include "utils/logtape.h"
114 #include "utils/lsyscache.h"
115 #include "utils/memutils.h"
116 #include "utils/pg_rusage.h"
117 #include "utils/rel.h"
118 #include "utils/sortsupport.h"
119 #include "utils/tuplesort.h"
120 
121 
122 /* sort-type codes for sort__start probes */
123 #define HEAP_SORT 0
124 #define INDEX_SORT 1
125 #define DATUM_SORT 2
126 #define CLUSTER_SORT 3
127 
128 /* Sort parallel code from state for sort__start probes */
129 #define PARALLEL_SORT(state) ((state)->shared == NULL ? 0 : \
130  (state)->worker >= 0 ? 1 : 2)
131 
132 /*
133  * Initial size of the memtuples array. We try to select this size so that
134  * the array doesn't exceed ALLOCSET_SEPARATE_THRESHOLD and so that the
135  * allocation overhead is kept low. However, we don't consider array sizes
136  * less than 1024.
137  *
138  */
139 #define INITIAL_MEMTUPSIZE Max(1024, \
140  ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1)
141 
142 /* GUC variables */
143 #ifdef TRACE_SORT
144 bool trace_sort = false;
145 #endif
146 
147 #ifdef DEBUG_BOUNDED_SORT
148 bool optimize_bounded_sort = true;
149 #endif
150 
151 
152 /*
153  * The objects we actually sort are SortTuple structs. These contain
154  * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple),
155  * which is a separate palloc chunk --- we assume it is just one chunk and
156  * can be freed by a simple pfree() (except during merge, when we use a
157  * simple slab allocator). SortTuples also contain the tuple's first key
158  * column in Datum/nullflag format, and a source/input tape number that
159  * tracks which tape each heap element/slot belongs to during merging.
160  *
161  * Storing the first key column lets us save heap_getattr or index_getattr
162  * calls during tuple comparisons. We could extract and save all the key
163  * columns not just the first, but this would increase code complexity and
164  * overhead, and wouldn't actually save any comparison cycles in the common
165  * case where the first key determines the comparison result. Note that
166  * for a pass-by-reference datatype, datum1 points into the "tuple" storage.
167  *
168  * There is one special case: when the sort support infrastructure provides an
169  * "abbreviated key" representation, where the key is (typically) a pass by
170  * value proxy for a pass by reference type. In this case, the abbreviated key
171  * is stored in datum1 in place of the actual first key column.
172  *
173  * When sorting single Datums, the data value is represented directly by
174  * datum1/isnull1 for pass by value types (or null values). If the datatype is
175  * pass-by-reference and isnull1 is false, then "tuple" points to a separately
176  * palloc'd data value, otherwise "tuple" is NULL. The value of datum1 is then
177  * either the same pointer as "tuple", or is an abbreviated key value as
178  * described above. Accordingly, "tuple" is always used in preference to
179  * datum1 as the authoritative value for pass-by-reference cases.
180  */
181 typedef struct
182 {
183  void *tuple; /* the tuple itself */
184  Datum datum1; /* value of first key column */
185  bool isnull1; /* is first key column NULL? */
186  int srctape; /* source tape number */
187 } SortTuple;
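/*
 * To illustrate how datum1/isnull1 and the "tuple" field cooperate, here is a
 * rough sketch of a comparator for the Datum case, in the spirit of
 * comparetup_datum() below (simplified; not the authoritative version):
 *
 *	compare = ApplySortComparator(a->datum1, a->isnull1,
 *								  b->datum1, b->isnull1, state->sortKeys);
 *	if (compare != 0)
 *		return compare;
 *
 *	if (state->sortKeys->abbrev_converter)
 *		compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple),
 *												a->isnull1,
 *												PointerGetDatum(b->tuple),
 *												b->isnull1,
 *												state->sortKeys);
 *	return compare;
 *
 * The fast path compares the cached (possibly abbreviated) first-key values;
 * only when abbreviated keys compare equal do we fall back to the
 * authoritative value, which for the Datum case lives in "tuple".
 */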
188 
189 /*
190  * During merge, we use a pre-allocated set of fixed-size slots to hold
191  * tuples, to avoid palloc/pfree overhead.
192  *
193  * Merge doesn't require a lot of memory, so we can afford to waste some,
194  * by using gratuitously-sized slots. If a tuple is larger than 1 kB, the
195  * palloc() overhead is not significant anymore.
196  *
197  * 'nextfree' is valid when this chunk is in the free list. When in use, the
198  * slot holds a tuple.
199  */
200 #define SLAB_SLOT_SIZE 1024
201 
202 typedef union SlabSlot
203 {
204  union SlabSlot *nextfree;
205  char buffer[SLAB_SLOT_SIZE];
206 } SlabSlot;
207 
208 /*
209  * Possible states of a Tuplesort object. These denote the states that
210  * persist between calls of Tuplesort routines.
211  */
212 typedef enum
213 {
214  TSS_INITIAL, /* Loading tuples; still within memory limit */
215  TSS_BOUNDED, /* Loading tuples into bounded-size heap */
216  TSS_BUILDRUNS, /* Loading tuples; writing to tape */
217  TSS_SORTEDINMEM, /* Sort completed entirely in memory */
218  TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */
219  TSS_FINALMERGE /* Performing final merge on-the-fly */
220 } TupSortStatus;
221 
222 /*
223  * Parameters for calculation of number of tapes to use --- see inittapes()
224  * and tuplesort_merge_order().
225  *
226  * In this calculation we assume that each tape will cost us about one block's
227  * worth of buffer space. This ignores the overhead of all the other data
228  * structures needed for each tape, but it's probably close enough.
229  *
230  * MERGE_BUFFER_SIZE is how much buffer space we'd like to allocate for each
231  * input tape, for pre-reading (see discussion at top of file). This is *in
232  * addition to* the 1 block already included in TAPE_BUFFER_OVERHEAD.
233  */
234 #define MINORDER 6 /* minimum merge order */
235 #define MAXORDER 500 /* maximum merge order */
236 #define TAPE_BUFFER_OVERHEAD BLCKSZ
237 #define MERGE_BUFFER_SIZE (BLCKSZ * 32)
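/*
 * As a rough illustration (not the exact formula; see tuplesort_merge_order()
 * for that), each input tape in a merge is charged about MERGE_BUFFER_SIZE of
 * preread buffer plus TAPE_BUFFER_OVERHEAD, so the merge order is roughly
 * workMem divided by that per-tape cost, clamped to [MINORDER, MAXORDER].
 * For example, with the default 8kB BLCKSZ and workMem = 64MB:
 *
 *	per-tape cost ~ 32 * 8kB + 8kB = 264kB
 *	merge order   ~ 65536kB / 264kB = ~248 tapes
 *
 * which falls comfortably inside the [MINORDER, MAXORDER] range.
 */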
238 
239 typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b,
240  Tuplesortstate *state);
241 
242 /*
243  * Private state of a Tuplesort operation.
244  */
245 struct Tuplesortstate
246 {
247  TupSortStatus status; /* enumerated value as shown above */
248  int nKeys; /* number of columns in sort key */
249  bool randomAccess; /* did caller request random access? */
250  bool bounded; /* did caller specify a maximum number of
251  * tuples to return? */
252  bool boundUsed; /* true if we made use of a bounded heap */
253  int bound; /* if bounded, the maximum number of tuples */
254  bool tuples; /* Can SortTuple.tuple ever be set? */
255  int64 availMem; /* remaining memory available, in bytes */
256  int64 allowedMem; /* total memory allowed, in bytes */
257  int maxTapes; /* max number of input tapes to merge in each
258  * pass */
259  int64 maxSpace; /* maximum amount of space occupied among sort
260  * of groups, either in-memory or on-disk */
261  bool isMaxSpaceDisk; /* true when maxSpace is value for on-disk
262  * space, false when it's value for in-memory
263  * space */
264  TupSortStatus maxSpaceStatus; /* sort status when maxSpace was reached */
265  MemoryContext maincontext; /* memory context for tuple sort metadata that
266  * persists across multiple batches */
267  MemoryContext sortcontext; /* memory context holding most sort data */
268  MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */
269  LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */
270 
271  /*
272  * These function pointers decouple the routines that must know what kind
273  * of tuple we are sorting from the routines that don't need to know it.
274  * They are set up by the tuplesort_begin_xxx routines.
275  *
276  * Function to compare two tuples; result is per qsort() convention, ie:
277  * <0, 0, >0 according as a<b, a=b, a>b. The API must match
278  * qsort_arg_comparator.
279  */
280  SortTupleComparator comparetup;
281 
282  /*
283  * Function to copy a supplied input tuple into palloc'd space and set up
284  * its SortTuple representation (ie, set tuple/datum1/isnull1). Also,
285  * state->availMem must be decreased by the amount of space used for the
286  * tuple copy (note the SortTuple struct itself is not counted).
287  */
288  void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup);
289 
290  /*
291  * Function to write a stored tuple onto tape. The representation of the
292  * tuple on tape need not be the same as it is in memory; requirements on
293  * the tape representation are given below. Unless the slab allocator is
294  * used, after writing the tuple, pfree() the out-of-line data (not the
295  * SortTuple struct!), and increase state->availMem by the amount of
296  * memory space thereby released.
297  */
298  void (*writetup) (Tuplesortstate *state, LogicalTape *tape,
299  SortTuple *stup);
300 
301  /*
302  * Function to read a stored tuple from tape back into memory. 'len' is
303  * the already-read length of the stored tuple. The tuple is allocated
304  * from the slab memory arena, or is palloc'd, see readtup_alloc().
305  */
306  void (*readtup) (Tuplesortstate *state, SortTuple *stup,
307  LogicalTape *tape, unsigned int len);
308 
309  /*
310  * This array holds the tuples now in sort memory. If we are in state
311  * INITIAL, the tuples are in no particular order; if we are in state
312  * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS
313  * and FINALMERGE, the tuples are organized in "heap" order per Algorithm
314  * H. In state SORTEDONTAPE, the array is not used.
315  */
316  SortTuple *memtuples; /* array of SortTuple structs */
317  int memtupcount; /* number of tuples currently present */
318  int memtupsize; /* allocated length of memtuples array */
319  bool growmemtuples; /* memtuples' growth still underway? */
320 
321  /*
322  * Memory for tuples is sometimes allocated using a simple slab allocator,
323  * rather than with palloc(). Currently, we switch to slab allocation
324  * when we start merging. Merging only needs to keep a small, fixed
325  * number of tuples in memory at any time, so we can avoid the
326  * palloc/pfree overhead by recycling a fixed number of fixed-size slots
327  * to hold the tuples.
328  *
329  * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE
330  * slots. The allocation is sized to have one slot per tape, plus one
331  * additional slot. We need that many slots to hold all the tuples kept
332  * in the heap during merge, plus the one we have last returned from the
333  * sort, with tuplesort_gettuple.
334  *
335  * Initially, all the slots are kept in a linked list of free slots. When
336  * a tuple is read from a tape, it is put into the next available slot, if
337  * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd
338  * instead.
339  *
340  * When we're done processing a tuple, we return the slot back to the free
341  * list, or pfree() if it was palloc'd. We know that a tuple was
342  * allocated from the slab, if its pointer value is between
343  * slabMemoryBegin and -End.
344  *
345  * When the slab allocator is used, the USEMEM/LACKMEM mechanism of
346  * tracking memory usage is not used.
347  */
348  bool slabAllocatorUsed;
349 
350  char *slabMemoryBegin; /* beginning of slab memory arena */
351  char *slabMemoryEnd; /* end of slab memory arena */
352  SlabSlot *slabFreeHead; /* head of free list */
353 
354  /* Memory used for input and output tape buffers. */
356 
357  /*
358  * When we return a tuple to the caller in tuplesort_gettuple_XXX, that
359  * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE
360  * modes), we remember the tuple in 'lastReturnedTuple', so that we can
361  * recycle the memory on next gettuple call.
362  */
363  void *lastReturnedTuple;
364 
365  /*
366  * While building initial runs, this is the current output run number.
367  * Afterwards, it is the number of initial runs we made.
368  */
369  int currentRun;
370 
371  /*
372  * Logical tapes, for merging.
373  *
374  * The initial runs are written in the output tapes. In each merge pass,
375  * the output tapes of the previous pass become the input tapes, and new
376  * output tapes are created as needed. When nInputTapes equals
377  * nInputRuns, there is only one merge pass left.
378  */
382 
386 
387  LogicalTape *destTape; /* current output tape */
388 
389  /*
390  * These variables are used after completion of sorting to keep track of
391  * the next tuple to return. (In the tape case, the tape's current read
392  * position is also critical state.)
393  */
394  LogicalTape *result_tape; /* actual tape of finished output */
395  int current; /* array index (only used if SORTEDINMEM) */
396  bool eof_reached; /* reached EOF (needed for cursors) */
397 
398  /* markpos_xxx holds marked position for mark and restore */
399  long markpos_block; /* tape block# (only used if SORTEDONTAPE) */
400  int markpos_offset; /* saved "current", or offset in tape block */
401  bool markpos_eof; /* saved "eof_reached" */
402 
403  /*
404  * These variables are used during parallel sorting.
405  *
406  * worker is our worker identifier. Follows the general convention that
407  * -1 value relates to a leader tuplesort, and values >= 0 worker
408  * tuplesorts. (-1 can also be a serial tuplesort.)
409  *
410  * shared is mutable shared memory state, which is used to coordinate
411  * parallel sorts.
412  *
413  * nParticipants is the number of worker Tuplesortstates known by the
414  * leader to have actually been launched, which implies that they must
415  * finish a run that the leader needs to merge. Typically includes a
416  * worker state held by the leader process itself. Set in the leader
417  * Tuplesortstate only.
418  */
419  int worker;
420  Sharedsort *shared;
421  int nParticipants;
422 
423  /*
424  * The sortKeys variable is used by every case other than the hash index
425  * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the
426  * MinimalTuple and CLUSTER routines, though.
427  */
428  TupleDesc tupDesc;
429  SortSupport sortKeys; /* array of length nKeys */
430 
431  /*
432  * This variable is shared by the single-key MinimalTuple case and the
433  * Datum case (which both use qsort_ssup()). Otherwise it's NULL.
434  */
435  SortSupport onlyKey;
436 
437  /*
438  * Additional state for managing "abbreviated key" sortsupport routines
439  * (which currently may be used by all cases except the hash index case).
440  * Tracks the intervals at which the optimization's effectiveness is
441  * tested.
442  */
443  int64 abbrevNext; /* Tuple # at which to next check
444  * applicability */
445 
446  /*
447  * These variables are specific to the CLUSTER case; they are set by
448  * tuplesort_begin_cluster.
449  */
450  IndexInfo *indexInfo; /* info about index being used for reference */
451  EState *estate; /* for evaluating index expressions */
452 
453  /*
454  * These variables are specific to the IndexTuple case; they are set by
455  * tuplesort_begin_index_xxx and used only by the IndexTuple routines.
456  */
457  Relation heapRel; /* table the index is being built on */
458  Relation indexRel; /* index being built */
459 
460  /* These are specific to the index_btree subcase: */
461  bool enforceUnique; /* complain if we find duplicate tuples */
462 
463  /* These are specific to the index_hash subcase: */
464  uint32 high_mask; /* masks for sortable part of hash code */
465  uint32 low_mask;
466  uint32 max_buckets;
467 
468  /*
469  * These variables are specific to the Datum case; they are set by
470  * tuplesort_begin_datum and used only by the DatumTuple routines.
471  */
472  Oid datumType;
473  /* we need typelen in order to know how to copy the Datums. */
474  int datumTypeLen;
475 
476  /*
477  * Resource snapshot for time of sort start.
478  */
479 #ifdef TRACE_SORT
480  PGRUsage ru_start;
481 #endif
482 };
483 
484 /*
485  * Private mutable state of a parallel tuplesort operation. This is allocated
486  * in shared memory.
487  */
488 struct Sharedsort
489 {
490  /* mutex protects all fields prior to tapes */
491  slock_t mutex;
492 
493  /*
494  * currentWorker generates ordinal identifier numbers for parallel sort
495  * workers. These start from 0, and are always gapless.
496  *
497  * Workers increment workersFinished to indicate having finished. If this
498  * is equal to state.nParticipants within the leader, leader is ready to
499  * merge worker runs.
500  */
501  int currentWorker;
502  int workersFinished;
503 
504  /* Temporary file space */
506 
507  /* Size of tapes flexible array */
508  int nTapes;
509 
510  /*
511  * Tapes array used by workers to report back information needed by the
512  * leader to concatenate all worker tapes into one for merging
513  */
514  TapeShare tapes[FLEXIBLE_ARRAY_MEMBER];
515 };
516 
517 /*
518  * Is the given tuple allocated from the slab memory arena?
519  */
520 #define IS_SLAB_SLOT(state, tuple) \
521  ((char *) (tuple) >= (state)->slabMemoryBegin && \
522  (char *) (tuple) < (state)->slabMemoryEnd)
523 
524 /*
525  * Return the given tuple to the slab memory free list, or free it
526  * if it was palloc'd.
527  */
528 #define RELEASE_SLAB_SLOT(state, tuple) \
529  do { \
530  SlabSlot *buf = (SlabSlot *) tuple; \
531  \
532  if (IS_SLAB_SLOT((state), buf)) \
533  { \
534  buf->nextfree = (state)->slabFreeHead; \
535  (state)->slabFreeHead = buf; \
536  } else \
537  pfree(buf); \
538  } while(0)
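/*
 * The acquisition side of the slab arena works the other way around: pop the
 * next slot off the free list if the tuple fits, else fall back to an
 * ordinary allocation.  A rough sketch (readtup_alloc() below is the real
 * version):
 *
 *	if (tuplen > SLAB_SLOT_SIZE || state->slabFreeHead == NULL)
 *		return MemoryContextAlloc(state->sortcontext, tuplen);
 *	else
 *	{
 *		SlabSlot   *buf = state->slabFreeHead;
 *
 *		state->slabFreeHead = buf->nextfree;
 *		return buf;
 *	}
 */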
539 
540 #define COMPARETUP(state,a,b) ((*(state)->comparetup) (a, b, state))
541 #define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup))
542 #define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup))
543 #define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len))
544 #define LACKMEM(state) ((state)->availMem < 0 && !(state)->slabAllocatorUsed)
545 #define USEMEM(state,amt) ((state)->availMem -= (amt))
546 #define FREEMEM(state,amt) ((state)->availMem += (amt))
547 #define SERIAL(state) ((state)->shared == NULL)
548 #define WORKER(state) ((state)->shared && (state)->worker != -1)
549 #define LEADER(state) ((state)->shared && (state)->worker == -1)
550 
551 /*
552  * NOTES about on-tape representation of tuples:
553  *
554  * We require the first "unsigned int" of a stored tuple to be the total size
555  * on-tape of the tuple, including itself (so it is never zero; an all-zero
556  * unsigned int is used to delimit runs). The remainder of the stored tuple
557  * may or may not match the in-memory representation of the tuple ---
558  * any conversion needed is the job of the writetup and readtup routines.
559  *
560  * If state->randomAccess is true, then the stored representation of the
561  * tuple must be followed by another "unsigned int" that is a copy of the
562  * length --- so the total tape space used is actually sizeof(unsigned int)
563  * more than the stored length value. This allows read-backwards. When
564  * randomAccess is not true, the write/read routines may omit the extra
565  * length word.
566  *
567  * writetup is expected to write both length words as well as the tuple
568  * data. When readtup is called, the tape is positioned just after the
569  * front length word; readtup must read the tuple data and advance past
570  * the back length word (if present).
571  *
572  * The write/read routines can make use of the tuple description data
573  * stored in the Tuplesortstate record, if needed. They are also expected
574  * to adjust state->availMem by the amount of memory space (not tape space!)
575  * released or consumed. There is no error return from either writetup
576  * or readtup; they should ereport() on failure.
577  *
578  *
579  * NOTES about memory consumption calculations:
580  *
581  * We count space allocated for tuples against the workMem limit, plus
582  * the space used by the variable-size memtuples array. Fixed-size space
583  * is not counted; it's small enough to not be interesting.
584  *
585  * Note that we count actual space used (as shown by GetMemoryChunkSpace)
586  * rather than the originally-requested size. This is important since
587  * palloc can add substantial overhead. It's not a complete answer since
588  * we won't count any wasted space in palloc allocation blocks, but it's
589  * a lot better than what we were doing before 7.3. As of 9.6, a
590  * separate memory context is used for caller passed tuples. Resetting
591  * it at certain key increments significantly ameliorates fragmentation.
592  * Note that this places a responsibility on copytup routines to use the
593  * correct memory context for these tuples (and to not use the reset
594  * context for anything whose lifetime needs to span multiple external
595  * sort runs). readtup routines use the slab allocator (they cannot use
596  * the reset context because it gets deleted at the point that merging
597  * begins).
598  */
599 
600 /* When using this macro, beware of double evaluation of len */
601 #define LogicalTapeReadExact(tape, ptr, len) \
602  do { \
603  if (LogicalTapeRead(tape, ptr, len) != (size_t) (len)) \
604  elog(ERROR, "unexpected end of data"); \
605  } while(0)
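/*
 * To make the length-word protocol described in the NOTES above concrete,
 * here is a rough sketch of a matching writetup/readtup pair for a
 * hypothetical fixed-size payload ("FOO_SIZE" is made up for the example, and
 * the sketch assumes logtape.c's LogicalTapeWrite(tape, ptr, len) interface
 * used by the real routines).  Real writetup routines additionally release
 * the in-memory copy and adjust availMem as described above.
 *
 *	static void
 *	writetup_foo(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
 *	{
 *		unsigned int tuplen = FOO_SIZE + sizeof(unsigned int);
 *
 *		LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
 *		LogicalTapeWrite(tape, stup->tuple, FOO_SIZE);
 *		if (state->randomAccess)
 *			LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
 *	}
 *
 *	static void
 *	readtup_foo(Tuplesortstate *state, SortTuple *stup,
 *				LogicalTape *tape, unsigned int len)
 *	{
 *		unsigned int tuplen = len - sizeof(unsigned int);
 *		void	   *payload = readtup_alloc(state, tuplen);
 *
 *		LogicalTapeReadExact(tape, payload, tuplen);
 *		if (state->randomAccess)
 *			LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen));
 *		stup->tuple = payload;
 *	}
 *
 * The trailing length word is written and consumed only in the randomAccess
 * case, matching the read-backwards requirement described above.
 */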
606 
607 
608 static Tuplesortstate *tuplesort_begin_common(int workMem,
609  SortCoordinate coordinate,
610  bool randomAccess);
611 static void tuplesort_begin_batch(Tuplesortstate *state);
612 static void puttuple_common(Tuplesortstate *state, SortTuple *tuple);
614 static void inittapes(Tuplesortstate *state, bool mergeruns);
615 static void inittapestate(Tuplesortstate *state, int maxTapes);
616 static void selectnewtape(Tuplesortstate *state);
617 static void init_slab_allocator(Tuplesortstate *state, int numSlots);
618 static void mergeruns(Tuplesortstate *state);
619 static void mergeonerun(Tuplesortstate *state);
620 static void beginmerge(Tuplesortstate *state);
621 static bool mergereadnext(Tuplesortstate *state, LogicalTape *srcTape, SortTuple *stup);
622 static void dumptuples(Tuplesortstate *state, bool alltuples);
630 static unsigned int getlen(LogicalTape *tape, bool eofOK);
631 static void markrunend(LogicalTape *tape);
632 static void *readtup_alloc(Tuplesortstate *state, Size tuplen);
633 static int comparetup_heap(const SortTuple *a, const SortTuple *b,
634  Tuplesortstate *state);
635 static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup);
636 static void writetup_heap(Tuplesortstate *state, LogicalTape *tape,
637  SortTuple *stup);
638 static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
639  LogicalTape *tape, unsigned int len);
640 static int comparetup_cluster(const SortTuple *a, const SortTuple *b,
641  Tuplesortstate *state);
642 static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup);
643 static void writetup_cluster(Tuplesortstate *state, LogicalTape *tape,
644  SortTuple *stup);
645 static void readtup_cluster(Tuplesortstate *state, SortTuple *stup,
646  LogicalTape *tape, unsigned int len);
647 static int comparetup_index_btree(const SortTuple *a, const SortTuple *b,
648  Tuplesortstate *state);
649 static int comparetup_index_hash(const SortTuple *a, const SortTuple *b,
650  Tuplesortstate *state);
651 static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup);
652 static void writetup_index(Tuplesortstate *state, LogicalTape *tape,
653  SortTuple *stup);
654 static void readtup_index(Tuplesortstate *state, SortTuple *stup,
655  LogicalTape *tape, unsigned int len);
656 static int comparetup_datum(const SortTuple *a, const SortTuple *b,
657  Tuplesortstate *state);
658 static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup);
659 static void writetup_datum(Tuplesortstate *state, LogicalTape *tape,
660  SortTuple *stup);
661 static void readtup_datum(Tuplesortstate *state, SortTuple *stup,
662  LogicalTape *tape, unsigned int len);
667 static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup);
668 static void tuplesort_free(Tuplesortstate *state);
670 
671 /*
672  * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts
673  * any variant of SortTuples, using the appropriate comparetup function.
674  * qsort_ssup() is specialized for the case where the comparetup function
675  * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts
676  * and Datum sorts.
677  */
678 
679 #define ST_SORT qsort_tuple
680 #define ST_ELEMENT_TYPE SortTuple
681 #define ST_COMPARE_RUNTIME_POINTER
682 #define ST_COMPARE_ARG_TYPE Tuplesortstate
683 #define ST_CHECK_FOR_INTERRUPTS
684 #define ST_SCOPE static
685 #define ST_DECLARE
686 #define ST_DEFINE
687 #include "lib/sort_template.h"
688 
689 #define ST_SORT qsort_ssup
690 #define ST_ELEMENT_TYPE SortTuple
691 #define ST_COMPARE(a, b, ssup) \
692  ApplySortComparator((a)->datum1, (a)->isnull1, \
693  (b)->datum1, (b)->isnull1, (ssup))
694 #define ST_COMPARE_ARG_TYPE SortSupportData
695 #define ST_CHECK_FOR_INTERRUPTS
696 #define ST_SCOPE static
697 #define ST_DEFINE
698 #include "lib/sort_template.h"
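/*
 * For illustration, the internal sort step (tuplesort_sort_memtuples())
 * dispatches to these specializations roughly like this (a simplified
 * sketch):
 *
 *	if (state->onlyKey != NULL)
 *		qsort_ssup(state->memtuples, state->memtupcount, state->onlyKey);
 *	else
 *		qsort_tuple(state->memtuples, state->memtupcount,
 *					state->comparetup, state);
 */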
699 
700 /*
701  * tuplesort_begin_xxx
702  *
703  * Initialize for a tuple sort operation.
704  *
705  * After calling tuplesort_begin, the caller should call tuplesort_putXXX
706  * zero or more times, then call tuplesort_performsort when all the tuples
707  * have been supplied. After performsort, retrieve the tuples in sorted
708  * order by calling tuplesort_getXXX until it returns false/NULL. (If random
709  * access was requested, rescan, markpos, and restorepos can also be called.)
710  * Call tuplesort_end to terminate the operation and release memory/disk space.
711  *
712  * Each variant of tuplesort_begin has a workMem parameter specifying the
713  * maximum number of kilobytes of RAM to use before spilling data to disk.
714  * (The normal value of this parameter is work_mem, but some callers use
715  * other values.) Each variant also has a randomAccess parameter specifying
716  * whether the caller needs non-sequential access to the sort result.
717  */
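/*
 * For illustration, a minimal serial Datum-sort caller might look like this
 * (a sketch only: INT4OID and Int4LessOperator are the usual catalog
 * constants for sorting int4 values, and error handling is omitted):
 *
 *	Tuplesortstate *sortstate;
 *	Datum		value;
 *	bool		isnull;
 *
 *	sortstate = tuplesort_begin_datum(INT4OID, Int4LessOperator,
 *									  InvalidOid, false,
 *									  work_mem, NULL, false);
 *	... for each input value x ...
 *		tuplesort_putdatum(sortstate, Int32GetDatum(x), false);
 *
 *	tuplesort_performsort(sortstate);
 *
 *	while (tuplesort_getdatum(sortstate, true, &value, &isnull, NULL))
 *		... use DatumGetInt32(value) ...
 *
 *	tuplesort_end(sortstate);
 */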
718 
719 static Tuplesortstate *
720 tuplesort_begin_common(int workMem, SortCoordinate coordinate,
721  bool randomAccess)
722 {
723  Tuplesortstate *state;
724  MemoryContext maincontext;
725  MemoryContext sortcontext;
726  MemoryContext oldcontext;
727 
728  /* See leader_takeover_tapes() remarks on randomAccess support */
729  if (coordinate && randomAccess)
730  elog(ERROR, "random access disallowed under parallel sort");
731 
732  /*
733  * Memory context surviving tuplesort_reset. This memory context holds
734  * data which is useful to keep while sorting multiple similar batches.
735  */
736  maincontext = AllocSetContextCreate(CurrentMemoryContext,
737  "TupleSort main",
738  ALLOCSET_DEFAULT_SIZES);
739 
740  /*
741  * Create a working memory context for one sort operation. The content of
742  * this context is deleted by tuplesort_reset.
743  */
744  sortcontext = AllocSetContextCreate(maincontext,
745  "TupleSort sort",
746  ALLOCSET_DEFAULT_SIZES);
747 
748  /*
749  * Additionally a working memory context for tuples is setup in
750  * tuplesort_begin_batch.
751  */
752 
753  /*
754  * Make the Tuplesortstate within the per-sortstate context. This way, we
755  * don't need a separate pfree() operation for it at shutdown.
756  */
757  oldcontext = MemoryContextSwitchTo(maincontext);
758 
759  state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate));
760 
761 #ifdef TRACE_SORT
762  if (trace_sort)
763  pg_rusage_init(&state->ru_start);
764 #endif
765 
766  state->randomAccess = randomAccess;
767  state->tuples = true;
768 
769  /*
770  * workMem is forced to be at least 64KB, the current minimum valid value
771  * for the work_mem GUC. This is a defense against parallel sort callers
772  * that divide out memory among many workers in a way that leaves each
773  * with very little memory.
774  */
775  state->allowedMem = Max(workMem, 64) * (int64) 1024;
776  state->sortcontext = sortcontext;
777  state->maincontext = maincontext;
778 
779  /*
780  * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
781  * see comments in grow_memtuples().
782  */
783  state->memtupsize = INITIAL_MEMTUPSIZE;
784  state->memtuples = NULL;
785 
786  /*
787  * After all of the other non-parallel-related state is set up, we set up
788  * all of the state needed for each batch.
789  */
790  tuplesort_begin_batch(state);
791 
792  /*
793  * Initialize parallel-related state based on coordination information
794  * from caller
795  */
796  if (!coordinate)
797  {
798  /* Serial sort */
799  state->shared = NULL;
800  state->worker = -1;
801  state->nParticipants = -1;
802  }
803  else if (coordinate->isWorker)
804  {
805  /* Parallel worker produces exactly one final run from all input */
806  state->shared = coordinate->sharedsort;
807  state->worker = worker_get_identifier(state);
808  state->nParticipants = -1;
809  }
810  else
811  {
812  /* Parallel leader state only used for final merge */
813  state->shared = coordinate->sharedsort;
814  state->worker = -1;
815  state->nParticipants = coordinate->nParticipants;
816  Assert(state->nParticipants >= 1);
817  }
818 
819  MemoryContextSwitchTo(oldcontext);
820 
821  return state;
822 }
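/*
 * For illustration, the three cases above correspond to how a caller fills in
 * the SortCoordinate argument (a rough sketch; parallel btree index builds
 * are the main in-tree user of this interface):
 *
 *	serial sort:	pass coordinate = NULL
 *
 *	worker:			coordinate->isWorker = true;
 *					coordinate->sharedsort = <shared state set up by leader>;
 *					randomAccess must be false
 *
 *	leader:			coordinate->isWorker = false;
 *					coordinate->nParticipants = <number of launched workers>;
 *					coordinate->sharedsort = <same shared state>;
 *					randomAccess must be false
 */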
823 
824 /*
825  * tuplesort_begin_batch
826  *
827  * Set up, or reset, all state needed for processing a new set of tuples with this
828  * sort state. Called both from tuplesort_begin_common (the first time sorting
829  * with this sort state) and tuplesort_reset (for subsequent usages).
830  */
831 static void
832 tuplesort_begin_batch(Tuplesortstate *state)
833 {
834  MemoryContext oldcontext;
835 
836  oldcontext = MemoryContextSwitchTo(state->maincontext);
837 
838  /*
839  * Caller tuple (e.g. IndexTuple) memory context.
840  *
841  * A dedicated child context used exclusively for caller passed tuples
842  * eases memory management. Resetting at key points reduces
843  * fragmentation. Note that the memtuples array of SortTuples is allocated
844  * in the parent context, not this context, because there is no need to
845  * free memtuples early.
846  */
847  state->tuplecontext = AllocSetContextCreate(state->sortcontext,
848  "Caller tuples",
849  ALLOCSET_DEFAULT_SIZES);
850 
851  state->status = TSS_INITIAL;
852  state->bounded = false;
853  state->boundUsed = false;
854 
855  state->availMem = state->allowedMem;
856 
857  state->tapeset = NULL;
858 
859  state->memtupcount = 0;
860 
861  /*
862  * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
863  * see comments in grow_memtuples().
864  */
865  state->growmemtuples = true;
866  state->slabAllocatorUsed = false;
867  if (state->memtuples != NULL && state->memtupsize != INITIAL_MEMTUPSIZE)
868  {
869  pfree(state->memtuples);
870  state->memtuples = NULL;
871  state->memtupsize = INITIAL_MEMTUPSIZE;
872  }
873  if (state->memtuples == NULL)
874  {
875  state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple));
876  USEMEM(state, GetMemoryChunkSpace(state->memtuples));
877  }
878 
879  /* workMem must be large enough for the minimal memtuples array */
880  if (LACKMEM(state))
881  elog(ERROR, "insufficient memory allowed for sort");
882 
883  state->currentRun = 0;
884 
885  /*
886  * Tape variables (inputTapes, outputTapes, etc.) will be initialized by
887  * inittapes(), if needed.
888  */
889 
890  state->result_tape = NULL; /* flag that result tape has not been formed */
891 
892  MemoryContextSwitchTo(oldcontext);
893 }
894 
895 Tuplesortstate *
896 tuplesort_begin_heap(TupleDesc tupDesc,
897  int nkeys, AttrNumber *attNums,
898  Oid *sortOperators, Oid *sortCollations,
899  bool *nullsFirstFlags,
900  int workMem, SortCoordinate coordinate, bool randomAccess)
901 {
902  Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
903  randomAccess);
904  MemoryContext oldcontext;
905  int i;
906 
907  oldcontext = MemoryContextSwitchTo(state->maincontext);
908 
909  AssertArg(nkeys > 0);
910 
911 #ifdef TRACE_SORT
912  if (trace_sort)
913  elog(LOG,
914  "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
915  nkeys, workMem, randomAccess ? 't' : 'f');
916 #endif
917 
918  state->nKeys = nkeys;
919 
920  TRACE_POSTGRESQL_SORT_START(HEAP_SORT,
921  false, /* no unique check */
922  nkeys,
923  workMem,
924  randomAccess,
925  PARALLEL_SORT(state));
926 
927  state->comparetup = comparetup_heap;
928  state->copytup = copytup_heap;
929  state->writetup = writetup_heap;
930  state->readtup = readtup_heap;
931 
932  state->tupDesc = tupDesc; /* assume we need not copy tupDesc */
933  state->abbrevNext = 10;
934 
935  /* Prepare SortSupport data for each column */
936  state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));
937 
938  for (i = 0; i < nkeys; i++)
939  {
940  SortSupport sortKey = state->sortKeys + i;
941 
942  AssertArg(attNums[i] != 0);
943  AssertArg(sortOperators[i] != 0);
944 
945  sortKey->ssup_cxt = CurrentMemoryContext;
946  sortKey->ssup_collation = sortCollations[i];
947  sortKey->ssup_nulls_first = nullsFirstFlags[i];
948  sortKey->ssup_attno = attNums[i];
949  /* Convey if abbreviation optimization is applicable in principle */
950  sortKey->abbreviate = (i == 0);
951 
952  PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
953  }
954 
955  /*
956  * The "onlyKey" optimization cannot be used with abbreviated keys, since
957  * tie-breaker comparisons may be required. Typically, the optimization
958  * is only of value to pass-by-value types anyway, whereas abbreviated
959  * keys are typically only of value to pass-by-reference types.
960  */
961  if (nkeys == 1 && !state->sortKeys->abbrev_converter)
962  state->onlyKey = state->sortKeys;
963 
964  MemoryContextSwitchTo(oldcontext);
965 
966  return state;
967 }
968 
969 Tuplesortstate *
970 tuplesort_begin_cluster(TupleDesc tupDesc,
971  Relation indexRel,
972  int workMem,
973  SortCoordinate coordinate, bool randomAccess)
974 {
975  Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
976  randomAccess);
977  BTScanInsert indexScanKey;
978  MemoryContext oldcontext;
979  int i;
980 
981  Assert(indexRel->rd_rel->relam == BTREE_AM_OID);
982 
983  oldcontext = MemoryContextSwitchTo(state->maincontext);
984 
985 #ifdef TRACE_SORT
986  if (trace_sort)
987  elog(LOG,
988  "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
990  workMem, randomAccess ? 't' : 'f');
991 #endif
992 
993  state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel);
994 
995  TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT,
996  false, /* no unique check */
997  state->nKeys,
998  workMem,
999  randomAccess,
1000  PARALLEL_SORT(state));
1001 
1002  state->comparetup = comparetup_cluster;
1003  state->copytup = copytup_cluster;
1004  state->writetup = writetup_cluster;
1005  state->readtup = readtup_cluster;
1006  state->abbrevNext = 10;
1007 
1008  state->indexInfo = BuildIndexInfo(indexRel);
1009 
1010  state->tupDesc = tupDesc; /* assume we need not copy tupDesc */
1011 
1012  indexScanKey = _bt_mkscankey(indexRel, NULL);
1013 
1014  if (state->indexInfo->ii_Expressions != NULL)
1015  {
1016  TupleTableSlot *slot;
1017  ExprContext *econtext;
1018 
1019  /*
1020  * We will need to use FormIndexDatum to evaluate the index
1021  * expressions. To do that, we need an EState, as well as a
1022  * TupleTableSlot to put the table tuples into. The econtext's
1023  * scantuple has to point to that slot, too.
1024  */
1025  state->estate = CreateExecutorState();
1026  slot = MakeSingleTupleTableSlot(tupDesc, &TTSOpsHeapTuple);
1027  econtext = GetPerTupleExprContext(state->estate);
1028  econtext->ecxt_scantuple = slot;
1029  }
1030 
1031  /* Prepare SortSupport data for each column */
1032  state->sortKeys = (SortSupport) palloc0(state->nKeys *
1033  sizeof(SortSupportData));
1034 
1035  for (i = 0; i < state->nKeys; i++)
1036  {
1037  SortSupport sortKey = state->sortKeys + i;
1038  ScanKey scanKey = indexScanKey->scankeys + i;
1039  int16 strategy;
1040 
1041  sortKey->ssup_cxt = CurrentMemoryContext;
1042  sortKey->ssup_collation = scanKey->sk_collation;
1043  sortKey->ssup_nulls_first =
1044  (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1045  sortKey->ssup_attno = scanKey->sk_attno;
1046  /* Convey if abbreviation optimization is applicable in principle */
1047  sortKey->abbreviate = (i == 0);
1048 
1049  AssertState(sortKey->ssup_attno != 0);
1050 
1051  strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
1052  BTGreaterStrategyNumber : BTLessStrategyNumber;
1053 
1054  PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey);
1055  }
1056 
1057  pfree(indexScanKey);
1058 
1059  MemoryContextSwitchTo(oldcontext);
1060 
1061  return state;
1062 }
1063 
1064 Tuplesortstate *
1065 tuplesort_begin_index_btree(Relation heapRel,
1066  Relation indexRel,
1067  bool enforceUnique,
1068  int workMem,
1069  SortCoordinate coordinate,
1070  bool randomAccess)
1071 {
1072  Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
1073  randomAccess);
1074  BTScanInsert indexScanKey;
1075  MemoryContext oldcontext;
1076  int i;
1077 
1078  oldcontext = MemoryContextSwitchTo(state->maincontext);
1079 
1080 #ifdef TRACE_SORT
1081  if (trace_sort)
1082  elog(LOG,
1083  "begin index sort: unique = %c, workMem = %d, randomAccess = %c",
1084  enforceUnique ? 't' : 'f',
1085  workMem, randomAccess ? 't' : 'f');
1086 #endif
1087 
1088  state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel);
1089 
1090  TRACE_POSTGRESQL_SORT_START(INDEX_SORT,
1091  enforceUnique,
1092  state->nKeys,
1093  workMem,
1094  randomAccess,
1095  PARALLEL_SORT(state));
1096 
1097  state->comparetup = comparetup_index_btree;
1098  state->copytup = copytup_index;
1099  state->writetup = writetup_index;
1100  state->readtup = readtup_index;
1101  state->abbrevNext = 10;
1102 
1103  state->heapRel = heapRel;
1104  state->indexRel = indexRel;
1105  state->enforceUnique = enforceUnique;
1106 
1107  indexScanKey = _bt_mkscankey(indexRel, NULL);
1108 
1109  /* Prepare SortSupport data for each column */
1110  state->sortKeys = (SortSupport) palloc0(state->nKeys *
1111  sizeof(SortSupportData));
1112 
1113  for (i = 0; i < state->nKeys; i++)
1114  {
1115  SortSupport sortKey = state->sortKeys + i;
1116  ScanKey scanKey = indexScanKey->scankeys + i;
1117  int16 strategy;
1118 
1119  sortKey->ssup_cxt = CurrentMemoryContext;
1120  sortKey->ssup_collation = scanKey->sk_collation;
1121  sortKey->ssup_nulls_first =
1122  (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1123  sortKey->ssup_attno = scanKey->sk_attno;
1124  /* Convey if abbreviation optimization is applicable in principle */
1125  sortKey->abbreviate = (i == 0);
1126 
1127  AssertState(sortKey->ssup_attno != 0);
1128 
1129  strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
1130  BTGreaterStrategyNumber : BTLessStrategyNumber;
1131 
1132  PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey);
1133  }
1134 
1135  pfree(indexScanKey);
1136 
1137  MemoryContextSwitchTo(oldcontext);
1138 
1139  return state;
1140 }
1141 
1142 Tuplesortstate *
1143 tuplesort_begin_index_hash(Relation heapRel,
1144  Relation indexRel,
1145  uint32 high_mask,
1146  uint32 low_mask,
1147  uint32 max_buckets,
1148  int workMem,
1149  SortCoordinate coordinate,
1150  bool randomAccess)
1151 {
1152  Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
1153  randomAccess);
1154  MemoryContext oldcontext;
1155 
1156  oldcontext = MemoryContextSwitchTo(state->maincontext);
1157 
1158 #ifdef TRACE_SORT
1159  if (trace_sort)
1160  elog(LOG,
1161  "begin index sort: high_mask = 0x%x, low_mask = 0x%x, "
1162  "max_buckets = 0x%x, workMem = %d, randomAccess = %c",
1163  high_mask,
1164  low_mask,
1165  max_buckets,
1166  workMem, randomAccess ? 't' : 'f');
1167 #endif
1168 
1169  state->nKeys = 1; /* Only one sort column, the hash code */
1170 
1171  state->comparetup = comparetup_index_hash;
1172  state->copytup = copytup_index;
1173  state->writetup = writetup_index;
1174  state->readtup = readtup_index;
1175 
1176  state->heapRel = heapRel;
1177  state->indexRel = indexRel;
1178 
1179  state->high_mask = high_mask;
1180  state->low_mask = low_mask;
1181  state->max_buckets = max_buckets;
1182 
1183  MemoryContextSwitchTo(oldcontext);
1184 
1185  return state;
1186 }
1187 
1188 Tuplesortstate *
1189 tuplesort_begin_index_gist(Relation heapRel,
1190  Relation indexRel,
1191  int workMem,
1192  SortCoordinate coordinate,
1193  bool randomAccess)
1194 {
1195  Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
1196  randomAccess);
1197  MemoryContext oldcontext;
1198  int i;
1199 
1200  oldcontext = MemoryContextSwitchTo(state->sortcontext);
1201 
1202 #ifdef TRACE_SORT
1203  if (trace_sort)
1204  elog(LOG,
1205  "begin index sort: workMem = %d, randomAccess = %c",
1206  workMem, randomAccess ? 't' : 'f');
1207 #endif
1208 
1209  state->nKeys = IndexRelationGetNumberOfKeyAttributes(indexRel);
1210 
1212  state->copytup = copytup_index;
1213  state->writetup = writetup_index;
1214  state->readtup = readtup_index;
1215 
1216  state->heapRel = heapRel;
1217  state->indexRel = indexRel;
1218 
1219  /* Prepare SortSupport data for each column */
1220  state->sortKeys = (SortSupport) palloc0(state->nKeys *
1221  sizeof(SortSupportData));
1222 
1223  for (i = 0; i < state->nKeys; i++)
1224  {
1225  SortSupport sortKey = state->sortKeys + i;
1226 
1227  sortKey->ssup_cxt = CurrentMemoryContext;
1228  sortKey->ssup_collation = indexRel->rd_indcollation[i];
1229  sortKey->ssup_nulls_first = false;
1230  sortKey->ssup_attno = i + 1;
1231  /* Convey if abbreviation optimization is applicable in principle */
1232  sortKey->abbreviate = (i == 0);
1233 
1234  AssertState(sortKey->ssup_attno != 0);
1235 
1236  /* Look for a sort support function */
1237  PrepareSortSupportFromGistIndexRel(indexRel, sortKey);
1238  }
1239 
1240  MemoryContextSwitchTo(oldcontext);
1241 
1242  return state;
1243 }
1244 
1245 Tuplesortstate *
1246 tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation,
1247  bool nullsFirstFlag, int workMem,
1248  SortCoordinate coordinate, bool randomAccess)
1249 {
1250  Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
1251  randomAccess);
1252  MemoryContext oldcontext;
1253  int16 typlen;
1254  bool typbyval;
1255 
1256  oldcontext = MemoryContextSwitchTo(state->maincontext);
1257 
1258 #ifdef TRACE_SORT
1259  if (trace_sort)
1260  elog(LOG,
1261  "begin datum sort: workMem = %d, randomAccess = %c",
1262  workMem, randomAccess ? 't' : 'f');
1263 #endif
1264 
1265  state->nKeys = 1; /* always a one-column sort */
1266 
1267  TRACE_POSTGRESQL_SORT_START(DATUM_SORT,
1268  false, /* no unique check */
1269  1,
1270  workMem,
1271  randomAccess,
1272  PARALLEL_SORT(state));
1273 
1274  state->comparetup = comparetup_datum;
1275  state->copytup = copytup_datum;
1276  state->writetup = writetup_datum;
1277  state->readtup = readtup_datum;
1278  state->abbrevNext = 10;
1279 
1280  state->datumType = datumType;
1281 
1282  /* lookup necessary attributes of the datum type */
1283  get_typlenbyval(datumType, &typlen, &typbyval);
1284  state->datumTypeLen = typlen;
1285  state->tuples = !typbyval;
1286 
1287  /* Prepare SortSupport data */
1288  state->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData));
1289 
1290  state->sortKeys->ssup_cxt = CurrentMemoryContext;
1291  state->sortKeys->ssup_collation = sortCollation;
1292  state->sortKeys->ssup_nulls_first = nullsFirstFlag;
1293 
1294  /*
1295  * Abbreviation is possible here only for by-reference types. In theory,
1296  * a pass-by-value datatype could have an abbreviated form that is cheaper
1297  * to compare. In a tuple sort, we could support that, because we can
1298  * always extract the original datum from the tuple as needed. Here, we
1299  * can't, because a datum sort only stores a single copy of the datum; the
1300  * "tuple" field of each SortTuple is NULL.
1301  */
1302  state->sortKeys->abbreviate = !typbyval;
1303 
1304  PrepareSortSupportFromOrderingOp(sortOperator, state->sortKeys);
1305 
1306  /*
1307  * The "onlyKey" optimization cannot be used with abbreviated keys, since
1308  * tie-breaker comparisons may be required. Typically, the optimization
1309  * is only of value to pass-by-value types anyway, whereas abbreviated
1310  * keys are typically only of value to pass-by-reference types.
1311  */
1312  if (!state->sortKeys->abbrev_converter)
1313  state->onlyKey = state->sortKeys;
1314 
1315  MemoryContextSwitchTo(oldcontext);
1316 
1317  return state;
1318 }
1319 
1320 /*
1321  * tuplesort_set_bound
1322  *
1323  * Advise tuplesort that at most the first N result tuples are required.
1324  *
1325  * Must be called before inserting any tuples. (Actually, we could allow it
1326  * as long as the sort hasn't spilled to disk, but there seems no need for
1327  * delayed calls at the moment.)
1328  *
1329  * This is a hint only. The tuplesort may still return more tuples than
1330  * requested. Parallel leader tuplesorts will always ignore the hint.
1331  */
1332 void
1333 tuplesort_set_bound(Tuplesortstate *state, int64 bound)
1334 {
1335  /* Assert we're called before loading any tuples */
1336  Assert(state->status == TSS_INITIAL && state->memtupcount == 0);
1337  /* Can't set the bound twice, either */
1338  Assert(!state->bounded);
1339  /* Also, this shouldn't be called in a parallel worker */
1340  Assert(!WORKER(state));
1341 
1342  /* Parallel leader allows but ignores hint */
1343  if (LEADER(state))
1344  return;
1345 
1346 #ifdef DEBUG_BOUNDED_SORT
1347  /* Honor GUC setting that disables the feature (for easy testing) */
1348  if (!optimize_bounded_sort)
1349  return;
1350 #endif
1351 
1352  /* We want to be able to compute bound * 2, so limit the setting */
1353  if (bound > (int64) (INT_MAX / 2))
1354  return;
1355 
1356  state->bounded = true;
1357  state->bound = (int) bound;
1358 
1359  /*
1360  * Bounded sorts are not an effective target for abbreviated key
1361  * optimization. Disable by setting state to be consistent with no
1362  * abbreviation support.
1363  */
1364  state->sortKeys->abbrev_converter = NULL;
1365  if (state->sortKeys->abbrev_full_comparator)
1366  state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
1367 
1368  /* Not strictly necessary, but be tidy */
1369  state->sortKeys->abbrev_abort = NULL;
1370  state->sortKeys->abbrev_full_comparator = NULL;
1371 }
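/*
 * For illustration, a bounded (top-N) caller simply adds one call right after
 * tuplesort_begin_xxx and before inserting any tuples:
 *
 *	tuplesort_set_bound(sortstate, bound);
 *
 * and then inserts and fetches tuples exactly as in the unbounded case,
 * reading back at most the first "bound" tuples of the result.
 */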
1372 
1373 /*
1374  * tuplesort_used_bound
1375  *
1376  * Allow callers to find out if the sort state was able to use a bound.
1377  */
1378 bool
1379 tuplesort_used_bound(Tuplesortstate *state)
1380 {
1381  return state->boundUsed;
1382 }
1383 
1384 /*
1385  * tuplesort_free
1386  *
1387  * Internal routine for freeing resources of tuplesort.
1388  */
1389 static void
1390 tuplesort_free(Tuplesortstate *state)
1391 {
1392  /* context swap probably not needed, but let's be safe */
1393  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1394 
1395 #ifdef TRACE_SORT
1396  long spaceUsed;
1397 
1398  if (state->tapeset)
1399  spaceUsed = LogicalTapeSetBlocks(state->tapeset);
1400  else
1401  spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024;
1402 #endif
1403 
1404  /*
1405  * Delete temporary "tape" files, if any.
1406  *
1407  * Note: want to include this in reported total cost of sort, hence need
1408  * for two #ifdef TRACE_SORT sections.
1409  *
1410  * We don't bother to destroy the individual tapes here. They will go away
1411  * with the sortcontext. (In TSS_FINALMERGE state, we have closed
1412  * finished tapes already.)
1413  */
1414  if (state->tapeset)
1415  LogicalTapeSetClose(state->tapeset);
1416 
1417 #ifdef TRACE_SORT
1418  if (trace_sort)
1419  {
1420  if (state->tapeset)
1421  elog(LOG, "%s of worker %d ended, %ld disk blocks used: %s",
1422  SERIAL(state) ? "external sort" : "parallel external sort",
1423  state->worker, spaceUsed, pg_rusage_show(&state->ru_start));
1424  else
1425  elog(LOG, "%s of worker %d ended, %ld KB used: %s",
1426  SERIAL(state) ? "internal sort" : "unperformed parallel sort",
1427  state->worker, spaceUsed, pg_rusage_show(&state->ru_start));
1428  }
1429 
1430  TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed);
1431 #else
1432 
1433  /*
1434  * If you disabled TRACE_SORT, you can still probe sort__done, but you
1435  * ain't getting space-used stats.
1436  */
1437  TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L);
1438 #endif
1439 
1440  /* Free any execution state created for CLUSTER case */
1441  if (state->estate != NULL)
1442  {
1443  ExprContext *econtext = GetPerTupleExprContext(state->estate);
1444 
1444 
1445  ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple);
1446  FreeExecutorState(state->estate);
1447  }
1448 
1449  MemoryContextSwitchTo(oldcontext);
1450 
1451  /*
1452  * Free the per-sort memory context, thereby releasing all working memory.
1453  */
1454  MemoryContextReset(state->sortcontext);
1455 }
1456 
1457 /*
1458  * tuplesort_end
1459  *
1460  * Release resources and clean up.
1461  *
1462  * NOTE: after calling this, any pointers returned by tuplesort_getXXX are
1463  * pointing to garbage. Be careful not to attempt to use or free such
1464  * pointers afterwards!
1465  */
1466 void
1467 tuplesort_end(Tuplesortstate *state)
1468 {
1469  tuplesort_free(state);
1470 
1471  /*
1472  * Free the main memory context, including the Tuplesortstate struct
1473  * itself.
1474  */
1475  MemoryContextDelete(state->maincontext);
1476 }
1477 
1478 /*
1479  * tuplesort_updatemax
1480  *
1481  * Update maximum resource usage statistics.
1482  */
1483 static void
1484 tuplesort_updatemax(Tuplesortstate *state)
1485 {
1486  int64 spaceUsed;
1487  bool isSpaceDisk;
1488 
1489  /*
1490  * Note: it might seem we should provide both memory and disk usage for a
1491  * disk-based sort. However, the current code doesn't track memory space
1492  * accurately once we have begun to return tuples to the caller (since we
1493  * don't account for pfree's the caller is expected to do), so we cannot
1494  * rely on availMem in a disk sort. This does not seem worth the overhead
1495  * to fix. Is it worth creating an API for the memory context code to
1496  * tell us how much is actually used in sortcontext?
1497  */
1498  if (state->tapeset)
1499  {
1500  isSpaceDisk = true;
1501  spaceUsed = LogicalTapeSetBlocks(state->tapeset) * BLCKSZ;
1502  }
1503  else
1504  {
1505  isSpaceDisk = false;
1506  spaceUsed = state->allowedMem - state->availMem;
1507  }
1508 
1509  /*
1510  * A sort evicts data to disk when it is not able to fit that data into
1511  * main memory. This is why we consider space used on disk to be more
1512  * important for tracking resource usage than space used in memory. Note
1513  * that the amount of space occupied by some tupleset on disk might be
1514  * less than the amount of space occupied by the same tupleset in memory
1515  * due to a more compact representation.
1516  */
1517  if ((isSpaceDisk && !state->isMaxSpaceDisk) ||
1518  (isSpaceDisk == state->isMaxSpaceDisk && spaceUsed > state->maxSpace))
1519  {
1520  state->maxSpace = spaceUsed;
1521  state->isMaxSpaceDisk = isSpaceDisk;
1522  state->maxSpaceStatus = state->status;
1523  }
1524 }
1525 
1526 /*
1527  * tuplesort_reset
1528  *
1529  * Reset the tuplesort. Reset all the data in the tuplesort, but leave the
1530  * meta-information in. After tuplesort_reset, tuplesort is ready to start
1531  * a new sort. This avoids recreating tuple sort states (and saves
1532  * resources) when sorting multiple small batches.
1533  */
1534 void
1535 tuplesort_reset(Tuplesortstate *state)
1536 {
1537  tuplesort_updatemax(state);
1538  tuplesort_free(state);
1539 
1540  /*
1541  * After we've freed up per-batch memory, re-setup all of the state common
1542  * to both the first batch and any subsequent batch.
1543  */
1544  tuplesort_begin_batch(state);
1545 
1546  state->lastReturnedTuple = NULL;
1547  state->slabMemoryBegin = NULL;
1548  state->slabMemoryEnd = NULL;
1549  state->slabFreeHead = NULL;
1550 }
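/*
 * For illustration, a caller sorting many similar small batches can reuse one
 * sort state rather than rebuilding it each time (a sketch):
 *
 *	sortstate = tuplesort_begin_xxx(..., workMem, NULL, false);
 *	for each batch:
 *		tuplesort_putXXX(sortstate, ...);	(repeatedly)
 *		tuplesort_performsort(sortstate);
 *		tuplesort_getXXX(sortstate, ...);	(until it returns false/NULL)
 *		tuplesort_reset(sortstate);
 *	tuplesort_end(sortstate);
 */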
1551 
1552 /*
1553  * Grow the memtuples[] array, if possible within our memory constraint. We
1554  * must not exceed INT_MAX tuples in memory or the caller-provided memory
1555  * limit. Return true if we were able to enlarge the array, false if not.
1556  *
1557  * Normally, at each increment we double the size of the array. When doing
1558  * that would exceed a limit, we attempt one last, smaller increase (and then
1559  * clear the growmemtuples flag so we don't try any more). That allows us to
1560  * use memory as fully as permitted; sticking to the pure doubling rule could
1561  * result in almost half going unused. Because availMem moves around with
1562  * tuple addition/removal, we need some rule to prevent making repeated small
1563  * increases in memtupsize, which would just be useless thrashing. The
1564  * growmemtuples flag accomplishes that and also prevents useless
1565  * recalculations in this function.
1566  */
1567 static bool
1568 grow_memtuples(Tuplesortstate *state)
1569 {
1570  int newmemtupsize;
1571  int memtupsize = state->memtupsize;
1572  int64 memNowUsed = state->allowedMem - state->availMem;
1573 
1574  /* Forget it if we've already maxed out memtuples, per comment above */
1575  if (!state->growmemtuples)
1576  return false;
1577 
1578  /* Select new value of memtupsize */
1579  if (memNowUsed <= state->availMem)
1580  {
1581  /*
1582  * We've used no more than half of allowedMem; double our usage,
1583  * clamping at INT_MAX tuples.
1584  */
1585  if (memtupsize < INT_MAX / 2)
1586  newmemtupsize = memtupsize * 2;
1587  else
1588  {
1589  newmemtupsize = INT_MAX;
1590  state->growmemtuples = false;
1591  }
1592  }
1593  else
1594  {
1595  /*
1596  * This will be the last increment of memtupsize. Abandon doubling
1597  * strategy and instead increase as much as we safely can.
1598  *
1599  * To stay within allowedMem, we can't increase memtupsize by more
1600  * than availMem / sizeof(SortTuple) elements. In practice, we want
1601  * to increase it by considerably less, because we need to leave some
1602  * space for the tuples to which the new array slots will refer. We
1603  * assume the new tuples will be about the same size as the tuples
1604  * we've already seen, and thus we can extrapolate from the space
1605  * consumption so far to estimate an appropriate new size for the
1606  * memtuples array. The optimal value might be higher or lower than
1607  * this estimate, but it's hard to know that in advance. We again
1608  * clamp at INT_MAX tuples.
1609  *
1610  * This calculation is safe against enlarging the array so much that
1611  * LACKMEM becomes true, because the memory currently used includes
1612  * the present array; thus, there would be enough allowedMem for the
1613  * new array elements even if no other memory were currently used.
1614  *
1615  * We do the arithmetic in float8, because otherwise the product of
1616  * memtupsize and allowedMem could overflow. Any inaccuracy in the
1617  * result should be insignificant; but even if we computed a
1618  * completely insane result, the checks below will prevent anything
1619  * really bad from happening.
1620  */
1621  double grow_ratio;
1622 
1623  grow_ratio = (double) state->allowedMem / (double) memNowUsed;
1624  if (memtupsize * grow_ratio < INT_MAX)
1625  newmemtupsize = (int) (memtupsize * grow_ratio);
1626  else
1627  newmemtupsize = INT_MAX;
1628 
1629  /* We won't make any further enlargement attempts */
1630  state->growmemtuples = false;
1631  }
1632 
1633  /* Must enlarge array by at least one element, else report failure */
1634  if (newmemtupsize <= memtupsize)
1635  goto noalloc;
1636 
1637  /*
1638  * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize. Clamp
1639  * to ensure our request won't be rejected. Note that we can easily
1640  * exhaust address space before facing this outcome. (This is presently
1641  * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but
1642  * don't rely on that at this distance.)
1643  */
1644  if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple))
1645  {
1646  newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple));
1647  state->growmemtuples = false; /* can't grow any more */
1648  }
1649 
1650  /*
1651  * We need to be sure that we do not cause LACKMEM to become true, else
1652  * the space management algorithm will go nuts. The code above should
1653  * never generate a dangerous request, but to be safe, check explicitly
1654  * that the array growth fits within availMem. (We could still cause
1655  * LACKMEM if the memory chunk overhead associated with the memtuples
1656  * array were to increase. That shouldn't happen because we chose the
1657  * initial array size large enough to ensure that palloc will be treating
1658  * both old and new arrays as separate chunks. But we'll check LACKMEM
1659  * explicitly below just in case.)
1660  */
1661  if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple)))
1662  goto noalloc;
1663 
1664  /* OK, do it */
1665  FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
1666  state->memtupsize = newmemtupsize;
1667  state->memtuples = (SortTuple *)
1668  repalloc_huge(state->memtuples,
1669  state->memtupsize * sizeof(SortTuple));
1670  USEMEM(state, GetMemoryChunkSpace(state->memtuples));
1671  if (LACKMEM(state))
1672  elog(ERROR, "unexpected out-of-memory situation in tuplesort");
1673  return true;
1674 
1675 noalloc:
1676  /* If for any reason we didn't realloc, shut off future attempts */
1677  state->growmemtuples = false;
1678  return false;
1679 }
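
The growth policy above can be restated as a small pure function. The following sketch is illustrative only; it is not part of tuplesort.c, and the names sketch_next_capacity, capacity, mem_used and keep_growing are invented stand-ins for memtupsize, (allowedMem - availMem) and growmemtuples.

#include <limits.h>

/* Illustrative sketch of the growth rule above; not part of tuplesort.c. */
static int
sketch_next_capacity(int capacity, long long allowed_mem, long long mem_used,
                     int *keep_growing)
{
    if (mem_used * 2 <= allowed_mem)
    {
        /* No more than half of allowedMem used: double, clamping at INT_MAX. */
        if (capacity < INT_MAX / 2)
            return capacity * 2;
        *keep_growing = 0;
        return INT_MAX;
    }

    /* Final increase: scale by allowedMem / memNowUsed, then stop growing. */
    *keep_growing = 0;
    {
        double  grow_ratio = (double) allowed_mem / (double) mem_used;

        if (capacity * grow_ratio < INT_MAX)
            return (int) (capacity * grow_ratio);
        return INT_MAX;
    }
}

For example, with allowed_mem of 64 MB and 40 MB already consumed, grow_ratio is 1.6, so a 1,000,000-slot array would be enlarged once more to 1,600,000 slots and further growth attempts would then be shut off.
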
1680 
1681 /*
1682  * Accept one tuple while collecting input data for sort.
1683  *
1684  * Note that the input data is always copied; the caller need not save it.
1685  */
1686 void
1687 tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot)
1688 {
1689  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1690  SortTuple stup;
1691 
1692  /*
1693  * Copy the given tuple into memory we control, and decrease availMem.
1694  * Then call the common code.
1695  */
1696  COPYTUP(state, &stup, (void *) slot);
1697 
1698  puttuple_common(state, &stup);
1699 
1700  MemoryContextSwitchTo(oldcontext);
1701 }
1702 
1703 /*
1704  * Accept one tuple while collecting input data for sort.
1705  *
1706  * Note that the input data is always copied; the caller need not save it.
1707  */
1708 void
1709 tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
1710 {
1711  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1712  SortTuple stup;
1713 
1714  /*
1715  * Copy the given tuple into memory we control, and decrease availMem.
1716  * Then call the common code.
1717  */
1718  COPYTUP(state, &stup, (void *) tup);
1719 
1720  puttuple_common(state, &stup);
1721 
1722  MemoryContextSwitchTo(oldcontext);
1723 }
1724 
1725 /*
1726  * Collect one index tuple while collecting input data for sort, building
1727  * it from caller-supplied values.
1728  */
1729 void
1730 tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel,
1731  ItemPointer self, Datum *values,
1732  bool *isnull)
1733 {
1734  MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
1735  SortTuple stup;
1736  Datum original;
1737  IndexTuple tuple;
1738 
1739  stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull);
1740  tuple = ((IndexTuple) stup.tuple);
1741  tuple->t_tid = *self;
1742  USEMEM(state, GetMemoryChunkSpace(stup.tuple));
1743  /* set up first-column key value */
1744  original = index_getattr(tuple,
1745  1,
1746  RelationGetDescr(state->indexRel),
1747  &stup.isnull1);
1748 
1748 
1749  MemoryContextSwitchTo(state->sortcontext);
1750 
1751  if (!state->sortKeys || !state->sortKeys->abbrev_converter || stup.isnull1)
1752  {
1753  /*
1754  * Store ordinary Datum representation, or NULL value. If there is a
1755  * converter it won't expect NULL values, and the cost model is not
1756  * required to account for NULLs, so in that case we avoid calling the
1757  * converter and just set datum1 to a zeroed representation (to be
1758  * consistent, and to support cheap inequality tests for NULL
1759  * abbreviated keys).
1760  */
1761  stup.datum1 = original;
1762  }
1763  else if (!consider_abort_common(state))
1764  {
1765  /* Store abbreviated key representation */
1766  stup.datum1 = state->sortKeys->abbrev_converter(original,
1767  state->sortKeys);
1768  }
1769  else
1770  {
1771  /* Abort abbreviation */
1772  int i;
1773 
1774  stup.datum1 = original;
1775 
1776  /*
1777  * Set state to be consistent with never trying abbreviation.
1778  *
1779  * Alter datum1 representation in already-copied tuples, so as to
1780  * ensure a consistent representation (current tuple was just
1781  * handled). It does not matter if some dumped tuples are already
1782  * sorted on tape, since serialized tuples lack abbreviated keys
1783  * (TSS_BUILDRUNS state prevents control reaching here in any case).
1784  */
1785  for (i = 0; i < state->memtupcount; i++)
1786  {
1787  SortTuple *mtup = &state->memtuples[i];
1788 
1789  tuple = mtup->tuple;
1790  mtup->datum1 = index_getattr(tuple,
1791  1,
1792  RelationGetDescr(state->indexRel),
1793  &mtup->isnull1);
1794  }
1795  }
1796 
1797  puttuple_common(state, &stup);
1798 
1799  MemoryContextSwitchTo(oldcontext);
1800 }
1801 
1802 /*
1803  * Accept one Datum while collecting input data for sort.
1804  *
1805  * If the Datum is pass-by-ref type, the value will be copied.
1806  */
1807 void
1809 {
1810  MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
1811  SortTuple stup;
1812 
1813  /*
1814  * Pass-by-value types or null values are just stored directly in
1815  * stup.datum1 (and stup.tuple is not used and set to NULL).
1816  *
1817  * Non-null pass-by-reference values need to be copied into memory we
1818  * control, and possibly abbreviated. The copied value is pointed to by
1819  * stup.tuple and is treated as the canonical copy (e.g. to return via
1820  * tuplesort_getdatum or when writing to tape); stup.datum1 gets the
1821  * abbreviated value if abbreviation is happening, otherwise it's
1822  * identical to stup.tuple.
1823  */
1824 
1825  if (isNull || !state->tuples)
1826  {
1827  /*
1828  * Set datum1 to zeroed representation for NULLs (to be consistent,
1829  * and to support cheap inequality tests for NULL abbreviated keys).
1830  */
1831  stup.datum1 = !isNull ? val : (Datum) 0;
1832  stup.isnull1 = isNull;
1833  stup.tuple = NULL; /* no separate storage */
1834  MemoryContextSwitchTo(state->sortcontext);
1835  }
1836  else
1837  {
1838  Datum original = datumCopy(val, false, state->datumTypeLen);
1839 
1840  stup.isnull1 = false;
1841  stup.tuple = DatumGetPointer(original);
1842  USEMEM(state, GetMemoryChunkSpace(stup.tuple));
1843  MemoryContextSwitchTo(state->sortcontext);
1844 
1845  if (!state->sortKeys->abbrev_converter)
1846  {
1847  stup.datum1 = original;
1848  }
1849  else if (!consider_abort_common(state))
1850  {
1851  /* Store abbreviated key representation */
1852  stup.datum1 = state->sortKeys->abbrev_converter(original,
1853  state->sortKeys);
1854  }
1855  else
1856  {
1857  /* Abort abbreviation */
1858  int i;
1859 
1860  stup.datum1 = original;
1861 
1862  /*
1863  * Set state to be consistent with never trying abbreviation.
1864  *
1865  * Alter datum1 representation in already-copied tuples, so as to
1866  * ensure a consistent representation (current tuple was just
1867  * handled). It does not matter if some dumped tuples are already
1868  * sorted on tape, since serialized tuples lack abbreviated keys
1869  * (TSS_BUILDRUNS state prevents control reaching here in any
1870  * case).
1871  */
1872  for (i = 0; i < state->memtupcount; i++)
1873  {
1874  SortTuple *mtup = &state->memtuples[i];
1875 
1876  mtup->datum1 = PointerGetDatum(mtup->tuple);
1877  }
1878  }
1879  }
1880 
1881  puttuple_common(state, &stup);
1882 
1883  MemoryContextSwitchTo(oldcontext);
1884 }
1885 
1886 /*
1887  * Shared code for tuple and datum cases.
1888  */
1889 static void
1890 puttuple_common(Tuplesortstate *state, SortTuple *tuple)
1891 {
1892  Assert(!LEADER(state));
1893 
1894  switch (state->status)
1895  {
1896  case TSS_INITIAL:
1897 
1898  /*
1899  * Save the tuple into the unsorted array. First, grow the array
1900  * as needed. Note that we try to grow the array when there is
1901  * still one free slot remaining --- if we fail, there'll still be
1902  * room to store the incoming tuple, and then we'll switch to
1903  * tape-based operation.
1904  */
1905  if (state->memtupcount >= state->memtupsize - 1)
1906  {
1907  (void) grow_memtuples(state);
1908  Assert(state->memtupcount < state->memtupsize);
1909  }
1910  state->memtuples[state->memtupcount++] = *tuple;
1911 
1912  /*
1913  * Check if it's time to switch over to a bounded heapsort. We do
1914  * so if the input tuple count exceeds twice the desired tuple
1915  * count (this is a heuristic for where heapsort becomes cheaper
1916  * than a quicksort), or if we've just filled workMem and have
1917  * enough tuples to meet the bound.
1918  *
1919  * Note that once we enter TSS_BOUNDED state we will always try to
1920  * complete the sort that way. In the worst case, if later input
1921  * tuples are larger than earlier ones, this might cause us to
1922  * exceed workMem significantly.
1923  */
1924  if (state->bounded &&
1925  (state->memtupcount > state->bound * 2 ||
1926  (state->memtupcount > state->bound && LACKMEM(state))))
1927  {
1928 #ifdef TRACE_SORT
1929  if (trace_sort)
1930  elog(LOG, "switching to bounded heapsort at %d tuples: %s",
1931  state->memtupcount,
1932  pg_rusage_show(&state->ru_start));
1933 #endif
1934  make_bounded_heap(state);
1935  return;
1936  }
1937 
1938  /*
1939  * Done if we still fit in available memory and have array slots.
1940  */
1941  if (state->memtupcount < state->memtupsize && !LACKMEM(state))
1942  return;
1943 
1944  /*
1945  * Nope; time to switch to tape-based operation.
1946  */
1947  inittapes(state, true);
1948 
1949  /*
1950  * Dump all tuples.
1951  */
1952  dumptuples(state, false);
1953  break;
1954 
1955  case TSS_BOUNDED:
1956 
1957  /*
1958  * We don't want to grow the array here, so check whether the new
1959  * tuple can be discarded before putting it in. This should be a
1960  * good speed optimization, too, since when there are many more
1961  * input tuples than the bound, most input tuples can be discarded
1962  * with just this one comparison. Note that because we currently
1963  * have the sort direction reversed, we must check for <= not >=.
1964  */
1965  if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0)
1966  {
1967  /* new tuple <= top of the heap, so we can discard it */
1968  free_sort_tuple(state, tuple);
1969  CHECK_FOR_INTERRUPTS();
1970  }
1971  else
1972  {
1973  /* discard top of heap, replacing it with the new tuple */
1974  free_sort_tuple(state, &state->memtuples[0]);
1975  tuplesort_heap_replace_top(state, tuple);
1976  }
1977  break;
1978 
1979  case TSS_BUILDRUNS:
1980 
1981  /*
1982  * Save the tuple into the unsorted array (there must be space)
1983  */
1984  state->memtuples[state->memtupcount++] = *tuple;
1985 
1986  /*
1987  * If we are over the memory limit, dump all tuples.
1988  */
1989  dumptuples(state, false);
1990  break;
1991 
1992  default:
1993  elog(ERROR, "invalid tuplesort state");
1994  break;
1995  }
1996 }
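
The TSS_INITIAL-to-TSS_BOUNDED transition above boils down to a single predicate. A minimal sketch follows; it is not part of tuplesort.c, and the function name and the lack_mem flag (standing in for the LACKMEM() macro) are invented.

/* Illustrative sketch of the bounded-heapsort switch test; not part of tuplesort.c. */
static int
sketch_switch_to_bounded(int memtupcount, int bound, int bounded, int lack_mem)
{
    if (!bounded)
        return 0;               /* caller never set a bound; keep quicksorting */

    /*
     * Heapsort is assumed to win once more than twice the requested number
     * of tuples has been seen, or as soon as memory runs short while we
     * already hold enough tuples to satisfy the bound.
     */
    return memtupcount > bound * 2 ||
        (memtupcount > bound && lack_mem);
}
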
1997 
1998 static bool
1999 consider_abort_common(Tuplesortstate *state)
2000 {
2001  Assert(state->sortKeys[0].abbrev_converter != NULL);
2002  Assert(state->sortKeys[0].abbrev_abort != NULL);
2003  Assert(state->sortKeys[0].abbrev_full_comparator != NULL);
2004 
2005  /*
2006  * Check effectiveness of abbreviation optimization. Consider aborting
2007  * when still within memory limit.
2008  */
2009  if (state->status == TSS_INITIAL &&
2010  state->memtupcount >= state->abbrevNext)
2011  {
2012  state->abbrevNext *= 2;
2013 
2014  /*
2015  * Check opclass-supplied abbreviation abort routine. It may indicate
2016  * that abbreviation should not proceed.
2017  */
2018  if (!state->sortKeys->abbrev_abort(state->memtupcount,
2019  state->sortKeys))
2020  return false;
2021 
2022  /*
2023  * Finally, restore authoritative comparator, and indicate that
2024  * abbreviation is not in play by setting abbrev_converter to NULL
2025  */
2026  state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator;
2027  state->sortKeys[0].abbrev_converter = NULL;
2028  /* Not strictly necessary, but be tidy */
2029  state->sortKeys[0].abbrev_abort = NULL;
2030  state->sortKeys[0].abbrev_full_comparator = NULL;
2031 
2032  /* Give up - expect original pass-by-value representation */
2033  return true;
2034  }
2035 
2036  return false;
2037 }
2038 
2039 /*
2040  * All tuples have been provided; finish the sort.
2041  */
2042 void
2043 tuplesort_performsort(Tuplesortstate *state)
2044 {
2045  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2046 
2047 #ifdef TRACE_SORT
2048  if (trace_sort)
2049  elog(LOG, "performsort of worker %d starting: %s",
2050  state->worker, pg_rusage_show(&state->ru_start));
2051 #endif
2052 
2053  switch (state->status)
2054  {
2055  case TSS_INITIAL:
2056 
2057  /*
2058  * We were able to accumulate all the tuples within the allowed
2059  * amount of memory, or we are the leader, about to take over the worker tapes
2060  */
2061  if (SERIAL(state))
2062  {
2063  /* Just qsort 'em and we're done */
2064  tuplesort_sort_memtuples(state);
2065  state->status = TSS_SORTEDINMEM;
2066  }
2067  else if (WORKER(state))
2068  {
2069  /*
2070  * Parallel workers must still dump out tuples to tape. No
2071  * merge is required to produce single output run, though.
2072  */
2073  inittapes(state, false);
2074  dumptuples(state, true);
2075  worker_nomergeruns(state);
2076  state->status = TSS_SORTEDONTAPE;
2077  }
2078  else
2079  {
2080  /*
2081  * Leader will take over worker tapes and merge worker runs.
2082  * Note that mergeruns sets the correct state->status.
2083  */
2084  leader_takeover_tapes(state);
2085  mergeruns(state);
2086  }
2087  state->current = 0;
2088  state->eof_reached = false;
2089  state->markpos_block = 0L;
2090  state->markpos_offset = 0;
2091  state->markpos_eof = false;
2092  break;
2093 
2094  case TSS_BOUNDED:
2095 
2096  /*
2097  * We were able to accumulate all the tuples required for output
2098  * in memory, using a heap to eliminate excess tuples. Now we
2099  * have to transform the heap to a properly-sorted array.
2100  */
2101  sort_bounded_heap(state);
2102  state->current = 0;
2103  state->eof_reached = false;
2104  state->markpos_offset = 0;
2105  state->markpos_eof = false;
2106  state->status = TSS_SORTEDINMEM;
2107  break;
2108 
2109  case TSS_BUILDRUNS:
2110 
2111  /*
2112  * Finish tape-based sort. First, flush all tuples remaining in
2113  * memory out to tape; then merge until we have a single remaining
2114  * run (or, if !randomAccess and !WORKER(), one run per tape).
2115  * Note that mergeruns sets the correct state->status.
2116  */
2117  dumptuples(state, true);
2118  mergeruns(state);
2119  state->eof_reached = false;
2120  state->markpos_block = 0L;
2121  state->markpos_offset = 0;
2122  state->markpos_eof = false;
2123  break;
2124 
2125  default:
2126  elog(ERROR, "invalid tuplesort state");
2127  break;
2128  }
2129 
2130 #ifdef TRACE_SORT
2131  if (trace_sort)
2132  {
2133  if (state->status == TSS_FINALMERGE)
2134  elog(LOG, "performsort of worker %d done (except %d-way final merge): %s",
2135  state->worker, state->nInputTapes,
2136  pg_rusage_show(&state->ru_start));
2137  else
2138  elog(LOG, "performsort of worker %d done: %s",
2139  state->worker, pg_rusage_show(&state->ru_start));
2140  }
2141 #endif
2142 
2143  MemoryContextSwitchTo(oldcontext);
2144 }
2145 
2146 /*
2147  * Internal routine to fetch the next tuple in either forward or back
2148  * direction into *stup. Returns false if no more tuples.
2149  * Returned tuple belongs to tuplesort memory context, and must not be freed
2150  * by caller. Note that fetched tuple is stored in memory that may be
2151  * recycled by any future fetch.
2152  */
2153 static bool
2154 tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
2155  SortTuple *stup)
2156 {
2157  unsigned int tuplen;
2158  size_t nmoved;
2159 
2160  Assert(!WORKER(state));
2161 
2162  switch (state->status)
2163  {
2164  case TSS_SORTEDINMEM:
2165  Assert(forward || state->randomAccess);
2166  Assert(!state->slabAllocatorUsed);
2167  if (forward)
2168  {
2169  if (state->current < state->memtupcount)
2170  {
2171  *stup = state->memtuples[state->current++];
2172  return true;
2173  }
2174  state->eof_reached = true;
2175 
2176  /*
2177  * Complain if caller tries to retrieve more tuples than
2178  * originally asked for in a bounded sort. This is because
2179  * returning EOF here might be the wrong thing.
2180  */
2181  if (state->bounded && state->current >= state->bound)
2182  elog(ERROR, "retrieved too many tuples in a bounded sort");
2183 
2184  return false;
2185  }
2186  else
2187  {
2188  if (state->current <= 0)
2189  return false;
2190 
2191  /*
2192  * If all tuples have already been fetched, we return the last
2193  * tuple; otherwise, the tuple before the last one returned.
2194  */
2195  if (state->eof_reached)
2196  state->eof_reached = false;
2197  else
2198  {
2199  state->current--; /* last returned tuple */
2200  if (state->current <= 0)
2201  return false;
2202  }
2203  *stup = state->memtuples[state->current - 1];
2204  return true;
2205  }
2206  break;
2207 
2208  case TSS_SORTEDONTAPE:
2209  Assert(forward || state->randomAccess);
2210  Assert(state->slabAllocatorUsed);
2211 
2212  /*
2213  * The slot that held the tuple that we returned in previous
2214  * gettuple call can now be reused.
2215  */
2216  if (state->lastReturnedTuple)
2217  {
2218  RELEASE_SLAB_SLOT(state, state->lastReturnedTuple);
2219  state->lastReturnedTuple = NULL;
2220  }
2221 
2222  if (forward)
2223  {
2224  if (state->eof_reached)
2225  return false;
2226 
2227  if ((tuplen = getlen(state->result_tape, true)) != 0)
2228  {
2229  READTUP(state, stup, state->result_tape, tuplen);
2230 
2231  /*
2232  * Remember the tuple we return, so that we can recycle
2233  * its memory on next call. (This can be NULL, in the
2234  * !state->tuples case).
2235  */
2236  state->lastReturnedTuple = stup->tuple;
2237 
2238  return true;
2239  }
2240  else
2241  {
2242  state->eof_reached = true;
2243  return false;
2244  }
2245  }
2246 
2247  /*
2248  * Backward.
2249  *
2250  * If all tuples have already been fetched, we return the last tuple;
2251  * otherwise, the tuple before the last one returned.
2252  */
2253  if (state->eof_reached)
2254  {
2255  /*
2256  * Seek position is pointing just past the zero tuplen at the
2257  * end of file; back up to fetch last tuple's ending length
2258  * word. If seek fails we must have a completely empty file.
2259  */
2260  nmoved = LogicalTapeBackspace(state->result_tape,
2261  2 * sizeof(unsigned int));
2262  if (nmoved == 0)
2263  return false;
2264  else if (nmoved != 2 * sizeof(unsigned int))
2265  elog(ERROR, "unexpected tape position");
2266  state->eof_reached = false;
2267  }
2268  else
2269  {
2270  /*
2271  * Back up and fetch previously-returned tuple's ending length
2272  * word. If seek fails, assume we are at start of file.
2273  */
2274  nmoved = LogicalTapeBackspace(state->result_tape,
2275  sizeof(unsigned int));
2276  if (nmoved == 0)
2277  return false;
2278  else if (nmoved != sizeof(unsigned int))
2279  elog(ERROR, "unexpected tape position");
2280  tuplen = getlen(state->result_tape, false);
2281 
2282  /*
2283  * Back up to get ending length word of tuple before it.
2284  */
2285  nmoved = LogicalTapeBackspace(state->result_tape,
2286  tuplen + 2 * sizeof(unsigned int));
2287  if (nmoved == tuplen + sizeof(unsigned int))
2288  {
2289  /*
2290  * We backed up over the previous tuple, but there was no
2291  * ending length word before it. That means that the prev
2292  * tuple is the first tuple in the file. It is now the
2293  * next to read in forward direction (not obviously right,
2294  * but that is what in-memory case does).
2295  * but that is what the in-memory case does).
2296  return false;
2297  }
2298  else if (nmoved != tuplen + 2 * sizeof(unsigned int))
2299  elog(ERROR, "bogus tuple length in backward scan");
2300  }
2301 
2302  tuplen = getlen(state->result_tape, false);
2303 
2304  /*
2305  * Now we have the length of the prior tuple, back up and read it.
2306  * Note: READTUP expects we are positioned after the initial
2307  * length word of the tuple, so back up to that point.
2308  */
2309  nmoved = LogicalTapeBackspace(state->result_tape,
2310  tuplen);
2311  if (nmoved != tuplen)
2312  elog(ERROR, "bogus tuple length in backward scan");
2313  READTUP(state, stup, state->result_tape, tuplen);
2314 
2315  /*
2316  * Remember the tuple we return, so that we can recycle its memory
2317  * on next call. (This can be NULL, in the Datum case).
2318  */
2319  state->lastReturnedTuple = stup->tuple;
2320 
2321  return true;
2322 
2323  case TSS_FINALMERGE:
2324  Assert(forward);
2325  /* We are managing memory ourselves, with the slab allocator. */
2326  Assert(state->slabAllocatorUsed);
2327 
2328  /*
2329  * The slab slot holding the tuple that we returned in previous
2330  * gettuple call can now be reused.
2331  */
2332  if (state->lastReturnedTuple)
2333  {
2334  RELEASE_SLAB_SLOT(state, state->lastReturnedTuple);
2335  state->lastReturnedTuple = NULL;
2336  }
2337 
2338  /*
2339  * This code should match the inner loop of mergeonerun().
2340  */
2341  if (state->memtupcount > 0)
2342  {
2343  int srcTapeIndex = state->memtuples[0].srctape;
2344  LogicalTape *srcTape = state->inputTapes[srcTapeIndex];
2345  SortTuple newtup;
2346 
2347  *stup = state->memtuples[0];
2348 
2349  /*
2350  * Remember the tuple we return, so that we can recycle its
2351  * memory on next call. (This can be NULL, in the Datum case).
2352  */
2353  state->lastReturnedTuple = stup->tuple;
2354 
2355  /*
2356  * Pull next tuple from tape, and replace the returned tuple
2357  * at top of the heap with it.
2358  */
2359  if (!mergereadnext(state, srcTape, &newtup))
2360  {
2361  /*
2362  * If no more data, we've reached end of run on this tape.
2363  * Remove the top node from the heap.
2364  */
2365  tuplesort_heap_delete_top(state);
2366  state->nInputRuns--;
2367 
2368  /*
2369  * Close the tape. It'd go away at the end of the sort
2370  * anyway, but better to release the memory early.
2371  */
2372  LogicalTapeClose(srcTape);
2373  return true;
2374  }
2375  newtup.srctape = srcTapeIndex;
2376  tuplesort_heap_replace_top(state, &newtup);
2377  return true;
2378  }
2379  return false;
2380 
2381  default:
2382  elog(ERROR, "invalid tuplesort state");
2383  return false; /* keep compiler quiet */
2384  }
2385 }
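
The backward-scan arithmetic above depends on how tuples are laid out on a randomAccess tape: each tuple is written as a leading length word, the tuple body, and a trailing length word, and a run ends with a zero length word. The sketch below is illustrative only (not part of tuplesort.c; sketch_emit_record and the in-memory buffer are invented); it reproduces that layout so the 2 * sizeof(unsigned int) and tuplen + 2 * sizeof(unsigned int) backspace distances can be read off it.

#include <string.h>

/*
 * Append one record in the assumed randomAccess layout:
 *   [length word][tuplen bytes of body][length word]
 * Returns the new write position.  Illustrative only.
 */
static size_t
sketch_emit_record(unsigned char *tape, size_t pos, unsigned int tuplen)
{
    memcpy(tape + pos, &tuplen, sizeof(unsigned int));  /* leading length word */
    pos += sizeof(unsigned int);
    memset(tape + pos, 0, tuplen);                      /* tuple body (dummy) */
    pos += tuplen;
    memcpy(tape + pos, &tuplen, sizeof(unsigned int));  /* trailing length word */
    return pos + sizeof(unsigned int);
}

/*
 * With this layout, a reader positioned just past the zero end-of-run word
 * reaches the last tuple's trailing length word by backing up
 * 2 * sizeof(unsigned int), and reaches the previous tuple's trailing length
 * word by backing up a further tuplen + 2 * sizeof(unsigned int), unless it
 * runs into the start of the tape first.
 */
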
2386 
2387 /*
2388  * Fetch the next tuple in either forward or back direction.
2389  * If successful, put tuple in slot and return true; else, clear the slot
2390  * and return false.
2391  *
2392  * Caller may optionally be passed back abbreviated value (on true return
2393  * value) when abbreviation was used, which can be used to cheaply avoid
2394  * equality checks that might otherwise be required. Caller can safely make a
2395  * determination of "non-equal tuple" based on simple binary inequality. A
2396  * NULL value in leading attribute will set abbreviated value to zeroed
2397  * representation, which caller may rely on in abbreviated inequality check.
2398  *
2399  * If copy is true, the slot receives a tuple that's been copied into the
2400  * caller's memory context, so that it will stay valid regardless of future
2401  * manipulations of the tuplesort's state (up to and including deleting the
2402  * tuplesort). If copy is false, the slot will just receive a pointer to a
2403  * tuple held within the tuplesort, which is more efficient, but only safe for
2404  * callers that are prepared to have any subsequent manipulation of the
2405  * tuplesort's state invalidate slot contents.
2406  */
2407 bool
2408 tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy,
2409  TupleTableSlot *slot, Datum *abbrev)
2410 {
2411  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2412  SortTuple stup;
2413 
2414  if (!tuplesort_gettuple_common(state, forward, &stup))
2415  stup.tuple = NULL;
2416 
2417  MemoryContextSwitchTo(oldcontext);
2418 
2419  if (stup.tuple)
2420  {
2421  /* Record abbreviated key for caller */
2422  if (state->sortKeys->abbrev_converter && abbrev)
2423  *abbrev = stup.datum1;
2424 
2425  if (copy)
2426  stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple);
2427 
2428  ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, copy);
2429  return true;
2430  }
2431  else
2432  {
2433  ExecClearTuple(slot);
2434  return false;
2435  }
2436 }
2437 
2438 /*
2439  * Fetch the next tuple in either forward or back direction.
2440  * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory
2441  * context, and must not be freed by caller. Caller may not rely on tuple
2442  * remaining valid after any further manipulation of tuplesort.
2443  */
2444 HeapTuple
2445 tuplesort_getheaptuple(Tuplesortstate *state, bool forward)
2446 {
2447  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2448  SortTuple stup;
2449 
2450  if (!tuplesort_gettuple_common(state, forward, &stup))
2451  stup.tuple = NULL;
2452 
2453  MemoryContextSwitchTo(oldcontext);
2454 
2455  return stup.tuple;
2456 }
2457 
2458 /*
2459  * Fetch the next index tuple in either forward or back direction.
2460  * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory
2461  * context, and must not be freed by caller. Caller may not rely on tuple
2462  * remaining valid after any further manipulation of tuplesort.
2463  */
2464 IndexTuple
2465 tuplesort_getindextuple(Tuplesortstate *state, bool forward)
2466 {
2467  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2468  SortTuple stup;
2469 
2470  if (!tuplesort_gettuple_common(state, forward, &stup))
2471  stup.tuple = NULL;
2472 
2473  MemoryContextSwitchTo(oldcontext);
2474 
2475  return (IndexTuple) stup.tuple;
2476 }
2477 
2478 /*
2479  * Fetch the next Datum in either forward or back direction.
2480  * Returns false if no more datums.
2481  *
2482  * If the Datum is pass-by-ref type, the returned value is freshly palloc'd
2483  * in caller's context, and is now owned by the caller (this differs from
2484  * similar routines for other types of tuplesorts).
2485  *
2486  * Caller may optionally be passed back abbreviated value (on true return
2487  * value) when abbreviation was used, which can be used to cheaply avoid
2488  * equality checks that might otherwise be required. Caller can safely make a
2489  * determination of "non-equal tuple" based on simple binary inequality. A
2490  * NULL value will have a zeroed abbreviated value representation, which caller
2491  * may rely on in abbreviated inequality check.
2492  */
2493 bool
2494 tuplesort_getdatum(Tuplesortstate *state, bool forward,
2495  Datum *val, bool *isNull, Datum *abbrev)
2496 {
2497  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2498  SortTuple stup;
2499 
2500  if (!tuplesort_gettuple_common(state, forward, &stup))
2501  {
2502  MemoryContextSwitchTo(oldcontext);
2503  return false;
2504  }
2505 
2506  /* Ensure we copy into caller's memory context */
2507  MemoryContextSwitchTo(oldcontext);
2508 
2509  /* Record abbreviated key for caller */
2510  if (state->sortKeys->abbrev_converter && abbrev)
2511  *abbrev = stup.datum1;
2512 
2513  if (stup.isnull1 || !state->tuples)
2514  {
2515  *val = stup.datum1;
2516  *isNull = stup.isnull1;
2517  }
2518  else
2519  {
2520  /* use stup.tuple because stup.datum1 may be an abbreviation */
2521  *val = datumCopy(PointerGetDatum(stup.tuple), false, state->datumTypeLen);
2522  *isNull = false;
2523  }
2524 
2525  return true;
2526 }
2527 
2528 /*
2529  * Advance over N tuples in either forward or back direction,
2530  * without returning any data. N==0 is a no-op.
2531  * Returns true if successful, false if ran out of tuples.
2532  */
2533 bool
2534 tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward)
2535 {
2536  MemoryContext oldcontext;
2537 
2538  /*
2539  * We don't actually support backwards skip yet, because no callers need
2540  * it. The API is designed to allow for that later, though.
2541  */
2542  Assert(forward);
2543  Assert(ntuples >= 0);
2544  Assert(!WORKER(state));
2545 
2546  switch (state->status)
2547  {
2548  case TSS_SORTEDINMEM:
2549  if (state->memtupcount - state->current >= ntuples)
2550  {
2551  state->current += ntuples;
2552  return true;
2553  }
2554  state->current = state->memtupcount;
2555  state->eof_reached = true;
2556 
2557  /*
2558  * Complain if caller tries to retrieve more tuples than
2559  * originally asked for in a bounded sort. This is because
2560  * returning EOF here might be the wrong thing.
2561  */
2562  if (state->bounded && state->current >= state->bound)
2563  elog(ERROR, "retrieved too many tuples in a bounded sort");
2564 
2565  return false;
2566 
2567  case TSS_SORTEDONTAPE:
2568  case TSS_FINALMERGE:
2569 
2570  /*
2571  * We could probably optimize these cases better, but for now it's
2572  * not worth the trouble.
2573  */
2574  oldcontext = MemoryContextSwitchTo(state->sortcontext);
2575  while (ntuples-- > 0)
2576  {
2577  SortTuple stup;
2578 
2579  if (!tuplesort_gettuple_common(state, forward, &stup))
2580  {
2581  MemoryContextSwitchTo(oldcontext);
2582  return false;
2583  }
2584  CHECK_FOR_INTERRUPTS();
2585  }
2586  MemoryContextSwitchTo(oldcontext);
2587  return true;
2588 
2589  default:
2590  elog(ERROR, "invalid tuplesort state");
2591  return false; /* keep compiler quiet */
2592  }
2593 }
2594 
2595 /*
2596  * tuplesort_merge_order - report merge order we'll use for given memory
2597  * (note: "merge order" just means the number of input tapes in the merge).
2598  *
2599  * This is exported for use by the planner. allowedMem is in bytes.
2600  */
2601 int
2602 tuplesort_merge_order(int64 allowedMem)
2603 {
2604  int mOrder;
2605 
2606  /*----------
2607  * In the merge phase, we need buffer space for each input and output tape.
2608  * Each pass in the balanced merge algorithm reads from M input tapes, and
2609  * writes to N output tapes. Each tape consumes TAPE_BUFFER_OVERHEAD bytes
2610  * of memory. In addition to that, we want MERGE_BUFFER_SIZE workspace per
2611  * input tape.
2612  *
2613  * totalMem = M * (TAPE_BUFFER_OVERHEAD + MERGE_BUFFER_SIZE) +
2614  * N * TAPE_BUFFER_OVERHEAD
2615  *
2616  * Except for the last and next-to-last merge passes, where there can be
2617  * fewer tapes left to process, M = N. We choose M so that we have the
2618  * desired amount of memory available for the input buffers
2619  * (TAPE_BUFFER_OVERHEAD + MERGE_BUFFER_SIZE), given the total memory
2620  * available for the tape buffers (allowedMem).
2621  *
2622  * Note: you might be thinking we need to account for the memtuples[]
2623  * array in this calculation, but we effectively treat that as part of the
2624  * MERGE_BUFFER_SIZE workspace.
2625  *----------
2626  */
2627  mOrder = allowedMem /
2628  (TAPE_BUFFER_OVERHEAD + MERGE_BUFFER_SIZE);
2629 
2630  /*
2631  * Even in minimum memory, use at least a MINORDER merge. On the other
2632  * hand, even when we have lots of memory, do not use more than a MAXORDER
2633  * merge. Tapes are pretty cheap, but they're not entirely free. Each
2634  * additional tape reduces the amount of memory available to build runs,
2635  * which in turn can cause the same sort to need more runs, which makes
2636  * merging slower even if it can still be done in a single pass. Also,
2637  * high order merges are quite slow due to CPU cache effects; it can be
2638  * faster to pay the I/O cost of a multi-pass merge than to perform a
2639  * single merge pass across many hundreds of tapes.
2640  */
2641  mOrder = Max(mOrder, MINORDER);
2642  mOrder = Min(mOrder, MAXORDER);
2643 
2644  return mOrder;
2645 }
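
To make the computation above concrete, here is an illustrative sketch (not part of tuplesort.c; the names are invented, and the constant values mentioned are only the usual defaults, with the real definitions living earlier in this file).

/*
 * Illustrative sketch of the merge-order computation with the clamps spelled
 * out.  Assumed typical values: TAPE_BUFFER_OVERHEAD = BLCKSZ = 8192,
 * MERGE_BUFFER_SIZE = 32 * BLCKSZ = 262144, MINORDER = 6, MAXORDER = 500.
 */
static int
sketch_merge_order(long long allowed_mem,
                   long long per_input_tape_cost,   /* buffer + workspace per input tape */
                   int min_order, int max_order)
{
    long long   m = allowed_mem / per_input_tape_cost;

    if (m < min_order)
        m = min_order;
    if (m > max_order)
        m = max_order;
    return (int) m;
}

For example, with allowed_mem of 64 MB and the assumed per-tape cost of 8192 + 262144 = 270336 bytes, this yields 67108864 / 270336 = 248 input tapes, which already lies inside [6, 500].
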
2646 
2647 /*
2648  * Helper function to calculate how much memory to allocate for the read buffer
2649  * of each input tape in a merge pass.
2650  *
2651  * 'avail_mem' is the amount of memory available for the buffers of all the
2652  * tapes, both input and output.
2653  * 'nInputTapes' and 'nInputRuns' are the number of input tapes and runs.
2654  * 'maxOutputTapes' is the max. number of output tapes we should produce.
2655  */
2656 static int64
2657 merge_read_buffer_size(int64 avail_mem, int nInputTapes, int nInputRuns,
2658  int maxOutputTapes)
2659 {
2660  int nOutputRuns;
2661  int nOutputTapes;
2662 
2663  /*
2664  * How many output tapes will we produce in this pass?
2665  *
2666  * This is nInputRuns / nInputTapes, rounded up.
2667  */
2668  nOutputRuns = (nInputRuns + nInputTapes - 1) / nInputTapes;
2669 
2670  nOutputTapes = Min(nOutputRuns, maxOutputTapes);
2671 
2672  /*
2673  * Each output tape consumes TAPE_BUFFER_OVERHEAD bytes of memory. All
2674  * remaining memory is divided evenly between the input tapes.
2675  *
2676  * This also follows from the formula in tuplesort_merge_order, but here
2677  * we derive the input buffer size from the amount of memory available,
2678  * and M and N.
2679  */
2680  return Max((avail_mem - TAPE_BUFFER_OVERHEAD * nOutputTapes) / nInputTapes, 0);
2681 }
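
As a concrete, illustrative instance of the division above (assuming the usual 8192-byte TAPE_BUFFER_OVERHEAD; the numbers are invented for the example):

/*
 * Worked example (illustrative): avail_mem = 32 MB, nInputTapes = 8,
 * nInputRuns = 20, maxOutputTapes = 8.  Then nOutputRuns = ceil(20 / 8) = 3,
 * so nOutputTapes = 3, and each of the 8 input tapes receives
 * (33554432 - 3 * 8192) / 8 = 4191232 bytes (about 4 MB) of read buffer.
 */
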
2682 
2683 /*
2684  * inittapes - initialize for tape sorting.
2685  *
2686  * This is called only if we have found we won't sort in memory.
2687  */
2688 static void
2689 inittapes(Tuplesortstate *state, bool mergeruns)
2690 {
2691  Assert(!LEADER(state));
2692 
2693  if (mergeruns)
2694  {
2695  /* Compute number of input tapes to use when merging */
2696  state->maxTapes = tuplesort_merge_order(state->allowedMem);
2697  }
2698  else
2699  {
2700  /* Workers can sometimes produce a single run and output it without merging */
2701  Assert(WORKER(state));
2702  state->maxTapes = MINORDER;
2703  }
2704 
2705 #ifdef TRACE_SORT
2706  if (trace_sort)
2707  elog(LOG, "worker %d switching to external sort with %d tapes: %s",
2708  state->worker, state->maxTapes, pg_rusage_show(&state->ru_start));
2709 #endif
2710 
2711  /* Create the tape set */
2712  inittapestate(state, state->maxTapes);
2713  state->tapeset =
2714  LogicalTapeSetCreate(false,
2715  state->shared ? &state->shared->fileset : NULL,
2716  state->worker);
2717 
2718  state->currentRun = 0;
2719 
2720  /*
2721  * Initialize logical tape arrays.
2722  */
2723  state->inputTapes = NULL;
2724  state->nInputTapes = 0;
2725  state->nInputRuns = 0;
2726 
2727  state->outputTapes = palloc0(state->maxTapes * sizeof(LogicalTape *));
2728  state->nOutputTapes = 0;
2729  state->nOutputRuns = 0;
2730 
2731  state->status = TSS_BUILDRUNS;
2732 
2733  selectnewtape(state);
2734 }
2735 
2736 /*
2737  * inittapestate - initialize generic tape management state
2738  */
2739 static void
2740 inittapestate(Tuplesortstate *state, int maxTapes)
2741 {
2742  int64 tapeSpace;
2743 
2744  /*
2745  * Decrease availMem to reflect the space needed for tape buffers; but
2746  * don't decrease it to the point that we have no room for tuples. (That
2747  * case is only likely to occur if sorting pass-by-value Datums; in all
2748  * other scenarios the memtuples[] array is unlikely to occupy more than
2749  * half of allowedMem. In the pass-by-value case it's not important to
2750  * account for tuple space, so we don't care if LACKMEM becomes
2751  * inaccurate.)
2752  */
2753  tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD;
2754 
2755  if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem)
2756  USEMEM(state, tapeSpace);
2757 
2758  /*
2759  * Make sure that the temp file(s) underlying the tape set are created in
2760  * suitable temp tablespaces. For parallel sorts, this should have been
2761  * called already, but it doesn't matter if it is called a second time.
2762  */
2763  PrepareTempTablespaces();
2764  }
2765 
2766 /*
2767  * selectnewtape -- select next tape to output to.
2768  *
2769  * This is called after finishing a run when we know another run
2770  * must be started. This is used both when building the initial
2771  * runs, and during merge passes.
2772  */
2773 static void
2774 selectnewtape(Tuplesortstate *state)
2775 {
2776  /*
2777  * At the beginning of each merge pass, nOutputTapes and nOutputRuns are
2778  * both zero. On each call, we create a new output tape to hold the next
2779  * run, until maxTapes is reached. After that, we assign new runs to the
2780  * existing tapes in a round robin fashion.
2781  */
2782  if (state->nOutputTapes < state->maxTapes)
2783  {
2784  /* Create a new tape to hold the next run */
2785  Assert(state->outputTapes[state->nOutputRuns] == NULL);
2786  Assert(state->nOutputRuns == state->nOutputTapes);
2787  state->destTape = LogicalTapeCreate(state->tapeset);
2788  state->outputTapes[state->nOutputTapes] = state->destTape;
2789  state->nOutputTapes++;
2790  state->nOutputRuns++;
2791  }
2792  else
2793  {
2794  /*
2795  * We have reached the max number of tapes. Append to an existing
2796  * tape.
2797  */
2798  state->destTape = state->outputTapes[state->nOutputRuns % state->nOutputTapes];
2799  state->nOutputRuns++;
2800  }
2801 }
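
Which tape a given run of a pass ends up on follows directly from the policy above; a tiny sketch (not part of tuplesort.c, names invented):

/* Illustrative sketch: tape index for the r'th run (0-based) of a pass. */
static int
sketch_tape_for_run(int r, int max_tapes)
{
    /* The first max_tapes runs each get a fresh tape; later runs wrap around. */
    return r % max_tapes;
}

With maxTapes = 4, runs 0 through 3 each open a new tape, while runs 4 and 5 are appended to tapes 0 and 1.
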
2802 
2803 /*
2804  * Initialize the slab allocation arena, for the given number of slots.
2805  */
2806 static void
2807 init_slab_allocator(Tuplesortstate *state, int numSlots)
2808 {
2809  if (numSlots > 0)
2810  {
2811  char *p;
2812  int i;
2813 
2814  state->slabMemoryBegin = palloc(numSlots * SLAB_SLOT_SIZE);
2815  state->slabMemoryEnd = state->slabMemoryBegin +
2816  numSlots * SLAB_SLOT_SIZE;
2817  state->slabFreeHead = (SlabSlot *) state->slabMemoryBegin;
2818  USEMEM(state, numSlots * SLAB_SLOT_SIZE);
2819 
2820  p = state->slabMemoryBegin;
2821  for (i = 0; i < numSlots - 1; i++)
2822  {
2823  ((SlabSlot *) p)->nextfree = (SlabSlot *) (p + SLAB_SLOT_SIZE);
2824  p += SLAB_SLOT_SIZE;
2825  }
2826  ((SlabSlot *) p)->nextfree = NULL;
2827  }
2828  else
2829  {
2830  state->slabMemoryBegin = state->slabMemoryEnd = NULL;
2831  state->slabFreeHead = NULL;
2832  }
2833  state->slabAllocatorUsed = true;
2834 }
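
The slab arena above keeps its free list threaded through the unused slots themselves. The following sketch is illustrative only (not part of tuplesort.c; the slot size, type and function names are invented) and shows the same threading plus the pop/push moves that RELEASE_SLAB_SLOT-style recycling relies on.

#include <stdlib.h>

#define SKETCH_SLOT_SIZE 1024

typedef union SketchSlot
{
    union SketchSlot *nextfree;         /* valid only while the slot is free */
    char              buffer[SKETCH_SLOT_SIZE];
} SketchSlot;

typedef struct
{
    char       *begin;
    char       *end;
    SketchSlot *freehead;
} SketchSlab;

/* Carve one contiguous arena into nslots fixed-size slots (assumes nslots > 0). */
static void
sketch_slab_init(SketchSlab *slab, int nslots)
{
    char   *p;
    int     i;

    slab->begin = malloc((size_t) nslots * SKETCH_SLOT_SIZE);
    slab->end = slab->begin + (size_t) nslots * SKETCH_SLOT_SIZE;
    slab->freehead = (SketchSlot *) slab->begin;

    /* Chain every slot onto the free list; the last one terminates it. */
    p = slab->begin;
    for (i = 0; i < nslots - 1; i++)
    {
        ((SketchSlot *) p)->nextfree = (SketchSlot *) (p + SKETCH_SLOT_SIZE);
        p += SKETCH_SLOT_SIZE;
    }
    ((SketchSlot *) p)->nextfree = NULL;
}

static void *
sketch_slab_alloc(SketchSlab *slab)
{
    SketchSlot *slot = slab->freehead;

    if (slot != NULL)
        slab->freehead = slot->nextfree;    /* pop the head of the free list */
    return slot;
}

static void
sketch_slab_free(SketchSlab *slab, void *ptr)
{
    SketchSlot *slot = (SketchSlot *) ptr;

    slot->nextfree = slab->freehead;        /* push back onto the free list */
    slab->freehead = slot;
}
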
2835 
2836 /*
2837  * mergeruns -- merge all the completed initial runs.
2838  *
2839  * This implements the Balanced k-Way Merge Algorithm. All input data has
2840  * already been written to initial runs on tape (see dumptuples).
2841  */
2842 static void
2843 mergeruns(Tuplesortstate *state)
2844 {
2845  int tapenum;
2846 
2847  Assert(state->status == TSS_BUILDRUNS);
2848  Assert(state->memtupcount == 0);
2849 
2850  if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL)
2851  {
2852  /*
2853  * If there are multiple runs to be merged, when we go to read back
2854  * tuples from disk, abbreviated keys will not have been stored, and
2855  * we don't care to regenerate them. Disable abbreviation from this
2856  * point on.
2857  */
2858  state->sortKeys->abbrev_converter = NULL;
2859  state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
2860 
2861  /* Not strictly necessary, but be tidy */
2862  state->sortKeys->abbrev_abort = NULL;
2863  state->sortKeys->abbrev_full_comparator = NULL;
2864  }
2865 
2866  /*
2867  * Reset tuple memory. We've freed all the tuples that we previously
2868  * allocated. We will use the slab allocator from now on.
2869  */
2870  MemoryContextResetOnly(state->tuplecontext);
2871 
2872  /*
2873  * We no longer need a large memtuples array. (We will allocate a smaller
2874  * one for the heap later.)
2875  */
2876  FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
2877  pfree(state->memtuples);
2878  state->memtuples = NULL;
2879 
2880  /*
2881  * Initialize the slab allocator. We need one slab slot per input tape,
2882  * for the tuples in the heap, plus one to hold the tuple last returned
2883  * from tuplesort_gettuple. (If we're sorting pass-by-val Datums,
2884  * however, we don't need to allocate anything.)
2885  *
2886  * In a multi-pass merge, we could shrink this allocation for the last
2887  * merge pass, if it has fewer tapes than previous passes, but we don't
2888  * bother.
2889  *
2890  * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism
2891  * to track memory usage of individual tuples.
2892  */
2893  if (state->tuples)
2894  init_slab_allocator(state, state->nOutputTapes + 1);
2895  else
2896  init_slab_allocator(state, 0);
2897 
2898  /*
2899  * Allocate a new 'memtuples' array, for the heap. It will hold one tuple
2900  * from each input tape.
2901  *
2902  * We could shrink this, too, between passes in a multi-pass merge, but we
2903  * don't bother. (The initial input tapes are still in outputTapes. The
2904  * number of input tapes will not increase between passes.)
2905  */
2906  state->memtupsize = state->nOutputTapes;
2907  state->memtuples = (SortTuple *) MemoryContextAlloc(state->maincontext,
2908  state->nOutputTapes * sizeof(SortTuple));
2909  USEMEM(state, GetMemoryChunkSpace(state->memtuples));
2910 
2911  /*
2912  * Use all the remaining memory we have available for tape buffers among
2913  * all the input tapes. At the beginning of each merge pass, we will
2914  * divide this memory between the input and output tapes in the pass.
2915  */
2916  state->tape_buffer_mem = state->availMem;
2917  USEMEM(state, state->tape_buffer_mem);
2918 #ifdef TRACE_SORT
2919  if (trace_sort)
2920  elog(LOG, "worker %d using %zu KB of memory for tape buffers",
2921  state->worker, state->tape_buffer_mem / 1024);
2922 #endif
2923 
2924  for (;;)
2925  {
2926  /*
2927  * On the first iteration, or if we have read all the runs from the
2928  * input tapes in a multi-pass merge, it's time to start a new pass.
2929  * Rewind all the output tapes, and make them inputs for the next
2930  * pass.
2931  */
2932  if (state->nInputRuns == 0)
2933  {
2934  int64 input_buffer_size;
2935 
2936  /* Close the old, emptied, input tapes */
2937  if (state->nInputTapes > 0)
2938  {
2939  for (tapenum = 0; tapenum < state->nInputTapes; tapenum++)
2940  LogicalTapeClose(state->inputTapes[tapenum]);
2941  pfree(state->inputTapes);
2942  }
2943 
2944  /* Previous pass's outputs become next pass's inputs. */
2945  state->inputTapes = state->outputTapes;
2946  state->nInputTapes = state->nOutputTapes;
2947  state->nInputRuns = state->nOutputRuns;
2948 
2949  /*
2950  * Reset output tape variables. The actual LogicalTapes will be
2951  * created as needed, here we only allocate the array to hold
2952  * them.
2953  */
2954  state->outputTapes = palloc0(state->nInputTapes * sizeof(LogicalTape *));
2955  state->nOutputTapes = 0;
2956  state->nOutputRuns = 0;
2957 
2958  /*
2959  * Redistribute the memory allocated for tape buffers, among the
2960  * new input and output tapes.
2961  */
2962  input_buffer_size = merge_read_buffer_size(state->tape_buffer_mem,
2963  state->nInputTapes,
2964  state->nInputRuns,
2965  state->maxTapes);
2966 
2967 #ifdef TRACE_SORT
2968  if (trace_sort)
2969  elog(LOG, "starting merge pass of %d input runs on %d tapes, " INT64_FORMAT " KB of memory for each input tape: %s",
2970  state->nInputRuns, state->nInputTapes, input_buffer_size / 1024,
2971  pg_rusage_show(&state->ru_start));
2972 #endif
2973 
2974  /* Prepare the new input tapes for merge pass. */
2975  for (tapenum = 0; tapenum < state->nInputTapes; tapenum++)
2976  LogicalTapeRewindForRead(state->inputTapes[tapenum], input_buffer_size);
2977 
2978  /*
2979  * If there's just one run left on each input tape, then only one
2980  * merge pass remains. If we don't have to produce a materialized
2981  * sorted tape, we can stop at this point and do the final merge
2982  * on-the-fly.
2983  */
2984  if (!state->randomAccess && state->nInputRuns <= state->nInputTapes
2985  && !WORKER(state))
2986  {
2987  /* Tell logtape.c we won't be writing anymore */
2988  LogicalTapeSetForgetFreeSpace(state->tapeset);
2989  /* Initialize for the final merge pass */
2990  beginmerge(state);
2991  state->status = TSS_FINALMERGE;
2992  return;
2993  }
2994  }
2995 
2996  /* Select an output tape */
2997  selectnewtape(state);
2998 
2999  /* Merge one run from each input tape. */
3000  mergeonerun(state);
3001 
3002  /*
3003  * If the input tapes are empty, and we output only one output run,
3004  * we're done. The current output tape contains the final result.
3005  */
3006  if (state->nInputRuns == 0 && state->nOutputRuns <= 1)
3007  break;
3008  }
3009 
3010  /*
3011  * Done. The result is on a single run on a single tape.
3012  */
3013  state->result_tape = state->outputTapes[0];
3014  if (!WORKER(state))
3015  LogicalTapeFreeze(state->result_tape, NULL);
3016  else
3017  worker_freeze_result_tape(state);
3018  state->status = TSS_SORTEDONTAPE;
3019 
3020  /* Close all the now-empty input tapes, to release their read buffers. */
3021  for (tapenum = 0; tapenum < state->nInputTapes; tapenum++)
3022  LogicalTapeClose(state->inputTapes[tapenum]);
3023 }
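
The pass structure of the loop above determines how many passes a sort needs: each pass replaces up to "merge order" runs with one run, so the run count shrinks by roughly that factor per pass. A minimal sketch (not part of tuplesort.c; names invented):

/* Illustrative sketch: merge passes needed for nruns initial runs at a given order. */
static int
sketch_merge_passes(int nruns, int order)
{
    int passes = 0;

    while (nruns > 1)
    {
        nruns = (nruns + order - 1) / order;    /* ceil(nruns / order) */
        passes++;
    }
    return passes;
}

For example, 1000 initial runs merged 248 ways finish in two passes (1000 -> 5 -> 1), while the same runs merged 6 ways need four (1000 -> 167 -> 28 -> 5 -> 1).
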
3024 
3025 /*
3026  * Merge one run from each input tape.
3027  */
3028 static void
3029 mergeonerun(Tuplesortstate *state)
3030 {
3031  int srcTapeIndex;
3032  LogicalTape *srcTape;
3033 
3034  /*
3035  * Start the merge by loading one tuple from each active source tape into
3036  * the heap.
3037  */
3038  beginmerge(state);
3039 
3040  /*
3041  * Execute merge by repeatedly extracting lowest tuple in heap, writing it
3042  * out, and replacing it with next tuple from same tape (if there is
3043  * another one).
3044  */
3045  while (state->memtupcount > 0)
3046  {
3047  SortTuple stup;
3048 
3049  /* write the tuple to destTape */
3050  srcTapeIndex = state->memtuples[0].srctape;
3051  srcTape = state->inputTapes[srcTapeIndex];
3052  WRITETUP(state, state->destTape, &state->memtuples[0]);
3053 
3054  /* recycle the slot of the tuple we just wrote out, for the next read */
3055  if (state->memtuples[0].tuple)
3056  RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple);
3057 
3058  /*
3059  * pull next tuple from the tape, and replace the written-out tuple in
3060  * the heap with it.
3061  */
3062  if (mergereadnext(state, srcTape, &stup))
3063  {
3064  stup.srctape = srcTapeIndex;
3065  tuplesort_heap_replace_top(state, &stup);
3066 
3067  }
3068  else
3069  {
3070  tuplesort_heap_delete_top(state);
3071  state->nInputRuns--;
3072  }
3073  }
3074 
3075  /*
3076  * When the heap empties, we're done. Write an end-of-run marker on the
3077  * output tape.
3078  */
3079  markrunend(state->destTape);
3080 }
3081 
3082 /*
3083  * beginmerge - initialize for a merge pass
3084  *
3085  * Fill the merge heap with the first tuple from each input tape.
3086  */
3087 static void
3088 beginmerge(Tuplesortstate *state)
3089 {
3090  int activeTapes;
3091  int srcTapeIndex;
3092 
3093  /* Heap should be empty here */
3094  Assert(state->memtupcount == 0);
3095 
3096  activeTapes = Min(state->nInputTapes, state->nInputRuns);
3097 
3098  for (srcTapeIndex = 0; srcTapeIndex < activeTapes; srcTapeIndex++)
3099  {
3100  SortTuple tup;
3101 
3102  if (mergereadnext(state, state->inputTapes[srcTapeIndex], &tup))
3103  {
3104  tup.srctape = srcTapeIndex;
3105  tuplesort_heap_insert(state, &tup);
3106  }
3107  }
3108 }
3109 
3110 /*
3111  * mergereadnext - read next tuple from one merge input tape
3112  *
3113  * Returns false on EOF.
3114  */
3115 static bool
3116 mergereadnext(Tuplesortstate *state, LogicalTape *srcTape, SortTuple *stup)
3117 {
3118  unsigned int tuplen;
3119 
3120  /* read next tuple, if any */
3121  if ((tuplen = getlen(srcTape, true)) == 0)
3122  return false;
3123  READTUP(state, stup, srcTape, tuplen);
3124 
3125  return true;
3126 }
3127 
3128 /*
3129  * dumptuples - remove tuples from memtuples and write initial run to tape
3130  *
3131  * When alltuples = true, dump everything currently in memory. (This case is
3132  * only used at end of input data.)
3133  */
3134 static void
3135 dumptuples(Tuplesortstate *state, bool alltuples)
3136 {
3137  int memtupwrite;
3138  int i;
3139 
3140  /*
3141  * Nothing to do if we still fit in available memory and have array slots,
3142  * unless this is the final call during initial run generation.
3143  */
3144  if (state->memtupcount < state->memtupsize && !LACKMEM(state) &&
3145  !alltuples)
3146  return;
3147 
3148  /*
3149  * Final call might require no sorting, in rare cases where we just so
3150  * happen to have previously LACKMEM()'d at the point where exactly all
3151  * remaining tuples are loaded into memory, just before input was
3152  * exhausted. In general, short final runs are quite possible, but avoid
3153  * creating a completely empty run. In a worker, though, we must produce
3154  * at least one tape, even if it's empty.
3155  */
3156  if (state->memtupcount == 0 && state->currentRun > 0)
3157  return;
3158 
3159  Assert(state->status == TSS_BUILDRUNS);
3160 
3161  /*
3162  * It seems unlikely that this limit will ever be exceeded, but take no
3163  * chances
3164  */
3165  if (state->currentRun == INT_MAX)
3166  ereport(ERROR,
3167  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
3168  errmsg("cannot have more than %d runs for an external sort",
3169  INT_MAX)));
3170 
3171  if (state->currentRun > 0)
3172  selectnewtape(state);
3173 
3174  state->currentRun++;
3175 
3176 #ifdef TRACE_SORT
3177  if (trace_sort)
3178  elog(LOG, "worker %d starting quicksort of run %d: %s",
3179  state->worker, state->currentRun,
3180  pg_rusage_show(&state->ru_start));
3181 #endif
3182 
3183  /*
3184  * Sort all tuples accumulated within the allowed amount of memory for
3185  * this run using quicksort
3186  */
3187  tuplesort_sort_memtuples(state);
3188 
3189 #ifdef TRACE_SORT
3190  if (trace_sort)
3191  elog(LOG, "worker %d finished quicksort of run %d: %s",
3192  state->worker, state->currentRun,
3193  pg_rusage_show(&state->ru_start));
3194 #endif
3195 
3196  memtupwrite = state->memtupcount;
3197  for (i = 0; i < memtupwrite; i++)
3198  {
3199  WRITETUP(state, state->destTape, &state->memtuples[i]);
3200  state->memtupcount--;
3201  }
3202 
3203  /*
3204  * Reset tuple memory. We've freed all of the tuples that we previously
3205  * allocated. It's important to avoid fragmentation when there is a stark
3206  * change in the sizes of incoming tuples. Fragmentation due to
3207  * AllocSetFree's bucketing by size class might be particularly bad if
3208  * this step wasn't taken.
3209  */
3210  MemoryContextReset(state->tuplecontext);
3211 
3212  markrunend(state->destTape);
3213 
3214 #ifdef TRACE_SORT
3215  if (trace_sort)
3216  elog(LOG, "worker %d finished writing run %d to tape %d: %s",
3217  state->worker, state->currentRun, (state->currentRun - 1) % state->nOutputTapes + 1,
3218  pg_rusage_show(&state->ru_start));
3219 #endif
3220 }
3221 
3222 /*
3223  * tuplesort_rescan - rewind and replay the scan
3224  */
3225 void
3226 tuplesort_rescan(Tuplesortstate *state)
3227 {
3228  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
3229 
3230  Assert(state->randomAccess);
3231 
3232  switch (state->status)
3233  {
3234  case TSS_SORTEDINMEM:
3235  state->current = 0;
3236  state->eof_reached = false;
3237  state->markpos_offset = 0;
3238  state->markpos_eof = false;
3239  break;
3240  case TSS_SORTEDONTAPE:
3241  LogicalTapeRewindForRead(state->result_tape, 0);
3242  state->eof_reached = false;
3243  state->markpos_block = 0L;
3244  state->markpos_offset = 0;
3245  state->markpos_eof = false;
3246  break;
3247  default:
3248  elog(ERROR, "invalid tuplesort state");
3249  break;
3250  }
3251 
3252  MemoryContextSwitchTo(oldcontext);
3253 }
3254 
3255 /*
3256  * tuplesort_markpos - saves current position in the merged sort file
3257  */
3258 void
3259 tuplesort_markpos(Tuplesortstate *state)
3260 {
3261  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
3262 
3263  Assert(state->randomAccess);
3264 
3265  switch (state->status)
3266  {
3267  case TSS_SORTEDINMEM:
3268  state->markpos_offset = state->current;
3269  state->markpos_eof = state->eof_reached;
3270  break;
3271  case TSS_SORTEDONTAPE:
3272  LogicalTapeTell(state->result_tape,
3273  &state->markpos_block,
3274  &state->markpos_offset);
3275  state->markpos_eof = state->eof_reached;
3276  break;
3277  default:
3278  elog(ERROR, "invalid tuplesort state");
3279  break;
3280  }
3281 
3282  MemoryContextSwitchTo(oldcontext);
3283 }
3284 
3285 /*
3286  * tuplesort_restorepos - restores current position in merged sort file to
3287  * last saved position
3288  */
3289 void
3290 tuplesort_restorepos(Tuplesortstate *state)
3291 {
3292  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
3293 
3294  Assert(state->randomAccess);
3295 
3296  switch (state->status)
3297  {
3298  case TSS_SORTEDINMEM:
3299  state->current = state->markpos_offset;
3300  state->eof_reached = state->markpos_eof;
3301  break;
3302  case TSS_SORTEDONTAPE:
3303  LogicalTapeSeek(state->result_tape,
3304  state->markpos_block,
3305  state->markpos_offset);
3306  state->eof_reached = state->markpos_eof;
3307  break;
3308  default:
3309  elog(ERROR, "invalid tuplesort state");
3310  break;
3311  }
3312 
3313  MemoryContextSwitchTo(oldcontext);
3314 }
3315 
3316 /*
3317  * tuplesort_get_stats - extract summary statistics
3318  *
3319  * This can be called after tuplesort_performsort() finishes to obtain
3320  * printable summary information about how the sort was performed.
3321  */
3322 void
3323 tuplesort_get_stats(Tuplesortstate *state,
3324  TuplesortInstrumentation *stats)
3325 {
3326  /*
3327  * Note: it might seem we should provide both memory and disk usage for a
3328  * disk-based sort. However, the current code doesn't track memory space
3329  * accurately once we have begun to return tuples to the caller (since we
3330  * don't account for pfree's the caller is expected to do), so we cannot
3331  * rely on availMem in a disk sort. This does not seem worth the overhead
3332  * to fix. Is it worth creating an API for the memory context code to
3333  * tell us how much is actually used in sortcontext?
3334  */
3335  tuplesort_updatemax(state);
3336 
3337  if (state->isMaxSpaceDisk)
3338  stats->spaceType = SORT_SPACE_TYPE_DISK;
3339  else
3340  stats->spaceType = SORT_SPACE_TYPE_MEMORY;
3341  stats->spaceUsed = (state->maxSpace + 1023) / 1024;
3342 
3343  switch (state->maxSpaceStatus)
3344  {
3345  case TSS_SORTEDINMEM:
3346  if (state->boundUsed)
3347  stats->sortMethod = SORT_TYPE_TOP_N_HEAPSORT;
3348  else
3349  stats->sortMethod = SORT_TYPE_QUICKSORT;
3350  break;
3351  case TSS_SORTEDONTAPE:
3352  stats->sortMethod = SORT_TYPE_EXTERNAL_SORT;
3353  break;
3354  case TSS_FINALMERGE:
3355  stats->sortMethod = SORT_TYPE_EXTERNAL_MERGE;
3356  break;
3357  default:
3358  stats->sortMethod = SORT_TYPE_STILL_IN_PROGRESS;
3359  break;
3360  }
3361 }
3362 
3363 /*
3364  * Convert TuplesortMethod to a string.
3365  */
3366 const char *
3367 tuplesort_method_name(TuplesortMethod m)
3368 {
3369  switch (m)
3370  {
3371  case SORT_TYPE_STILL_IN_PROGRESS:
3372  return "still in progress";
3373  case SORT_TYPE_TOP_N_HEAPSORT:
3374  return "top-N heapsort";
3375  case SORT_TYPE_QUICKSORT:
3376  return "quicksort";
3377  case SORT_TYPE_EXTERNAL_SORT:
3378  return "external sort";
3379  case SORT_TYPE_EXTERNAL_MERGE:
3380  return "external merge";
3381  }
3382 
3383  return "unknown";
3384 }
3385 
3386 /*
3387  * Convert TuplesortSpaceType to a string.
3388  */
3389 const char *
3390 tuplesort_space_type_name(TuplesortSpaceType t)
3391 {
3392  Assert(t == SORT_SPACE_TYPE_DISK || t == SORT_SPACE_TYPE_MEMORY);
3393  return t == SORT_SPACE_TYPE_DISK ? "Disk" : "Memory";
3394 }
3395 
3396 
3397 /*
3398  * Heap manipulation routines, per Knuth's Algorithm 5.2.3H.
3399  */
3400 
3401 /*
3402  * Convert the existing unordered array of SortTuples to a bounded heap,
3403  * discarding all but the smallest "state->bound" tuples.
3404  *
3405  * When working with a bounded heap, we want to keep the largest entry
3406  * at the root (array entry zero), instead of the smallest as in the normal
3407  * sort case. This allows us to discard the largest entry cheaply.
3408  * Therefore, we temporarily reverse the sort direction.
3409  */
3410 static void
3411 make_bounded_heap(Tuplesortstate *state)
3412 {
3413  int tupcount = state->memtupcount;
3414  int i;
3415 
3416  Assert(state->status == TSS_INITIAL);
3417  Assert(state->bounded);
3418  Assert(tupcount >= state->bound);
3419  Assert(SERIAL(state));
3420 
3421  /* Reverse sort direction so largest entry will be at root */
3422  reversedirection(state);
3423 
3424  state->memtupcount = 0; /* make the heap empty */
3425  for (i = 0; i < tupcount; i++)
3426  {
3427  if (state->memtupcount < state->bound)
3428  {
3429  /* Insert next tuple into heap */
3430  /* Must copy source tuple to avoid possible overwrite */
3431  SortTuple stup = state->memtuples[i];
3432 
3433  tuplesort_heap_insert(state, &stup);
3434  }
3435  else
3436  {
3437  /*
3438  * The heap is full. Replace the largest entry with the new
3439  * tuple, or just discard it, if it's larger than anything already
3440  * in the heap.
3441  */
3442  if (COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0)
3443  {
3444  free_sort_tuple(state, &state->memtuples[i]);
3445  CHECK_FOR_INTERRUPTS();
3446  }
3447  else
3448  tuplesort_heap_replace_top(state, &state->memtuples[i]);
3449  }
3450  }
3451 
3452  Assert(state->memtupcount == state->bound);
3453  state->status = TSS_BOUNDED;
3454 }
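/*
 * Worked example (invented values, added for illustration): with bound = 2
 * and an ascending sort over the input 5, 1, 4, 2, the reversed-direction
 * heap built above retains the two smallest values seen so far, keeping the
 * larger of them at the root so it can be evicted cheaply:
 *
 *   add 5                 heap = {5}
 *   add 1                 heap = {5, 1}   (root = 5)
 *   4 < root 5            replace top ->  {4, 1}   (root = 4)
 *   2 < root 4            replace top ->  {2, 1}   (root = 2)
 *
 * A tuple comparing >= the current root is simply freed and skipped.
 * sort_bounded_heap() below then unheapifies {2, 1} into the final order 1, 2.
 */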
3455 
3456 /*
3457  * Convert the bounded heap to a properly-sorted array
3458  */
3459 static void
3460 sort_bounded_heap(Tuplesortstate *state)
3461 {
3462  int tupcount = state->memtupcount;
3463 
3464  Assert(state->status == TSS_BOUNDED);
3465  Assert(state->bounded);
3466  Assert(tupcount == state->bound);
3467  Assert(SERIAL(state));
3468 
3469  /*
3470  * We can unheapify in place because each delete-top call will remove the
3471  * largest entry, which we can promptly store in the newly freed slot at
3472  * the end. Once we're down to a single-entry heap, we're done.
3473  */
3474  while (state->memtupcount > 1)
3475  {
3476  SortTuple stup = state->memtuples[0];
3477 
3478  /* this sifts-up the next-largest entry and decreases memtupcount */
3479  tuplesort_heap_delete_top(state);
3480  state->memtuples[state->memtupcount] = stup;
3481  }
3482  state->memtupcount = tupcount;
3483 
3484  /*
3485  * Reverse sort direction back to the original state. This is not
3486  * actually necessary but seems like a good idea for tidiness.
3487  */
3488  reversedirection(state);
3489 
3490  state->status = TSS_SORTEDINMEM;
3491  state->boundUsed = true;
3492 }
3493 
3494 /*
3495  * Sort all memtuples using specialized qsort() routines.
3496  *
3497  * Quicksort is used for small in-memory sorts, and external sort runs.
3498  */
3499 static void
3500 tuplesort_sort_memtuples(Tuplesortstate *state)
3501 {
3502  Assert(!LEADER(state));
3503 
3504  if (state->memtupcount > 1)
3505  {
3506  /* Can we use the single-key sort function? */
3507  if (state->onlyKey != NULL)
3508  qsort_ssup(state->memtuples, state->memtupcount,
3509  state->onlyKey);
3510  else
3511  qsort_tuple(state->memtuples,
3512  state->memtupcount,
3513  state->comparetup,
3514  state);
3515  }
3516 }
3517 
3518 /*
3519  * Insert a new tuple into an empty or existing heap, maintaining the
3520  * heap invariant. Caller is responsible for ensuring there's room.
3521  *
3522  * Note: For some callers, tuple points to a memtuples[] entry above the
3523  * end of the heap. This is safe as long as it's not immediately adjacent
3524  * to the end of the heap (ie, in the [memtupcount] array entry) --- if it
3525  * is, it might get overwritten before being moved into the heap!
3526  */
3527 static void
3528 tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple)
3529 {
3530  SortTuple *memtuples;
3531  int j;
3532 
3533  memtuples = state->memtuples;
3534  Assert(state->memtupcount < state->memtupsize);
3535 
3536  CHECK_FOR_INTERRUPTS();
3537 
3538  /*
3539  * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is
3540  * using 1-based array indexes, not 0-based.
3541  */
3542  j = state->memtupcount++;
3543  while (j > 0)
3544  {
3545  int i = (j - 1) >> 1;
3546 
3547  if (COMPARETUP(state, tuple, &memtuples[i]) >= 0)
3548  break;
3549  memtuples[j] = memtuples[i];
3550  j = i;
3551  }
3552  memtuples[j] = *tuple;
3553 }
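/*
 * Minimal sketch of the same sift-up insertion on a plain int array, added
 * for illustration only ("heap_insert_int" and its arguments are
 * hypothetical and not part of tuplesort.c):
 *
 *     static void
 *     heap_insert_int(int *heap, int *count, int val)
 *     {
 *         int j = (*count)++;           // new entry starts in the hole at the end
 *
 *         while (j > 0)
 *         {
 *             int i = (j - 1) / 2;      // parent of slot j
 *
 *             if (val >= heap[i])       // parent already no larger: stop
 *                 break;
 *             heap[j] = heap[i];        // pull parent down into the hole
 *             j = i;
 *         }
 *         heap[j] = val;                // drop the new value into the hole
 *     }
 */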
3554 
3555 /*
3556  * Remove the tuple at state->memtuples[0] from the heap. Decrement
3557  * memtupcount, and sift up to maintain the heap invariant.
3558  *
3559  * The caller has already free'd the tuple the top node points to,
3560  * if necessary.
3561  */
3562 static void
3563 tuplesort_heap_delete_top(Tuplesortstate *state)
3564 {
3565  SortTuple *memtuples = state->memtuples;
3566  SortTuple *tuple;
3567 
3568  if (--state->memtupcount <= 0)
3569  return;
3570 
3571  /*
3572  * Remove the last tuple in the heap, and re-insert it, by replacing the
3573  * current top node with it.
3574  */
3575  tuple = &memtuples[state->memtupcount];
3576  tuplesort_heap_replace_top(state, tuple);
3577 }
3578 
3579 /*
3580  * Replace the tuple at state->memtuples[0] with a new tuple. Sift up to
3581  * maintain the heap invariant.
3582  *
3583  * This corresponds to Knuth's "sift-up" algorithm (Algorithm 5.2.3H,
3584  * Heapsort, steps H3-H8).
3585  */
3586 static void
3587 tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple)
3588 {
3589  SortTuple *memtuples = state->memtuples;
3590  unsigned int i,
3591  n;
3592 
3593  Assert(state->memtupcount >= 1);
3594 
3595  CHECK_FOR_INTERRUPTS();
3596 
3597  /*
3598  * state->memtupcount is "int", but we use "unsigned int" for i, j, n.
3599  * This prevents overflow in the "2 * i + 1" calculation, since at the top
3600  * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2.
3601  */
3602  n = state->memtupcount;
3603  i = 0; /* i is where the "hole" is */
3604  for (;;)
3605  {
3606  unsigned int j = 2 * i + 1;
3607 
3608  if (j >= n)
3609  break;
3610  if (j + 1 < n &&
3611  COMPARETUP(state, &memtuples[j], &memtuples[j + 1]) > 0)
3612  j++;
3613  if (COMPARETUP(state, tuple, &memtuples[j]) <= 0)
3614  break;
3615  memtuples[i] = memtuples[j];
3616  i = j;
3617  }
3618  memtuples[i] = *tuple;
3619 }
3620 
3621 /*
3622  * Function to reverse the sort direction from its current state
3623  *
3624  * It is not safe to call this when performing hash tuplesorts
3625  */
3626 static void
3627 reversedirection(Tuplesortstate *state)
3628 {
3629  SortSupport sortKey = state->sortKeys;
3630  int nkey;
3631 
3632  for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++)
3633  {
3634  sortKey->ssup_reverse = !sortKey->ssup_reverse;
3635  sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first;
3636  }
3637 }
3638 
3639 
3640 /*
3641  * Tape interface routines
3642  */
3643 
3644 static unsigned int
3645 getlen(LogicalTape *tape, bool eofOK)
3646 {
3647  unsigned int len;
3648 
3649  if (LogicalTapeRead(tape,
3650  &len, sizeof(len)) != sizeof(len))
3651  elog(ERROR, "unexpected end of tape");
3652  if (len == 0 && !eofOK)
3653  elog(ERROR, "unexpected end of data");
3654  return len;
3655 }
3656 
3657 static void
3658 markrunend(LogicalTape *tape)
3659 {
3660  unsigned int len = 0;
3661 
3662  LogicalTapeWrite(tape, (void *) &len, sizeof(len));
3663 }
3664 
3665 /*
3666  * Get memory for tuple from within READTUP() routine.
3667  *
3668  * We use next free slot from the slab allocator, or palloc() if the tuple
3669  * is too large for that.
3670  */
3671 static void *
3672 readtup_alloc(Tuplesortstate *state, Size tuplen)
3673 {
3674  SlabSlot *buf;
3675 
3676  /*
3677  * We pre-allocate enough slots in the slab arena that we should never run
3678  * out.
3679  */
3680  Assert(state->slabFreeHead);
3681 
3682  if (tuplen > SLAB_SLOT_SIZE || !state->slabFreeHead)
3683  return MemoryContextAlloc(state->sortcontext, tuplen);
3684  else
3685  {
3686  buf = state->slabFreeHead;
3687  /* Reuse this slot */
3688  state->slabFreeHead = buf->nextfree;
3689 
3690  return buf;
3691  }
3692 }
3693 
3694 
3695 /*
3696  * Routines specialized for HeapTuple (actually MinimalTuple) case
3697  */
3698 
3699 static int
3700 comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
3701 {
3702  SortSupport sortKey = state->sortKeys;
3703  HeapTupleData ltup;
3704  HeapTupleData rtup;
3705  TupleDesc tupDesc;
3706  int nkey;
3707  int32 compare;
3708  AttrNumber attno;
3709  Datum datum1,
3710  datum2;
3711  bool isnull1,
3712  isnull2;
3713 
3714 
3715  /* Compare the leading sort key */
3716  compare = ApplySortComparator(a->datum1, a->isnull1,
3717  b->datum1, b->isnull1,
3718  sortKey);
3719  if (compare != 0)
3720  return compare;
3721 
3722  /* Compare additional sort keys */
3723  ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
3724  ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET);
3725  rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
3726  rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET);
3727  tupDesc = state->tupDesc;
3728 
3729  if (sortKey->abbrev_converter)
3730  {
3731  attno = sortKey->ssup_attno;
3732 
3733  datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
3734  datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
3735 
3736  compare = ApplySortAbbrevFullComparator(datum1, isnull1,
3737  datum2, isnull2,
3738  sortKey);
3739  if (compare != 0)
3740  return compare;
3741  }
3742 
3743  sortKey++;
3744  for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++)
3745  {
3746  attno = sortKey->ssup_attno;
3747 
3748  datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
3749  datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
3750 
3751  compare = ApplySortComparator(datum1, isnull1,
3752  datum2, isnull2,
3753  sortKey);
3754  if (compare != 0)
3755  return compare;
3756  }
3757 
3758  return 0;
3759 }
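/*
 * Note added for illustration: datum1 holds either the leading column's
 * value or, when an abbrev_converter is in use, its abbreviated form.
 * Abbreviated comparisons can only prove inequality; on a tie the
 * ApplySortAbbrevFullComparator() call above re-checks the authoritative
 * leading values fetched from the tuples before the remaining sort keys are
 * compared the ordinary way.
 */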
3760 
3761 static void
3762 copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup)
3763 {
3764  /*
3765  * We expect the passed "tup" to be a TupleTableSlot, and form a
3766  * MinimalTuple using the exported interface for that.
3767  */
3768  TupleTableSlot *slot = (TupleTableSlot *) tup;
3769  Datum original;
3770  MinimalTuple tuple;
3771  HeapTupleData htup;
3772  MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
3773 
3774  /* copy the tuple into sort storage */
3775  tuple = ExecCopySlotMinimalTuple(slot);
3776  stup->tuple = (void *) tuple;
3777  USEMEM(state, GetMemoryChunkSpace(tuple));
3778  /* set up first-column key value */
3779  htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
3780  htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
3781  original = heap_getattr(&htup,
3782  state->sortKeys[0].ssup_attno,
3783  state->tupDesc,
3784  &stup->isnull1);
3785 
3786  MemoryContextSwitchTo(oldcontext);
3787 
3788  if (!state->sortKeys->abbrev_converter || stup->isnull1)
3789  {
3790  /*
3791  * Store ordinary Datum representation, or NULL value. If there is a
3792  * converter it won't expect NULL values, and cost model is not
3793  * required to account for NULL, so in that case we avoid calling
3794  * converter and just set datum1 to zeroed representation (to be
3795  * consistent, and to support cheap inequality tests for NULL
3796  * abbreviated keys).
3797  */
3798  stup->datum1 = original;
3799  }
3800  else if (!consider_abort_common(state))
3801  {
3802  /* Store abbreviated key representation */
3803  stup->datum1 = state->sortKeys->abbrev_converter(original,
3804  state->sortKeys);
3805  }
3806  else
3807  {
3808  /* Abort abbreviation */
3809  int i;
3810 
3811  stup->datum1 = original;
3812 
3813  /*
3814  * Set state to be consistent with never trying abbreviation.
3815  *
3816  * Alter datum1 representation in already-copied tuples, so as to
3817  * ensure a consistent representation (current tuple was just
3818  * handled). It does not matter if some dumped tuples are already
3819  * sorted on tape, since serialized tuples lack abbreviated keys
3820  * (TSS_BUILDRUNS state prevents control reaching here in any case).
3821  */
3822  for (i = 0; i < state->memtupcount; i++)
3823  {
3824  SortTuple *mtup = &state->memtuples[i];
3825 
3826  htup.t_len = ((MinimalTuple) mtup->tuple)->t_len +
3827  MINIMAL_TUPLE_OFFSET;
3828  htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple -
3829  MINIMAL_TUPLE_OFFSET);
3830 
3831  mtup->datum1 = heap_getattr(&htup,
3832  state->sortKeys[0].ssup_attno,
3833  state->tupDesc,
3834  &mtup->isnull1);
3835  }
3836  }
3837 }
3838 
3839 static void
3840 writetup_heap(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
3841 {
3842  MinimalTuple tuple = (MinimalTuple) stup->tuple;
3843 
3844  /* the part of the MinimalTuple we'll write: */
3845  char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
3846  unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET;
3847 
3848  /* total on-disk footprint: */
3849  unsigned int tuplen = tupbodylen + sizeof(int);
3850 
3851  LogicalTapeWrite(tape, (void *) &tuplen, sizeof(tuplen));
3852  LogicalTapeWrite(tape, (void *) tupbody, tupbodylen);
3853  if (state->randomAccess) /* need trailing length word? */
3854  LogicalTapeWrite(tape, (void *) &tuplen, sizeof(tuplen));
3855 
3856  if (!state->slabAllocatorUsed)
3857  {
3858  FREEMEM(state, GetMemoryChunkSpace(tuple));
3859  heap_free_minimal_tuple(tuple);
3860  }
3861 }
3862 
3863 static void
3864 readtup_heap(Tuplesortstate *state, SortTuple *stup,
3865  LogicalTape *tape, unsigned int len)
3866 {
3867  unsigned int tupbodylen = len - sizeof(int);
3868  unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET;
3869  MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tuplen);
3870  char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
3871  HeapTupleData htup;
3872 
3873  /* read in the tuple proper */
3874  tuple->t_len = tuplen;
3875  LogicalTapeReadExact(tape, tupbody, tupbodylen);
3876  if (state->randomAccess) /* need trailing length word? */
3877  LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen));
3878  stup->tuple = (void *) tuple;
3879  /* set up first-column key value */
3880  htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
3881  htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
3882  stup->datum1 = heap_getattr(&htup,
3883  state->sortKeys[0].ssup_attno,
3884  state->tupDesc,
3885  &stup->isnull1);
3886 }
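/*
 * On-tape layout used by writetup_heap()/readtup_heap() above (summary added
 * for illustration): an unsigned int holding the total length (tuple body
 * plus the length word itself), then the MinimalTuple body starting at
 * MINIMAL_TUPLE_DATA_OFFSET, and, only when randomAccess is set, the same
 * length word repeated so that the tuple can also be read back-to-front.
 */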
3887 
3888 /*
3889  * Routines specialized for the CLUSTER case (HeapTuple data, with
3890  * comparisons per a btree index definition)
3891  */
3892 
3893 static int
3894 comparetup_cluster(const SortTuple *a, const SortTuple *b,
3895  Tuplesortstate *state)
3896 {
3897  SortSupport sortKey = state->sortKeys;
3898  HeapTuple ltup;
3899  HeapTuple rtup;
3900  TupleDesc tupDesc;
3901  int nkey;
3902  int32 compare;
3903  Datum datum1,
3904  datum2;
3905  bool isnull1,
3906  isnull2;
3907  AttrNumber leading = state->indexInfo->ii_IndexAttrNumbers[0];
3908 
3909  /* Be prepared to compare additional sort keys */
3910  ltup = (HeapTuple) a->tuple;
3911  rtup = (HeapTuple) b->tuple;
3912  tupDesc = state->tupDesc;
3913 
3914  /* Compare the leading sort key, if it's simple */
3915  if (leading != 0)
3916  {
3917  compare = ApplySortComparator(a->datum1, a->isnull1,
3918  b->datum1, b->isnull1,
3919  sortKey);
3920  if (compare != 0)
3921  return compare;
3922 
3923  if (sortKey->abbrev_converter)
3924  {
3925  datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1);
3926  datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2);
3927 
3928  compare = ApplySortAbbrevFullComparator(datum1, isnull1,
3929  datum2, isnull2,
3930  sortKey);
3931  }
3932  if (compare != 0 || state->nKeys == 1)
3933  return compare;
3934  /* Compare additional columns the hard way */
3935  sortKey++;
3936  nkey = 1;
3937  }
3938  else
3939  {
3940  /* Must compare all keys the hard way */
3941  nkey = 0;
3942  }
3943 
3944  if (state->indexInfo->ii_Expressions == NULL)
3945  {
3946  /* If not expression index, just compare the proper heap attrs */
3947 
3948  for (; nkey < state->nKeys; nkey++, sortKey++)
3949  {
3950  AttrNumber attno = state->indexInfo->ii_IndexAttrNumbers[nkey];
3951 
3952  datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1);
3953  datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2);
3954 
3955  compare = ApplySortComparator(datum1, isnull1,
3956  datum2, isnull2,
3957  sortKey);
3958  if (compare != 0)
3959  return compare;
3960  }
3961  }
3962  else
3963  {
3964  /*
3965  * In the expression index case, compute the whole index tuple and
3966  * then compare values. It would perhaps be faster to compute only as
3967  * many columns as we need to compare, but that would require
3968  * duplicating all the logic in FormIndexDatum.
3969  */
3970  Datum l_index_values[INDEX_MAX_KEYS];
3971  bool l_index_isnull[INDEX_MAX_KEYS];
3972  Datum r_index_values[INDEX_MAX_KEYS];
3973  bool r_index_isnull[INDEX_MAX_KEYS];
3974  TupleTableSlot *ecxt_scantuple;
3975 
3976  /* Reset context each time to prevent memory leakage */
3977  ResetPerTupleExprContext(state->estate);
3978 
3979  ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple;
3980 
3981  ExecStoreHeapTuple(ltup, ecxt_scantuple, false);
3982  FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
3983  l_index_values, l_index_isnull);
3984 
3985  ExecStoreHeapTuple(rtup, ecxt_scantuple, false);
3986  FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
3987  r_index_values, r_index_isnull);
3988 
3989  for (; nkey < state->nKeys; nkey++, sortKey++)
3990  {
3991  compare = ApplySortComparator(l_index_values[nkey],
3992  l_index_isnull[nkey],
3993  r_index_values[nkey],
3994  r_index_isnull[nkey],
3995  sortKey);
3996  if (compare != 0)
3997  return compare;
3998  }
3999  }
4000 
4001  return 0;
4002 }
4003 
4004 static void
4005 copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup)
4006 {
4007  HeapTuple tuple = (HeapTuple) tup;
4008  Datum original;
4009  MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
4010 
4011  /* copy the tuple into sort storage */
4012  tuple = heap_copytuple(tuple);
4013  stup->tuple = (void *) tuple;
4014  USEMEM(state, GetMemoryChunkSpace(tuple));
4015 
4016  MemoryContextSwitchTo(oldcontext);
4017 
4018  /*
4019  * set up first-column key value, and potentially abbreviate, if it's a
4020  * simple column
4021  */
4022  if (state->indexInfo->ii_IndexAttrNumbers[0] == 0)
4023  return;
4024 
4025  original = heap_getattr(tuple,
4026  state->indexInfo->ii_IndexAttrNumbers[0],
4027  state->tupDesc,
4028  &stup->isnull1);
4029 
4030  if (!state->sortKeys->abbrev_converter || stup->isnull1)
4031  {
4032  /*
4033  * Store ordinary Datum representation, or NULL value. If there is a
4034  * converter it won't expect NULL values, and cost model is not
4035  * required to account for NULL, so in that case we avoid calling
4036  * converter and just set datum1 to zeroed representation (to be
4037  * consistent, and to support cheap inequality tests for NULL
4038  * abbreviated keys).
4039  */
4040  stup->datum1 = original;
4041  }
4042  else if (!consider_abort_common(state))
4043  {
4044  /* Store abbreviated key representation */
4045  stup->datum1 = state->sortKeys->abbrev_converter(original,
4046  state->sortKeys);
4047  }
4048  else
4049  {
4050  /* Abort abbreviation */
4051  int i;
4052 
4053  stup->datum1 = original;
4054 
4055  /*
4056  * Set state to be consistent with never trying abbreviation.
4057  *
4058  * Alter datum1 representation in already-copied tuples, so as to
4059  * ensure a consistent representation (current tuple was just
4060  * handled). It does not matter if some dumped tuples are already
4061  * sorted on tape, since serialized tuples lack abbreviated keys
4062  * (TSS_BUILDRUNS state prevents control reaching here in any case).
4063  */
4064  for (i = 0; i < state->memtupcount; i++)
4065  {
4066  SortTuple *mtup = &state->memtuples[i];
4067 
4068  tuple = (HeapTuple) mtup->tuple;
4069  mtup->datum1 = heap_getattr(tuple,
4070  state->indexInfo->ii_IndexAttrNumbers[0],
4071  state->tupDesc,
4072  &mtup->isnull1);
4073  }
4074  }
4075 }
4076 
4077 static void
4078 writetup_cluster(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
4079 {
4080  HeapTuple tuple = (HeapTuple) stup->tuple;
4081  unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int);
4082 
4083  /* We need to store t_self, but not other fields of HeapTupleData */
4084  LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
4085  LogicalTapeWrite(tape, &tuple->t_self, sizeof(ItemPointerData));
4086  LogicalTapeWrite(tape, tuple->t_data, tuple->t_len);
4087  if (state->randomAccess) /* need trailing length word? */
4088  LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
4089 
4090  if (!state->slabAllocatorUsed)
4091  {
4092  FREEMEM(state, GetMemoryChunkSpace(tuple));
4093  heap_freetuple(tuple);
4094  }
4095 }
4096 
4097 static void
4098 readtup_cluster(Tuplesortstate *state, SortTuple *stup,
4099  LogicalTape *tape, unsigned int tuplen)
4100 {
4101  unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int);
4102  HeapTuple tuple = (HeapTuple) readtup_alloc(state,
4103  t_len + HEAPTUPLESIZE);
4104 
4105  /* Reconstruct the HeapTupleData header */
4106  tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
4107  tuple->t_len = t_len;
4108  LogicalTapeReadExact(tape, &tuple->t_self, sizeof(ItemPointerData));
4109  /* We don't currently bother to reconstruct t_tableOid */
4110  tuple->t_tableOid = InvalidOid;
4111  /* Read in the tuple body */
4112  LogicalTapeReadExact(tape, tuple->t_data, tuple->t_len);
4113  if (state->randomAccess) /* need trailing length word? */
4114  LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen));
4115  stup->tuple = (void *) tuple;
4116  /* set up first-column key value, if it's a simple column */
4117  if (state->indexInfo->ii_IndexAttrNumbers[0] != 0)
4118  stup->datum1 = heap_getattr(tuple,
4119  state->indexInfo->ii_IndexAttrNumbers[0],
4120  state->tupDesc,
4121  &stup->isnull1);
4122 }
4123 
4124 /*
4125  * Routines specialized for IndexTuple case
4126  *
4127  * The btree and hash cases require separate comparison functions, but the
4128  * IndexTuple representation is the same so the copy/write/read support
4129  * functions can be shared.
4130  */
4131 
4132 static int
4133 comparetup_index_btree(const SortTuple *a, const SortTuple *b,
4134  Tuplesortstate *state)
4135 {
4136  /*
4137  * This is similar to comparetup_heap(), but expects index tuples. There
4138  * is also special handling for enforcing uniqueness, and special
4139  * treatment for equal keys at the end.
4140  */
4141  SortSupport sortKey = state->sortKeys;
4142  IndexTuple tuple1;
4143  IndexTuple tuple2;
4144  int keysz;
4145  TupleDesc tupDes;
4146  bool equal_hasnull = false;
4147  int nkey;
4148  int32 compare;
4149  Datum datum1,
4150  datum2;
4151  bool isnull1,
4152  isnull2;
4153 
4154 
4155  /* Compare the leading sort key */
4156  compare = ApplySortComparator(a->datum1, a->isnull1,
4157  b->datum1, b->isnull1,
4158  sortKey);
4159  if (compare != 0)
4160  return compare;
4161 
4162  /* Compare additional sort keys */
4163  tuple1 = (IndexTuple) a->tuple;
4164  tuple2 = (IndexTuple) b->tuple;
4165  keysz = state->nKeys;
4166  tupDes = RelationGetDescr(state->indexRel);
4167 
4168  if (sortKey->abbrev_converter)
4169  {
4170  datum1 = index_getattr(tuple1, 1, tupDes, &isnull1);
4171  datum2 = index_getattr(tuple2, 1, tupDes, &isnull2);
4172 
4173  compare = ApplySortAbbrevFullComparator(datum1, isnull1,
4174  datum2, isnull2,
4175  sortKey);
4176  if (compare != 0)
4177  return compare;
4178  }
4179 
4180  /* they are equal, so we only need to examine one null flag */
4181  if (a->isnull1)
4182  equal_hasnull = true;
4183 
4184  sortKey++;
4185  for (nkey = 2; nkey <= keysz; nkey++, sortKey++)
4186  {
4187  datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1);
4188  datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2);
4189 
4190  compare = ApplySortComparator(datum1, isnull1,
4191  datum2, isnull2,
4192  sortKey);
4193  if (compare != 0)
4194  return compare; /* done when we find unequal attributes */
4195 
4196  /* they are equal, so we only need to examine one null flag */
4197  if (isnull1)
4198  equal_hasnull = true;
4199  }
4200 
4201  /*
4202  * If btree has asked us to enforce uniqueness, complain if two equal
4203  * tuples are detected (unless there was at least one NULL field).
4204  *
4205  * It is sufficient to make the test here, because if two tuples are equal
4206  * they *must* get compared at some stage of the sort --- otherwise the
4207  * sort algorithm wouldn't have checked whether one must appear before the
4208  * other.
4209  */
4210  if (state->enforceUnique && !equal_hasnull)
4211  {
4212  Datum values[INDEX_MAX_KEYS];
4213  bool isnull[INDEX_MAX_KEYS];
4214  char *key_desc;
4215 
4216  /*
4217  * Some rather brain-dead implementations of qsort (such as the one in
4218  * QNX 4) will sometimes call the comparison routine to compare a
4219  * value to itself, but we always use our own implementation, which
4220  * does not.
4221  */
4222  Assert(tuple1 != tuple2);
4223 
4224  index_deform_tuple(tuple1, tupDes, values, isnull);
4225 
4226  key_desc = BuildIndexValueDescription(state->indexRel, values, isnull);
4227 
4228  ereport(ERROR,
4229  (errcode(ERRCODE_UNIQUE_VIOLATION),
4230  errmsg("could not create unique index \"%s\"",
4231  RelationGetRelationName(state->indexRel)),
4232  key_desc ? errdetail("Key %s is duplicated.", key_desc) :
4233  errdetail("Duplicate keys exist."),
4234  errtableconstraint(state->heapRel,
4235  RelationGetRelationName(state->indexRel))));
4236  }
4237 
4238  /*
4239  * If key values are equal, we sort on ItemPointer. This is required for
4240  * btree indexes, since heap TID is treated as an implicit last key
4241  * attribute in order to ensure that all keys in the index are physically
4242  * unique.
4243  */
4244  {
4245  BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
4246  BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
4247 
4248  if (blk1 != blk2)
4249  return (blk1 < blk2) ? -1 : 1;
4250  }
4251  {
4252  OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
4253  OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);
4254 
4255  if (pos1 != pos2)
4256  return (pos1 < pos2) ? -1 : 1;
4257  }
4258 
4259  /* ItemPointer values should never be equal */
4260  Assert(false);
4261 
4262  return 0;
4263 }
4264 
4265 static int
4266 comparetup_index_hash(const SortTuple *a, const SortTuple *b,
4267  Tuplesortstate *state)
4268 {
4269  Bucket bucket1;
4270  Bucket bucket2;
4271  IndexTuple tuple1;
4272  IndexTuple tuple2;
4273 
4274  /*
4275  * Fetch hash keys and mask off bits we don't want to sort by. We know
4276  * that the first column of the index tuple is the hash key.
4277  */
4278  Assert(!a->isnull1);
4279  bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1),
4280  state->max_buckets, state->high_mask,
4281  state->low_mask);
4282  Assert(!b->isnull1);
4283  bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1),
4284  state->max_buckets, state->high_mask,
4285  state->low_mask);
4286  if (bucket1 > bucket2)
4287  return 1;
4288  else if (bucket1 < bucket2)
4289  return -1;
4290 
4291  /*
4292  * If hash values are equal, we sort on ItemPointer. This does not affect
4293  * validity of the finished index, but it may be useful to have index
4294  * scans in physical order.
4295  */
4296  tuple1 = (IndexTuple) a->tuple;
4297  tuple2 = (IndexTuple) b->tuple;
4298 
4299  {
4300  BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
4301  BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
4302 
4303  if (blk1 != blk2)
4304  return (blk1 < blk2) ? -1 : 1;
4305  }
4306  {
4307  OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
4308  OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);
4309 
4310  if (pos1 != pos2)
4311  return (pos1 < pos2) ? -1 : 1;
4312  }
4313 
4314  /* ItemPointer values should never be equal */
4315  Assert(false);
4316 
4317  return 0;
4318 }
4319 
4320 static void
4321 copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup)
4322 {
4323  /* Not currently needed */
4324  elog(ERROR, "copytup_index() should not be called");
4325 }
4326 
4327 static void
4328 writetup_index(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
4329 {
4330  IndexTuple tuple = (IndexTuple) stup->tuple;
4331  unsigned int tuplen;
4332 
4333  tuplen = IndexTupleSize(tuple) + sizeof(tuplen);
4334  LogicalTapeWrite(tape, (void *) &tuplen, sizeof(tuplen));
4335  LogicalTapeWrite(tape, (void *) tuple, IndexTupleSize(tuple));
4336  if (state->randomAccess) /* need trailing length word? */
4337  LogicalTapeWrite(tape, (void *) &tuplen, sizeof(tuplen));
4338 
4339  if (!state->slabAllocatorUsed)
4340  {
4341  FREEMEM(state, GetMemoryChunkSpace(tuple));
4342  pfree(tuple);
4343  }
4344 }
4345 
4346 static void
4347 readtup_index(Tuplesortstate *state, SortTuple *stup,
4348  LogicalTape *tape, unsigned int len)
4349 {
4350  unsigned int tuplen = len - sizeof(unsigned int);
4351  IndexTuple tuple = (IndexTuple) readtup_alloc(state, tuplen);
4352 
4353  LogicalTapeReadExact(tape, tuple, tuplen);
4354  if (state->randomAccess) /* need trailing length word? */
4355  LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen));
4356  stup->tuple = (void *) tuple;
4357  /* set up first-column key value */
4358  stup->datum1 = index_getattr(tuple,
4359  1,
4360  RelationGetDescr(state->indexRel),
4361  &stup->isnull1);
4362 }
4363 
4364 /*
4365  * Routines specialized for DatumTuple case
4366  */
4367 
4368 static int
4369 comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
4370 {
4371  int compare;
4372 
4373  compare = ApplySortComparator(a->datum1, a->isnull1,
4374  b->datum1, b->isnull1,
4375  state->sortKeys);
4376  if (compare != 0)
4377  return compare;
4378 
4379  /* if we have abbreviations, then "tuple" has the original value */
4380 
4381  if (state->sortKeys->abbrev_converter)
4382  compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1,
4383  PointerGetDatum(b->tuple), b->isnull1,
4384  state->sortKeys);
4385 
4386  return compare;
4387 }
4388 
4389 static void
4390 copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup)
4391 {
4392  /* Not currently needed */
4393  elog(ERROR, "copytup_datum() should not be called");
4394 }
4395 
4396 static void
4397 writetup_datum(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
4398 {
4399  void *waddr;
4400  unsigned int tuplen;
4401  unsigned int writtenlen;
4402 
4403  if (stup->isnull1)
4404  {
4405  waddr = NULL;
4406  tuplen = 0;
4407  }
4408  else if (!state->tuples)
4409  {
4410  waddr = &stup->datum1;
4411  tuplen = sizeof(Datum);
4412  }
4413  else
4414  {
4415  waddr = stup->tuple;
4416  tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, state->datumTypeLen);
4417  Assert(tuplen != 0);
4418  }
4419 
4420  writtenlen = tuplen + sizeof(unsigned int);
4421 
4422  LogicalTapeWrite(tape, (void *) &writtenlen, sizeof(writtenlen));
4423  LogicalTapeWrite(tape, waddr, tuplen);
4424  if (state->randomAccess) /* need trailing length word? */
4425  LogicalTapeWrite(tape, (void *) &writtenlen, sizeof(writtenlen));
4426 
4427  if (!state->slabAllocatorUsed && stup->tuple)
4428  {
4429  FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
4430  pfree(stup->tuple);
4431  }
4432 }
4433 
4434 static void
4435 readtup_datum(Tuplesortstate *state, SortTuple *stup,
4436  LogicalTape *tape, unsigned int len)
4437 {
4438  unsigned int tuplen = len - sizeof(unsigned int);
4439 
4440  if (tuplen == 0)
4441  {
4442  /* it's NULL */
4443  stup->datum1 = (Datum) 0;
4444  stup->isnull1 = true;
4445  stup->tuple = NULL;
4446  }
4447  else if (!state->tuples)
4448  {
4449  Assert(tuplen == sizeof(Datum));
4450  LogicalTapeReadExact(tape, &stup->datum1, tuplen);
4451  stup->isnull1 = false;
4452  stup->tuple = NULL;
4453  }
4454  else
4455  {
4456  void *raddr = readtup_alloc(state, tuplen);
4457 
4458  LogicalTapeReadExact(tape, raddr, tuplen);
4459  stup->datum1 = PointerGetDatum(raddr);
4460  stup->isnull1 = false;
4461  stup->tuple = raddr;
4462  }
4463 
4464  if (state->randomAccess) /* need trailing length word? */
4465  LogicalTapeReadExact(tape, &tuplen, sizeof(tuplen));
4466 }
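/*
 * Summary added for illustration: writetup_datum()/readtup_datum() store a
 * NULL as a bare length word (tuplen == 0), a pass-by-value datum as
 * sizeof(Datum) raw bytes, and a pass-by-reference datum as the
 * datumGetSize() bytes it points to, in every case preceded by the total
 * length word and, when randomAccess is set, followed by a copy of it.
 */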
4467 
4468 /*
4469  * Parallel sort routines
4470  */
4471 
4472 /*
4473  * tuplesort_estimate_shared - estimate required shared memory allocation
4474  *
4475  * nWorkers is an estimate of the number of workers (it's the number that
4476  * will be requested).
4477  */
4478 Size
4479 tuplesort_estimate_shared(int nWorkers)
4480 {
4481  Size tapesSize;
4482 
4483  Assert(nWorkers > 0);
4484 
4485  /* Make sure that BufFile shared state is MAXALIGN'd */
4486  tapesSize = mul_size(sizeof(TapeShare), nWorkers);
4487  tapesSize = MAXALIGN(add_size(tapesSize, offsetof(Sharedsort, tapes)));
4488 
4489  return tapesSize;
4490 }
4491 
4492 /*
4493  * tuplesort_initialize_shared - initialize shared tuplesort state
4494  *
4495  * Must be called from leader process before workers are launched, to
4496  * establish state needed up-front for worker tuplesortstates. nWorkers
4497  * should match the argument passed to tuplesort_estimate_shared().
4498  */
4499 void
4500 tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg)
4501 {
4502  int i;
4503 
4504  Assert(nWorkers > 0);
4505 
4506  SpinLockInit(&shared->mutex);
4507  shared->currentWorker = 0;
4508  shared->workersFinished = 0;
4509  SharedFileSetInit(&shared->fileset, seg);
4510  shared->nTapes = nWorkers;
4511  for (i = 0; i < nWorkers; i++)
4512  {
4513  shared->tapes[i].firstblocknumber = 0L;
4514  }
4515 }
4516 
4517 /*
4518  * tuplesort_attach_shared - attach to shared tuplesort state
4519  *
4520  * Must be called by all worker processes.
4521  */
4522 void
4523 tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg)
4524 {
4525  /* Attach to SharedFileSet */
4526  SharedFileSetAttach(&shared->fileset, seg);
4527 }
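/*
 * Illustrative call sequence for the three shared-state routines above (a
 * sketch only; the "pcxt"/"seg"/"toc" names follow the pattern of callers
 * such as nbtsort.c and are not defined in this file):
 *
 *     Size        estsort = tuplesort_estimate_shared(nworkers);
 *
 *     shm_toc_estimate_chunk(&pcxt->estimator, estsort);   // leader, while sizing the DSM
 *     ...
 *     sharedsort = shm_toc_allocate(pcxt->toc, estsort);   // leader, after DSM creation
 *     tuplesort_initialize_shared(sharedsort, nworkers, pcxt->seg);
 *
 *     tuplesort_attach_shared(sharedsort, seg);            // each worker, at startup
 */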
4528 
4529 /*
4530  * worker_get_identifier - Assign and return ordinal identifier for worker
4531  *
4532  * The order in which these are assigned is not well defined, and should not
4533  * matter; worker numbers across parallel sort participants need only be
4534  * distinct and gapless. logtape.c requires this.
4535  *
4536  * Note that the identifiers assigned from here have no relation to
4537  * ParallelWorkerNumber, to avoid making any assumption about
4538  * caller's requirements. However, we do follow the ParallelWorkerNumber
4539  * convention of representing a non-worker with worker number -1. This
4540  * includes the leader, as well as serial Tuplesort processes.
4541  */
4542 static int
4543 worker_get_identifier(Tuplesortstate *state)
4544 {
4545  Sharedsort *shared = state->shared;
4546  int worker;
4547 
4548  Assert(WORKER(state));
4549 
4550  SpinLockAcquire(&shared->mutex);
4551  worker = shared->currentWorker++;
4552  SpinLockRelease(&shared->mutex);
4553 
4554  return worker;
4555 }
4556 
4557 /*
4558  * worker_freeze_result_tape - freeze worker's result tape for leader
4559  *
4560  * This is called by workers just after the result tape has been determined,
4561  * instead of calling LogicalTapeFreeze() directly. They do so because
4562  * workers require a few additional steps over similar serial
4563  * TSS_SORTEDONTAPE external sort cases, which also happen here. The extra
4564  * steps are around freeing now unneeded resources, and representing to
4565  * leader that worker's input run is available for its merge.
4566  *
4567  * There should only be one final output run for each worker, which consists
4568  * of all tuples that were originally input into worker.
4569  */
4570 static void
4571 worker_freeze_result_tape(Tuplesortstate *state)
4572 {
4573  Sharedsort *shared = state->shared;
4574  TapeShare output;
4575 
4576  Assert(WORKER(state));
4577  Assert(state->result_tape != NULL);
4578  Assert(state->memtupcount == 0);
4579 
4580  /*
4581  * Free most remaining memory, in case caller is sensitive to our holding
4582  * on to it. memtuples may not be a tiny merge heap at this point.
4583  */
4584  pfree(state->memtuples);
4585  /* Be tidy */
4586  state->memtuples = NULL;
4587  state->memtupsize = 0;
4588 
4589  /*
4590  * Parallel worker requires result tape metadata, which is to be stored in
4591  * shared memory for leader
4592  */
4593  LogicalTapeFreeze(state->result_tape, &output);
4594 
4595  /* Store properties of output tape, and update finished worker count */
4596  SpinLockAcquire(&shared->mutex);
4597  shared->tapes[state->worker] = output;
4598  shared->workersFinished++;
4599  SpinLockRelease(&shared->mutex);
4600 }
4601 
4602 /*
4603  * worker_nomergeruns - dump memtuples in worker, without merging
4604  *
4605  * This is called as an alternative to mergeruns() with a worker when no
4606  * merging is required.
4607  */
4608 static void
4609 worker_nomergeruns(Tuplesortstate *state)
4610 {
4611  Assert(WORKER(state));
4612  Assert(state->result_tape == NULL);
4613  Assert(state->nOutputRuns == 1);
4614 
4615  state->result_tape = state->destTape;
4616  worker_freeze_result_tape(state);
4617 }
4618 
4619 /*
4620  * leader_takeover_tapes - create tapeset for leader from worker tapes
4621  *
4622  * So far, leader Tuplesortstate has performed no actual sorting. By now, all
4623  * sorting has occurred in workers, all of which must have already returned
4624  * from tuplesort_performsort().
4625  *
4626  * When this returns, leader process is left in a state that is virtually
4627  * indistinguishable from it having generated runs as a serial external sort
4628  * might have.
4629  */
4630 static void
4631 leader_takeover_tapes(Tuplesortstate *state)
4632 {
4633  Sharedsort *shared = state->shared;
4634  int nParticipants = state->nParticipants;
4635  int workersFinished;
4636  int j;
4637 
4638  Assert(LEADER(state));
4639  Assert(nParticipants >= 1);
4640 
4641  SpinLockAcquire(&shared->mutex);
4642  workersFinished = shared->workersFinished;
4643  SpinLockRelease(&shared->mutex);
4644 
4645  if (nParticipants != workersFinished)
4646  elog(ERROR, "cannot take over tapes before all workers finish");
4647 
4648  /*
4649  * Create the tapeset from worker tapes, including a leader-owned tape at
4650  * the end. Parallel workers are far more expensive than logical tapes,
4651  * so the number of tapes allocated here should never be excessive.
4652  */
4653  inittapestate(state, nParticipants);
4654  state->tapeset = LogicalTapeSetCreate(false, &shared->fileset, -1);
4655 
4656  /*
4657  * Set currentRun to reflect the number of runs we will merge (it's not
4658  * used for anything, this is just pro forma)
4659  */
4660  state->currentRun = nParticipants;
4661 
4662  /*
4663  * Initialize the state to look the same as after building the initial
4664  * runs.
4665  *
4666  * There will always be exactly 1 run per worker, and exactly one input
4667  * tape per run, because workers always output exactly 1 run, even when
4668  * there were no input tuples for workers to sort.
4669  */
4670  state->inputTapes = NULL;
4671  state->nInputTapes = 0;
4672  state->nInputRuns = 0;
4673 
4674  state->outputTapes = palloc0(nParticipants * sizeof(LogicalTape *));
4675  state->nOutputTapes = nParticipants;
4676  state->nOutputRuns = nParticipants;
4677 
4678  for (j = 0; j < nParticipants; j++)
4679  {
4680  state->outputTapes[j] = LogicalTapeImport(state->tapeset, j, &shared->tapes[j]);
4681  }
4682 
4683  state->status = TSS_BUILDRUNS;
4684 }
4685 
4686 /*
4687  * Convenience routine to free a tuple previously loaded into sort memory
4688  */
4689 static void
4690 free_sort_tuple(Tuplesortstate *state, SortTuple *stup)
4691 {
4692  if (stup->tuple)
4693  {
4694  FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
4695  pfree(stup->tuple);
4696  stup->tuple = NULL;
4697  }
4698 }