tuplesort.c
1 /*-------------------------------------------------------------------------
2  *
3  * tuplesort.c
4  * Generalized tuple sorting routines.
5  *
6  * This module provides a generalized facility for tuple sorting, which can be
7  * applied to different kinds of sortable objects. Implementation of
8  * the particular sorting variants is given in tuplesortvariants.c.
9  * This module works efficiently for both small and large amounts
10  * of data. Small amounts are sorted in-memory using qsort(). Large
11  * amounts are sorted using temporary files and a standard external sort
12  * algorithm.
13  *
14  * See Knuth, volume 3, for more than you want to know about external
15  * sorting algorithms. The algorithm we use is a balanced k-way merge.
16  * Before PostgreSQL 15, we used the polyphase merge algorithm (Knuth's
17  * Algorithm 5.4.2D), but with modern hardware, a straightforward balanced
18  * merge is better. Knuth is assuming that tape drives are expensive
19  * beasts, and in particular that there will always be many more runs than
20  * tape drives. The polyphase merge algorithm was good at keeping all the
21  * tape drives busy, but in our implementation a "tape drive" doesn't cost
22  * much more than a few Kb of memory buffers, so we can afford to have
23  * lots of them. In particular, if we can have as many tape drives as
24  * sorted runs, we can eliminate any repeated I/O at all.
25  *
26  * Historically, we divided the input into sorted runs using replacement
27  * selection, in the form of a priority tree implemented as a heap
28  * (essentially Knuth's Algorithm 5.2.3H), but now we always use quicksort
29  * for run generation.
30  *
31  * The approximate amount of memory allowed for any one sort operation
32  * is specified in kilobytes by the caller (most pass work_mem). Initially,
33  * we absorb tuples and simply store them in an unsorted array as long as
34  * we haven't exceeded workMem. If we reach the end of the input without
35  * exceeding workMem, we sort the array using qsort() and subsequently return
36  * tuples just by scanning the tuple array sequentially. If we do exceed
37  * workMem, we begin to emit tuples into sorted runs in temporary tapes.
38  * When tuples are dumped in batch after quicksorting, we begin a new run
39  * with a new output tape. If we reach the max number of tapes, we write
40  * subsequent runs on the existing tapes in a round-robin fashion. We will
41  * need multiple merge passes to finish the merge in that case. After the
42  * end of the input is reached, we dump out remaining tuples in memory into
43  * a final run, then merge the runs.
44  *
45  * When merging runs, we use a heap containing just the frontmost tuple from
46  * each source run; we repeatedly output the smallest tuple and replace it
47  * with the next tuple from its source tape (if any). When the heap empties,
48  * the merge is complete. The basic merge algorithm thus needs very little
49  * memory --- only M tuples for an M-way merge, and M is constrained to a
50  * small number. However, we can still make good use of our full workMem
51  * allocation by pre-reading additional blocks from each source tape. Without
52  * prereading, our access pattern to the temporary file would be very erratic;
53  * on average we'd read one block from each of M source tapes during the same
54  * time that we're writing M blocks to the output tape, so there is no
55  * sequentiality of access at all, defeating the read-ahead methods used by
56  * most Unix kernels. Worse, the output tape gets written into a very random
57  * sequence of blocks of the temp file, ensuring that things will be even
58  * worse when it comes time to read that tape. A straightforward merge pass
59  * thus ends up doing a lot of waiting for disk seeks. We can improve matters
60  * by prereading from each source tape sequentially, loading about workMem/M
61  * bytes from each tape in turn, and making the sequential blocks immediately
62  * available for reuse. This approach helps to localize both read and write
63  * accesses. The pre-reading is handled by logtape.c; we just tell it how
64  * much memory to use for the buffers.
65  *
66  * In the current code we determine the number of input tapes M on the basis
67  * of workMem: we want workMem/M to be large enough that we read a fair
68  * amount of data each time we read from a tape, so as to maintain the
69  * locality of access described above. Nonetheless, with large workMem we
70  * can have many tapes. The logical "tapes" are implemented by logtape.c,
71  * which avoids space wastage by recycling disk space as soon as each block
72  * is read from its "tape".
73  *
74  * When the caller requests random access to the sort result, we form
75  * the final sorted run on a logical tape which is then "frozen", so
76  * that we can access it randomly. When the caller does not need random
77  * access, we return from tuplesort_performsort() as soon as we are down
78  * to one run per logical tape. The final merge is then performed
79  * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this
80  * saves one cycle of writing all the data out to disk and reading it in.
81  *
82  * This module supports parallel sorting. Parallel sorts involve coordination
83  * among one or more worker processes, and a leader process, each with its own
84  * tuplesort state. The leader process (or, more accurately, the
85  * Tuplesortstate associated with a leader process) creates a full tapeset
86  * consisting of worker tapes with one run to merge; a run for every
87  * worker process. This is then merged. Worker processes are guaranteed to
88  * produce exactly one output run from their partial input.
89  *
90  *
91  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
92  * Portions Copyright (c) 1994, Regents of the University of California
93  *
94  * IDENTIFICATION
95  * src/backend/utils/sort/tuplesort.c
96  *
97  *-------------------------------------------------------------------------
98  */
99 
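To put rough numbers on the prereading scheme described above (the figures are illustrative only, not taken from the code): with workMem = 64 MB and M = 16 input tapes, each tape is preread in chunks of about workMem/M = 4 MB, i.e. roughly 512 sequential 8 kB blocks at a time, instead of bouncing between tapes one block at a time.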
100 #include "postgres.h"
101 
102 #include <limits.h>
103 
104 #include "commands/tablespace.h"
105 #include "miscadmin.h"
106 #include "pg_trace.h"
107 #include "storage/shmem.h"
108 #include "utils/memutils.h"
109 #include "utils/pg_rusage.h"
110 #include "utils/tuplesort.h"
111 
112 /*
113  * Initial size of memtuples array. We're trying to select this size so that
114  * the array doesn't exceed ALLOCSET_SEPARATE_THRESHOLD and so that the overhead of
115  * allocation might possibly be lowered. However, we don't consider array sizes
116  * less than 1024.
117  *
118  */
119 #define INITIAL_MEMTUPSIZE Max(1024, \
120  ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1)
121 
122 /* GUC variables */
123 #ifdef TRACE_SORT
124 bool trace_sort = false;
125 #endif
126 
127 #ifdef DEBUG_BOUNDED_SORT
128 bool optimize_bounded_sort = true;
129 #endif
130 
131 
132 /*
133  * During merge, we use a pre-allocated set of fixed-size slots to hold
134  * tuples, to avoid palloc/pfree overhead.
135  *
136  * Merge doesn't require a lot of memory, so we can afford to waste some,
137  * by using gratuitously-sized slots. If a tuple is larger than 1 kB, the
138  * palloc() overhead is not significant anymore.
139  *
140  * 'nextfree' is valid when this chunk is in the free list. When in use, the
141  * slot holds a tuple.
142  */
143 #define SLAB_SLOT_SIZE 1024
144 
145 typedef union SlabSlot
146 {
147  union SlabSlot *nextfree;
148  char buffer[SLAB_SLOT_SIZE];
149 } SlabSlot;
150
151 /*
152  * Possible states of a Tuplesort object. These denote the states that
153  * persist between calls of Tuplesort routines.
154  */
155 typedef enum
156 {
157  TSS_INITIAL, /* Loading tuples; still within memory limit */
158  TSS_BOUNDED, /* Loading tuples into bounded-size heap */
159  TSS_BUILDRUNS, /* Loading tuples; writing to tape */
160  TSS_SORTEDINMEM, /* Sort completed entirely in memory */
161  TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */
162  TSS_FINALMERGE, /* Performing final merge on-the-fly */
163 } TupSortStatus;
164 
165 /*
166  * Parameters for calculation of number of tapes to use --- see inittapes()
167  * and tuplesort_merge_order().
168  *
169  * In this calculation we assume that each tape will cost us about 1 block's
170  * worth of buffer space. This ignores the overhead of all the other data
171  * structures needed for each tape, but it's probably close enough.
172  *
173  * MERGE_BUFFER_SIZE is how much buffer space we'd like to allocate for each
174  * input tape, for pre-reading (see discussion at top of file). This is *in
175  * addition to* the 1 block already included in TAPE_BUFFER_OVERHEAD.
176  */
177 #define MINORDER 6 /* minimum merge order */
178 #define MAXORDER 500 /* maximum merge order */
179 #define TAPE_BUFFER_OVERHEAD BLCKSZ
180 #define MERGE_BUFFER_SIZE (BLCKSZ * 32)
181 
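As a sketch of how these constants might combine (this mirrors the reasoning above but is not necessarily the exact code of tuplesort_merge_order(); the function name below is hypothetical): each input tape costs roughly MERGE_BUFFER_SIZE of preread space plus TAPE_BUFFER_OVERHEAD, the output tape costs one more TAPE_BUFFER_OVERHEAD, and the result is clamped to the MINORDER..MAXORDER range.

static int
example_merge_order(int64 allowedMem)
{
    int         mOrder;

    /* budget one output-tape buffer, then divide by the per-input-tape cost */
    mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) /
        (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD);

    /* never merge fewer than MINORDER or more than MAXORDER runs at once */
    mOrder = Max(mOrder, MINORDER);
    mOrder = Min(mOrder, MAXORDER);

    return mOrder;
}

With 64 MB of workMem and the defaults above (BLCKSZ = 8 kB, so a per-tape cost of about 264 kB), this sketch yields a merge order of roughly 250 tapes.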
182 
183 /*
184  * Private state of a Tuplesort operation.
185  */
186 struct Tuplesortstate
187 {
188  TuplesortPublic base;
189  TupSortStatus status; /* enumerated value as shown above */
190  bool bounded; /* did caller specify a maximum number of
191  * tuples to return? */
192  bool boundUsed; /* true if we made use of a bounded heap */
193  int bound; /* if bounded, the maximum number of tuples */
194  int64 availMem; /* remaining memory available, in bytes */
195  int64 allowedMem; /* total memory allowed, in bytes */
196  int maxTapes; /* max number of input tapes to merge in each
197  * pass */
198  int64 maxSpace; /* maximum amount of space occupied among sort
199  * of groups, either in-memory or on-disk */
200  bool isMaxSpaceDisk; /* true when maxSpace is value for on-disk
201  * space, false when it's value for in-memory
202  * space */
203  TupSortStatus maxSpaceStatus; /* sort status when maxSpace was reached */
204  LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */
205 
206  /*
207  * This array holds the tuples now in sort memory. If we are in state
208  * INITIAL, the tuples are in no particular order; if we are in state
209  * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS
210  * and FINALMERGE, the tuples are organized in "heap" order per Algorithm
211  * H. In state SORTEDONTAPE, the array is not used.
212  */
213  SortTuple *memtuples; /* array of SortTuple structs */
214  int memtupcount; /* number of tuples currently present */
215  int memtupsize; /* allocated length of memtuples array */
216  bool growmemtuples; /* memtuples' growth still underway? */
217 
218  /*
219  * Memory for tuples is sometimes allocated using a simple slab allocator,
220  * rather than with palloc(). Currently, we switch to slab allocation
221  * when we start merging. Merging only needs to keep a small, fixed
222  * number of tuples in memory at any time, so we can avoid the
223  * palloc/pfree overhead by recycling a fixed number of fixed-size slots
224  * to hold the tuples.
225  *
226  * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE
227  * slots. The allocation is sized to have one slot per tape, plus one
228  * additional slot. We need that many slots to hold all the tuples kept
229  * in the heap during merge, plus the one we have last returned from the
230  * sort, with tuplesort_gettuple.
231  *
232  * Initially, all the slots are kept in a linked list of free slots. When
233  * a tuple is read from a tape, it is put to the next available slot, if
234  * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd
235  * instead.
236  *
237  * When we're done processing a tuple, we return the slot back to the free
238  * list, or pfree() if it was palloc'd. We know that a tuple was
239  * allocated from the slab, if its pointer value is between
240  * slabMemoryBegin and -End.
241  *
242  * When the slab allocator is used, the USEMEM/LACKMEM mechanism of
243  * tracking memory usage is not used.
244  */
245  bool slabAllocatorUsed;
246
247  char *slabMemoryBegin; /* beginning of slab memory arena */
248  char *slabMemoryEnd; /* end of slab memory arena */
249  SlabSlot *slabFreeHead; /* head of free list */
250 
251  /* Memory used for input and output tape buffers. */
252  size_t tape_buffer_mem;
253
254  /*
255  * When we return a tuple to the caller in tuplesort_gettuple_XXX, that
256  * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE
257  * modes), we remember the tuple in 'lastReturnedTuple', so that we can
258  * recycle the memory on next gettuple call.
259  */
260  void *lastReturnedTuple;
261
262  /*
263  * While building initial runs, this is the current output run number.
264  * Afterwards, it is the number of initial runs we made.
265  */
266  int currentRun;
267
268  /*
269  * Logical tapes, for merging.
270  *
271  * The initial runs are written in the output tapes. In each merge pass,
272  * the output tapes of the previous pass become the input tapes, and new
273  * output tapes are created as needed. When nInputTapes equals
274  * nInputRuns, there is only one merge pass left.
275  */
276  LogicalTape **inputTapes;
277  int nInputTapes;
278  int nInputRuns;
279
280  LogicalTape **outputTapes;
281  int nOutputTapes;
282  int nOutputRuns;
283
284  LogicalTape *destTape; /* current output tape */
285 
286  /*
287  * These variables are used after completion of sorting to keep track of
288  * the next tuple to return. (In the tape case, the tape's current read
289  * position is also critical state.)
290  */
291  LogicalTape *result_tape; /* actual tape of finished output */
292  int current; /* array index (only used if SORTEDINMEM) */
293  bool eof_reached; /* reached EOF (needed for cursors) */
294 
295  /* markpos_xxx holds marked position for mark and restore */
296  int64 markpos_block; /* tape block# (only used if SORTEDONTAPE) */
297  int markpos_offset; /* saved "current", or offset in tape block */
298  bool markpos_eof; /* saved "eof_reached" */
299 
300  /*
301  * These variables are used during parallel sorting.
302  *
303  * worker is our worker identifier. Follows the general convention that
304  * -1 value relates to a leader tuplesort, and values >= 0 worker
305  * tuplesorts. (-1 can also be a serial tuplesort.)
306  *
307  * shared is mutable shared memory state, which is used to coordinate
308  * parallel sorts.
309  *
310  * nParticipants is the number of worker Tuplesortstates known by the
311  * leader to have actually been launched, which implies that they must
312  * finish a run that the leader needs to merge. Typically includes a
313  * worker state held by the leader process itself. Set in the leader
314  * Tuplesortstate only.
315  */
316  int worker;
317  Sharedsort *shared;
318  int nParticipants;
319
320  /*
321  * Additional state for managing "abbreviated key" sortsupport routines
322  * (which currently may be used by all cases except the hash index case).
323  * Tracks the intervals at which the optimization's effectiveness is
324  * tested.
325  */
326  int64 abbrevNext; /* Tuple # at which to next check
327  * applicability */
328 
329  /*
330  * Resource snapshot for time of sort start.
331  */
332 #ifdef TRACE_SORT
333  PGRUsage ru_start;
334 #endif
335 };
336 
337 /*
338  * Private mutable state of tuplesort-parallel-operation. This is allocated
339  * in shared memory.
340  */
341 struct Sharedsort
342 {
343  /* mutex protects all fields prior to tapes */
344  slock_t mutex;
345
346  /*
347  * currentWorker generates ordinal identifier numbers for parallel sort
348  * workers. These start from 0, and are always gapless.
349  *
350  * Workers increment workersFinished to indicate having finished. If this
351  * is equal to state.nParticipants within the leader, leader is ready to
352  * merge worker runs.
353  */
354  int currentWorker;
355  int workersFinished;
356
357  /* Temporary file space */
358  SharedFileSet fileset;
359
360  /* Size of tapes flexible array */
361  int nTapes;
362 
363  /*
364  * Tapes array used by workers to report back information needed by the
365  * leader to concatenate all worker tapes into one for merging
366  */
367  TapeShare tapes[FLEXIBLE_ARRAY_MEMBER];
368 };
369 
370 /*
371  * Is the given tuple allocated from the slab memory arena?
372  */
373 #define IS_SLAB_SLOT(state, tuple) \
374  ((char *) (tuple) >= (state)->slabMemoryBegin && \
375  (char *) (tuple) < (state)->slabMemoryEnd)
376 
377 /*
378  * Return the given tuple to the slab memory free list, or free it
379  * if it was palloc'd.
380  */
381 #define RELEASE_SLAB_SLOT(state, tuple) \
382  do { \
383  SlabSlot *buf = (SlabSlot *) tuple; \
384  \
385  if (IS_SLAB_SLOT((state), buf)) \
386  { \
387  buf->nextfree = (state)->slabFreeHead; \
388  (state)->slabFreeHead = buf; \
389  } else \
390  pfree(buf); \
391  } while(0)
392 
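For illustration, here is a sketch of the allocation side that RELEASE_SLAB_SLOT undoes (the helper name is hypothetical; the file's real read-side allocation routine may differ in detail): a tuple that fits in SLAB_SLOT_SIZE is popped off the free list, anything larger falls back to an ordinary allocation in the sort context.

static void *
example_slab_alloc(Tuplesortstate *state, Size tuplen)
{
    if (tuplen <= SLAB_SLOT_SIZE && state->slabFreeHead != NULL)
    {
        SlabSlot   *slot = state->slabFreeHead;

        /* pop the first free slot off the list */
        state->slabFreeHead = slot->nextfree;
        return slot;
    }

    /* oversized tuple (or exhausted free list): fall back to palloc */
    return MemoryContextAlloc(state->base.sortcontext, tuplen);
}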
393 #define REMOVEABBREV(state,stup,count) ((*(state)->base.removeabbrev) (state, stup, count))
394 #define COMPARETUP(state,a,b) ((*(state)->base.comparetup) (a, b, state))
395 #define WRITETUP(state,tape,stup) ((*(state)->base.writetup) (state, tape, stup))
396 #define READTUP(state,stup,tape,len) ((*(state)->base.readtup) (state, stup, tape, len))
397 #define FREESTATE(state) ((state)->base.freestate ? (*(state)->base.freestate) (state) : (void) 0)
398 #define LACKMEM(state) ((state)->availMem < 0 && !(state)->slabAllocatorUsed)
399 #define USEMEM(state,amt) ((state)->availMem -= (amt))
400 #define FREEMEM(state,amt) ((state)->availMem += (amt))
401 #define SERIAL(state) ((state)->shared == NULL)
402 #define WORKER(state) ((state)->shared && (state)->worker != -1)
403 #define LEADER(state) ((state)->shared && (state)->worker == -1)
404 
405 /*
406  * NOTES about on-tape representation of tuples:
407  *
408  * We require the first "unsigned int" of a stored tuple to be the total size
409  * on-tape of the tuple, including itself (so it is never zero; an all-zero
410  * unsigned int is used to delimit runs). The remainder of the stored tuple
411  * may or may not match the in-memory representation of the tuple ---
412  * any conversion needed is the job of the writetup and readtup routines.
413  *
414  * If state->sortopt contains TUPLESORT_RANDOMACCESS, then the stored
415  * representation of the tuple must be followed by another "unsigned int" that
416  * is a copy of the length --- so the total tape space used is actually
417  * sizeof(unsigned int) more than the stored length value. This allows
418  * read-backwards. When the random access flag was not specified, the
419  * write/read routines may omit the extra length word.
420  *
421  * writetup is expected to write both length words as well as the tuple
422  * data. When readtup is called, the tape is positioned just after the
423  * front length word; readtup must read the tuple data and advance past
424  * the back length word (if present).
425  *
426  * The write/read routines can make use of the tuple description data
427  * stored in the Tuplesortstate record, if needed. They are also expected
428  * to adjust state->availMem by the amount of memory space (not tape space!)
429  * released or consumed. There is no error return from either writetup
430  * or readtup; they should ereport() on failure.
431  *
432  *
433  * NOTES about memory consumption calculations:
434  *
435  * We count space allocated for tuples against the workMem limit, plus
436  * the space used by the variable-size memtuples array. Fixed-size space
437  * is not counted; it's small enough to not be interesting.
438  *
439  * Note that we count actual space used (as shown by GetMemoryChunkSpace)
440  * rather than the originally-requested size. This is important since
441  * palloc can add substantial overhead. It's not a complete answer since
442  * we won't count any wasted space in palloc allocation blocks, but it's
443  * a lot better than what we were doing before 7.3. As of 9.6, a
444  * separate memory context is used for caller passed tuples. Resetting
445  * it at certain key increments significantly ameliorates fragmentation.
446  * readtup routines use the slab allocator (they cannot use
447  * the reset context because it gets deleted at the point that merging
448  * begins).
449  */
450 
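To make the on-tape layout concrete, here is a sketch of a writetup-style routine for a tuple kind that is just a counted blob of bytes (ExampleBlob and example_writetup are hypothetical; the real write/read routines live in tuplesortvariants.c):

typedef struct ExampleBlob
{
    unsigned int datalen;           /* number of payload bytes */
    char        data[FLEXIBLE_ARRAY_MEMBER];
} ExampleBlob;

static void
example_writetup(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup)
{
    ExampleBlob *blob = (ExampleBlob *) stup->tuple;
    /* total on-tape size, including the leading length word itself */
    unsigned int tuplen = sizeof(unsigned int) + blob->datalen;

    LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
    LogicalTapeWrite(tape, blob->data, blob->datalen);

    /* trailing length word is only needed for read-backwards support */
    if (state->base.sortopt & TUPLESORT_RANDOMACCESS)
        LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
}

The matching readtup would be handed this tuplen, read tuplen - sizeof(unsigned int) bytes of payload, and then skip the trailing length word when TUPLESORT_RANDOMACCESS is set.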
451
452 static void tuplesort_begin_batch(Tuplesortstate *state);
453 static bool consider_abort_common(Tuplesortstate *state);
454 static void inittapes(Tuplesortstate *state, bool mergeruns);
455 static void inittapestate(Tuplesortstate *state, int maxTapes);
456 static void selectnewtape(Tuplesortstate *state);
457 static void init_slab_allocator(Tuplesortstate *state, int numSlots);
458 static void mergeruns(Tuplesortstate *state);
459 static void mergeonerun(Tuplesortstate *state);
460 static void beginmerge(Tuplesortstate *state);
461 static bool mergereadnext(Tuplesortstate *state, LogicalTape *srcTape, SortTuple *stup);
462 static void dumptuples(Tuplesortstate *state, bool alltuples);
463 static void make_bounded_heap(Tuplesortstate *state);
464 static void sort_bounded_heap(Tuplesortstate *state);
465 static void tuplesort_sort_memtuples(Tuplesortstate *state);
466 static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple);
467 static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple);
468 static void tuplesort_heap_delete_top(Tuplesortstate *state);
469 static void reversedirection(Tuplesortstate *state);
470 static unsigned int getlen(LogicalTape *tape, bool eofOK);
471 static void markrunend(LogicalTape *tape);
472 static int worker_get_identifier(Tuplesortstate *state);
473 static void worker_freeze_result_tape(Tuplesortstate *state);
474 static void worker_nomergeruns(Tuplesortstate *state);
475 static void leader_takeover_tapes(Tuplesortstate *state);
476 static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup);
477 static void tuplesort_free(Tuplesortstate *state);
478 static void tuplesort_updatemax(Tuplesortstate *state);
479
480 /*
481  * Specialized comparators that we can inline into specialized sorts. The goal
482  * is to try to sort two tuples without having to follow the pointers to the
483  * comparator or the tuple.
484  *
485  * XXX: For now, there is no specialization for cases where datum1 is
486  * authoritative and we don't even need to fall back to a callback at all (that
487  * would be true for types like int4/int8/timestamp/date, but not true for
488  * abbreviations of text or multi-key sorts. There could be! Is it worth it?
489  */
490 
491 /* Used if first key's comparator is ssup_datum_unsigned_cmp */
492 static pg_attribute_always_inline int
493 qsort_tuple_unsigned_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state)
494 {
495  int compare;
496 
497  compare = ApplyUnsignedSortComparator(a->datum1, a->isnull1,
498  b->datum1, b->isnull1,
499  &state->base.sortKeys[0]);
500  if (compare != 0)
501  return compare;
502 
503  /*
504  * No need to waste effort calling the tiebreak function when there are no
505  * other keys to sort on.
506  */
507  if (state->base.onlyKey != NULL)
508  return 0;
509 
510  return state->base.comparetup_tiebreak(a, b, state);
511 }
512 
513 #if SIZEOF_DATUM >= 8
514 /* Used if first key's comparator is ssup_datum_signed_cmp */
515 static pg_attribute_always_inline int
516 qsort_tuple_signed_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state)
517 {
518  int compare;
519 
520  compare = ApplySignedSortComparator(a->datum1, a->isnull1,
521  b->datum1, b->isnull1,
522  &state->base.sortKeys[0]);
523 
524  if (compare != 0)
525  return compare;
526 
527  /*
528  * No need to waste effort calling the tiebreak function when there are no
529  * other keys to sort on.
530  */
531  if (state->base.onlyKey != NULL)
532  return 0;
533 
534  return state->base.comparetup_tiebreak(a, b, state);
535 }
536 #endif
537 
538 /* Used if first key's comparator is ssup_datum_int32_cmp */
539 static pg_attribute_always_inline int
540 qsort_tuple_int32_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state)
541 {
542  int compare;
543 
544  compare = ApplyInt32SortComparator(a->datum1, a->isnull1,
545  b->datum1, b->isnull1,
546  &state->base.sortKeys[0]);
547 
548  if (compare != 0)
549  return compare;
550 
551  /*
552  * No need to waste effort calling the tiebreak function when there are no
553  * other keys to sort on.
554  */
555  if (state->base.onlyKey != NULL)
556  return 0;
557 
558  return state->base.comparetup_tiebreak(a, b, state);
559 }
560 
561 /*
562  * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts
563  * any variant of SortTuples, using the appropriate comparetup function.
564  * qsort_ssup() is specialized for the case where the comparetup function
565  * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts
566  * and Datum sorts. qsort_tuple_{unsigned,signed,int32} are specialized for
567  * common comparison functions on pass-by-value leading datums.
568  */
569 
570 #define ST_SORT qsort_tuple_unsigned
571 #define ST_ELEMENT_TYPE SortTuple
572 #define ST_COMPARE(a, b, state) qsort_tuple_unsigned_compare(a, b, state)
573 #define ST_COMPARE_ARG_TYPE Tuplesortstate
574 #define ST_CHECK_FOR_INTERRUPTS
575 #define ST_SCOPE static
576 #define ST_DEFINE
577 #include "lib/sort_template.h"
578 
579 #if SIZEOF_DATUM >= 8
580 #define ST_SORT qsort_tuple_signed
581 #define ST_ELEMENT_TYPE SortTuple
582 #define ST_COMPARE(a, b, state) qsort_tuple_signed_compare(a, b, state)
583 #define ST_COMPARE_ARG_TYPE Tuplesortstate
584 #define ST_CHECK_FOR_INTERRUPTS
585 #define ST_SCOPE static
586 #define ST_DEFINE
587 #include "lib/sort_template.h"
588 #endif
589 
590 #define ST_SORT qsort_tuple_int32
591 #define ST_ELEMENT_TYPE SortTuple
592 #define ST_COMPARE(a, b, state) qsort_tuple_int32_compare(a, b, state)
593 #define ST_COMPARE_ARG_TYPE Tuplesortstate
594 #define ST_CHECK_FOR_INTERRUPTS
595 #define ST_SCOPE static
596 #define ST_DEFINE
597 #include "lib/sort_template.h"
598 
599 #define ST_SORT qsort_tuple
600 #define ST_ELEMENT_TYPE SortTuple
601 #define ST_COMPARE_RUNTIME_POINTER
602 #define ST_COMPARE_ARG_TYPE Tuplesortstate
603 #define ST_CHECK_FOR_INTERRUPTS
604 #define ST_SCOPE static
605 #define ST_DECLARE
606 #define ST_DEFINE
607 #include "lib/sort_template.h"
608 
609 #define ST_SORT qsort_ssup
610 #define ST_ELEMENT_TYPE SortTuple
611 #define ST_COMPARE(a, b, ssup) \
612  ApplySortComparator((a)->datum1, (a)->isnull1, \
613  (b)->datum1, (b)->isnull1, (ssup))
614 #define ST_COMPARE_ARG_TYPE SortSupportData
615 #define ST_CHECK_FOR_INTERRUPTS
616 #define ST_SCOPE static
617 #define ST_DEFINE
618 #include "lib/sort_template.h"
619 
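Each include above emits a static sort routine named by ST_SORT; when ST_COMPARE_ARG_TYPE is defined, the generated function takes the array, the element count, and the extra comparator argument. A sketch of a call site follows (the wrapper function is hypothetical; the real dispatch among these specializations happens in tuplesort_sort_memtuples()):

static void
example_sort_memtuples(Tuplesortstate *state)
{
    if (state->memtupcount > 1)
        qsort_tuple_unsigned(state->memtuples, state->memtupcount, state);
}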
620 /*
621  * tuplesort_begin_xxx
622  *
623  * Initialize for a tuple sort operation.
624  *
625  * After calling tuplesort_begin, the caller should call tuplesort_putXXX
626  * zero or more times, then call tuplesort_performsort when all the tuples
627  * have been supplied. After performsort, retrieve the tuples in sorted
628  * order by calling tuplesort_getXXX until it returns false/NULL. (If random
629  * access was requested, rescan, markpos, and restorepos can also be called.)
630  * Call tuplesort_end to terminate the operation and release memory/disk space.
631  *
632  * Each variant of tuplesort_begin has a workMem parameter specifying the
633  * maximum number of kilobytes of RAM to use before spilling data to disk.
634  * (The normal value of this parameter is work_mem, but some callers use
635  * other values.) Each variant also has a sortopt which is a bitmask of
636  * sort options. See TUPLESORT_* definitions in tuplesort.h
637  */
638 
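As an illustration of that calling sequence, here is a minimal sketch of a caller sorting int4 Datums (the wrapper function is hypothetical; the exact signatures of the begin/put/get variants are defined in tuplesort.h and tuplesortvariants.c, and the OID macros come from the catalog headers, which such a caller would need to include):

static void
example_sort_int4(Datum *values, int nvalues)
{
    Tuplesortstate *ts;

    ts = tuplesort_begin_datum(INT4OID, Int4LessOperator, InvalidOid,
                               false, work_mem, NULL, TUPLESORT_NONE);

    for (int i = 0; i < nvalues; i++)
        tuplesort_putdatum(ts, values[i], false);

    tuplesort_performsort(ts);

    for (int i = 0; i < nvalues; i++)
    {
        Datum       val;
        bool        isnull;

        if (!tuplesort_getdatum(ts, true, false, &val, &isnull, NULL))
            break;
        values[i] = val;        /* int4 is pass-by-value, safe to keep */
    }

    tuplesort_end(ts);
}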
639 Tuplesortstate *
640 tuplesort_begin_common(int workMem, SortCoordinate coordinate, int sortopt)
641 {
642  Tuplesortstate *state;
643  MemoryContext maincontext;
644  MemoryContext sortcontext;
645  MemoryContext oldcontext;
646 
647  /* See leader_takeover_tapes() remarks on random access support */
648  if (coordinate && (sortopt & TUPLESORT_RANDOMACCESS))
649  elog(ERROR, "random access disallowed under parallel sort");
650 
651  /*
652  * Memory context surviving tuplesort_reset. This memory context holds
653  * data which is useful to keep while sorting multiple similar batches.
654  */
655  maincontext = AllocSetContextCreate(CurrentMemoryContext,
656  "TupleSort main",
657  ALLOCSET_DEFAULT_SIZES);
658
659  /*
660  * Create a working memory context for one sort operation. The content of
661  * this context is deleted by tuplesort_reset.
662  */
663  sortcontext = AllocSetContextCreate(maincontext,
664  "TupleSort sort",
665  ALLOCSET_DEFAULT_SIZES);
666
667  /*
668  * Additionally a working memory context for tuples is setup in
669  * tuplesort_begin_batch.
670  */
671 
672  /*
673  * Make the Tuplesortstate within the per-sortstate context. This way, we
674  * don't need a separate pfree() operation for it at shutdown.
675  */
676  oldcontext = MemoryContextSwitchTo(maincontext);
677 
678  state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate));
679
680 #ifdef TRACE_SORT
681  if (trace_sort)
682  pg_rusage_init(&state->ru_start);
683 #endif
684 
685  state->base.sortopt = sortopt;
686  state->base.tuples = true;
687  state->abbrevNext = 10;
688 
689  /*
690  * workMem is forced to be at least 64KB, the current minimum valid value
691  * for the work_mem GUC. This is a defense against parallel sort callers
692  * that divide out memory among many workers in a way that leaves each
693  * with very little memory.
694  */
695  state->allowedMem = Max(workMem, 64) * (int64) 1024;
696  state->base.sortcontext = sortcontext;
697  state->base.maincontext = maincontext;
698 
699  /*
700  * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
701  * see comments in grow_memtuples().
702  */
703  state->memtupsize = INITIAL_MEMTUPSIZE;
704  state->memtuples = NULL;
705 
706  /*
707  * After all of the other non-parallel-related state, we setup all of the
708  * state needed for each batch.
709  */
710  tuplesort_begin_batch(state);
711
712  /*
713  * Initialize parallel-related state based on coordination information
714  * from caller
715  */
716  if (!coordinate)
717  {
718  /* Serial sort */
719  state->shared = NULL;
720  state->worker = -1;
721  state->nParticipants = -1;
722  }
723  else if (coordinate->isWorker)
724  {
725  /* Parallel worker produces exactly one final run from all input */
726  state->shared = coordinate->sharedsort;
727  state->worker = worker_get_identifier(state);
728  state->nParticipants = -1;
729  }
730  else
731  {
732  /* Parallel leader state only used for final merge */
733  state->shared = coordinate->sharedsort;
734  state->worker = -1;
735  state->nParticipants = coordinate->nParticipants;
736  Assert(state->nParticipants >= 1);
737  }
738 
739  MemoryContextSwitchTo(oldcontext);
740 
741  return state;
742 }
743 
744 /*
745  * tuplesort_begin_batch
746  *
747  * Set up, or reset, all state needed for processing a new set of tuples with this
748  * sort state. Called both from tuplesort_begin_common (the first time sorting
749  * with this sort state) and tuplesort_reset (for subsequent usages).
750  */
751 static void
752 tuplesort_begin_batch(Tuplesortstate *state)
753 {
754  MemoryContext oldcontext;
755 
756  oldcontext = MemoryContextSwitchTo(state->base.maincontext);
757 
758  /*
759  * Caller tuple (e.g. IndexTuple) memory context.
760  *
761  * A dedicated child context used exclusively for caller passed tuples
762  * eases memory management. Resetting at key points reduces
763  * fragmentation. Note that the memtuples array of SortTuples is allocated
764  * in the parent context, not this context, because there is no need to
765  * free memtuples early. For bounded sorts, tuples may be pfreed in any
766  * order, so we use a regular aset.c context so that it can make use of
767  * free'd memory. When the sort is not bounded, we make use of a
768  * generation.c context as this keeps allocations more compact with less
769  * wastage. Allocations are also slightly more CPU efficient.
770  */
771  if (state->base.sortopt & TUPLESORT_ALLOWBOUNDED)
772  state->base.tuplecontext = AllocSetContextCreate(state->base.sortcontext,
773  "Caller tuples",
774  ALLOCSET_DEFAULT_SIZES);
775  else
776  state->base.tuplecontext = GenerationContextCreate(state->base.sortcontext,
777  "Caller tuples",
778  ALLOCSET_DEFAULT_SIZES);
779
780 
781  state->status = TSS_INITIAL;
782  state->bounded = false;
783  state->boundUsed = false;
784 
785  state->availMem = state->allowedMem;
786 
787  state->tapeset = NULL;
788 
789  state->memtupcount = 0;
790 
791  /*
792  * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
793  * see comments in grow_memtuples().
794  */
795  state->growmemtuples = true;
796  state->slabAllocatorUsed = false;
797  if (state->memtuples != NULL && state->memtupsize != INITIAL_MEMTUPSIZE)
798  {
799  pfree(state->memtuples);
800  state->memtuples = NULL;
801  state->memtupsize = INITIAL_MEMTUPSIZE;
802  }
803  if (state->memtuples == NULL)
804  {
805  state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple));
806  USEMEM(state, GetMemoryChunkSpace(state->memtuples));
807  }
808 
809  /* workMem must be large enough for the minimal memtuples array */
810  if (LACKMEM(state))
811  elog(ERROR, "insufficient memory allowed for sort");
812 
813  state->currentRun = 0;
814 
815  /*
816  * Tape variables (inputTapes, outputTapes, etc.) will be initialized by
817  * inittapes(), if needed.
818  */
819 
820  state->result_tape = NULL; /* flag that result tape has not been formed */
821 
822  MemoryContextSwitchTo(oldcontext);
823 }
824 
825 /*
826  * tuplesort_set_bound
827  *
828  * Advise tuplesort that at most the first N result tuples are required.
829  *
830  * Must be called before inserting any tuples. (Actually, we could allow it
831  * as long as the sort hasn't spilled to disk, but there seems no need for
832  * delayed calls at the moment.)
833  *
834  * This is a hint only. The tuplesort may still return more tuples than
835  * requested. Parallel leader tuplesorts will always ignore the hint.
836  */
837 void
838 tuplesort_set_bound(Tuplesortstate *state, int64 bound)
839 {
840  /* Assert we're called before loading any tuples */
841  Assert(state->status == TSS_INITIAL && state->memtupcount == 0);
842  /* Assert we allow bounded sorts */
843  Assert(state->base.sortopt & TUPLESORT_ALLOWBOUNDED);
844  /* Can't set the bound twice, either */
845  Assert(!state->bounded);
846  /* Also, this shouldn't be called in a parallel worker */
847  Assert(!WORKER(state));
848 
849  /* Parallel leader allows but ignores hint */
850  if (LEADER(state))
851  return;
852 
853 #ifdef DEBUG_BOUNDED_SORT
854  /* Honor GUC setting that disables the feature (for easy testing) */
855  if (!optimize_bounded_sort)
856  return;
857 #endif
858 
859  /* We want to be able to compute bound * 2, so limit the setting */
860  if (bound > (int64) (INT_MAX / 2))
861  return;
862 
863  state->bounded = true;
864  state->bound = (int) bound;
865 
866  /*
867  * Bounded sorts are not an effective target for abbreviated key
868  * optimization. Disable by setting state to be consistent with no
869  * abbreviation support.
870  */
871  state->base.sortKeys->abbrev_converter = NULL;
872  if (state->base.sortKeys->abbrev_full_comparator)
873  state->base.sortKeys->comparator = state->base.sortKeys->abbrev_full_comparator;
874 
875  /* Not strictly necessary, but be tidy */
876  state->base.sortKeys->abbrev_abort = NULL;
877  state->base.sortKeys->abbrev_full_comparator = NULL;
878 }
879 
880 /*
881  * tuplesort_used_bound
882  *
883  * Allow callers to find out if the sort state was able to use a bound.
884  */
885 bool
886 tuplesort_used_bound(Tuplesortstate *state)
887 {
888  return state->boundUsed;
889 }
890 
891 /*
892  * tuplesort_free
893  *
894  * Internal routine for freeing resources of tuplesort.
895  */
896 static void
897 tuplesort_free(Tuplesortstate *state)
898 {
899  /* context swap probably not needed, but let's be safe */
900  MemoryContext oldcontext = MemoryContextSwitchTo(state->base.sortcontext);
901 
902 #ifdef TRACE_SORT
903  int64 spaceUsed;
904 
905  if (state->tapeset)
906  spaceUsed = LogicalTapeSetBlocks(state->tapeset);
907  else
908  spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024;
909 #endif
910 
911  /*
912  * Delete temporary "tape" files, if any.
913  *
914  * Note: want to include this in reported total cost of sort, hence need
915  * for two #ifdef TRACE_SORT sections.
916  *
917  * We don't bother to destroy the individual tapes here. They will go away
918  * with the sortcontext. (In TSS_FINALMERGE state, we have closed
919  * finished tapes already.)
920  */
921  if (state->tapeset)
922  LogicalTapeSetClose(state->tapeset);
923 
924 #ifdef TRACE_SORT
925  if (trace_sort)
926  {
927  if (state->tapeset)
928  elog(LOG, "%s of worker %d ended, %lld disk blocks used: %s",
929  SERIAL(state) ? "external sort" : "parallel external sort",
930  state->worker, (long long) spaceUsed, pg_rusage_show(&state->ru_start));
931  else
932  elog(LOG, "%s of worker %d ended, %lld KB used: %s",
933  SERIAL(state) ? "internal sort" : "unperformed parallel sort",
934  state->worker, (long long) spaceUsed, pg_rusage_show(&state->ru_start));
935  }
936 
937  TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed);
938 #else
939 
940  /*
941  * If you disabled TRACE_SORT, you can still probe sort__done, but you
942  * ain't getting space-used stats.
943  */
944  TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L);
945 #endif
946 
947  FREESTATE(state);
948  MemoryContextSwitchTo(oldcontext);
949 
950  /*
951  * Free the per-sort memory context, thereby releasing all working memory.
952  */
953  MemoryContextReset(state->base.sortcontext);
954 }
955 
956 /*
957  * tuplesort_end
958  *
959  * Release resources and clean up.
960  *
961  * NOTE: after calling this, any pointers returned by tuplesort_getXXX are
962  * pointing to garbage. Be careful not to attempt to use or free such
963  * pointers afterwards!
964  */
965 void
966 tuplesort_end(Tuplesortstate *state)
967 {
968  tuplesort_free(state);
969 
970  /*
971  * Free the main memory context, including the Tuplesortstate struct
972  * itself.
973  */
974  MemoryContextDelete(state->base.maincontext);
975 }
976 
977 /*
978  * tuplesort_updatemax
979  *
980  * Update maximum resource usage statistics.
981  */
982 static void
983 tuplesort_updatemax(Tuplesortstate *state)
984 {
985  int64 spaceUsed;
986  bool isSpaceDisk;
987 
988  /*
989  * Note: it might seem we should provide both memory and disk usage for a
990  * disk-based sort. However, the current code doesn't track memory space
991  * accurately once we have begun to return tuples to the caller (since we
992  * don't account for pfree's the caller is expected to do), so we cannot
993  * rely on availMem in a disk sort. This does not seem worth the overhead
994  * to fix. Is it worth creating an API for the memory context code to
995  * tell us how much is actually used in sortcontext?
996  */
997  if (state->tapeset)
998  {
999  isSpaceDisk = true;
1000  spaceUsed = LogicalTapeSetBlocks(state->tapeset) * BLCKSZ;
1001  }
1002  else
1003  {
1004  isSpaceDisk = false;
1005  spaceUsed = state->allowedMem - state->availMem;
1006  }
1007 
1008  /*
1009  * Sort evicts data to the disk when it wasn't able to fit that data into
1010  * main memory. This is why we assume space used on the disk to be more
1011  * important for tracking resource usage than space used in memory. Note
1012  * that the amount of space occupied by some tupleset on the disk might be
1013  * less than amount of space occupied by the same tupleset in memory due
1014  * to more compact representation.
1015  */
1016  if ((isSpaceDisk && !state->isMaxSpaceDisk) ||
1017  (isSpaceDisk == state->isMaxSpaceDisk && spaceUsed > state->maxSpace))
1018  {
1019  state->maxSpace = spaceUsed;
1020  state->isMaxSpaceDisk = isSpaceDisk;
1021  state->maxSpaceStatus = state->status;
1022  }
1023 }
1024 
1025 /*
1026  * tuplesort_reset
1027  *
1028  * Reset the tuplesort. Reset all the data in the tuplesort, but leave the
1029  * meta-information in. After tuplesort_reset, tuplesort is ready to start
1030  * a new sort. This allows avoiding recreation of tuple sort states (and
1031  * saving resources) when sorting multiple small batches.
1032  */
1033 void
1034 tuplesort_reset(Tuplesortstate *state)
1035 {
1036  tuplesort_updatemax(state);
1037  tuplesort_free(state);
1038 
1039  /*
1040  * After we've freed up per-batch memory, re-setup all of the state common
1041  * to both the first batch and any subsequent batch.
1042  */
1043  tuplesort_begin_batch(state);
1044
1045  state->lastReturnedTuple = NULL;
1046  state->slabMemoryBegin = NULL;
1047  state->slabMemoryEnd = NULL;
1048  state->slabFreeHead = NULL;
1049 }
1050 
1051 /*
1052  * Grow the memtuples[] array, if possible within our memory constraint. We
1053  * must not exceed INT_MAX tuples in memory or the caller-provided memory
1054  * limit. Return true if we were able to enlarge the array, false if not.
1055  *
1056  * Normally, at each increment we double the size of the array. When doing
1057  * that would exceed a limit, we attempt one last, smaller increase (and then
1058  * clear the growmemtuples flag so we don't try any more). That allows us to
1059  * use memory as fully as permitted; sticking to the pure doubling rule could
1060  * result in almost half going unused. Because availMem moves around with
1061  * tuple addition/removal, we need some rule to prevent making repeated small
1062  * increases in memtupsize, which would just be useless thrashing. The
1063  * growmemtuples flag accomplishes that and also prevents useless
1064  * recalculations in this function.
1065  */
1066 static bool
1067 grow_memtuples(Tuplesortstate *state)
1068 {
1069  int newmemtupsize;
1070  int memtupsize = state->memtupsize;
1071  int64 memNowUsed = state->allowedMem - state->availMem;
1072 
1073  /* Forget it if we've already maxed out memtuples, per comment above */
1074  if (!state->growmemtuples)
1075  return false;
1076 
1077  /* Select new value of memtupsize */
1078  if (memNowUsed <= state->availMem)
1079  {
1080  /*
1081  * We've used no more than half of allowedMem; double our usage,
1082  * clamping at INT_MAX tuples.
1083  */
1084  if (memtupsize < INT_MAX / 2)
1085  newmemtupsize = memtupsize * 2;
1086  else
1087  {
1088  newmemtupsize = INT_MAX;
1089  state->growmemtuples = false;
1090  }
1091  }
1092  else
1093  {
1094  /*
1095  * This will be the last increment of memtupsize. Abandon doubling
1096  * strategy and instead increase as much as we safely can.
1097  *
1098  * To stay within allowedMem, we can't increase memtupsize by more
1099  * than availMem / sizeof(SortTuple) elements. In practice, we want
1100  * to increase it by considerably less, because we need to leave some
1101  * space for the tuples to which the new array slots will refer. We
1102  * assume the new tuples will be about the same size as the tuples
1103  * we've already seen, and thus we can extrapolate from the space
1104  * consumption so far to estimate an appropriate new size for the
1105  * memtuples array. The optimal value might be higher or lower than
1106  * this estimate, but it's hard to know that in advance. We again
1107  * clamp at INT_MAX tuples.
1108  *
1109  * This calculation is safe against enlarging the array so much that
1110  * LACKMEM becomes true, because the memory currently used includes
1111  * the present array; thus, there would be enough allowedMem for the
1112  * new array elements even if no other memory were currently used.
1113  *
1114  * We do the arithmetic in float8, because otherwise the product of
1115  * memtupsize and allowedMem could overflow. Any inaccuracy in the
1116  * result should be insignificant; but even if we computed a
1117  * completely insane result, the checks below will prevent anything
1118  * really bad from happening.
1119  */
1120  double grow_ratio;
1121 
1122  grow_ratio = (double) state->allowedMem / (double) memNowUsed;
1123  if (memtupsize * grow_ratio < INT_MAX)
1124  newmemtupsize = (int) (memtupsize * grow_ratio);
1125  else
1126  newmemtupsize = INT_MAX;
1127 
1128  /* We won't make any further enlargement attempts */
1129  state->growmemtuples = false;
1130  }
1131 
1132  /* Must enlarge array by at least one element, else report failure */
1133  if (newmemtupsize <= memtupsize)
1134  goto noalloc;
1135 
1136  /*
1137  * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize. Clamp
1138  * to ensure our request won't be rejected. Note that we can easily
1139  * exhaust address space before facing this outcome. (This is presently
1140  * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but
1141  * don't rely on that at this distance.)
1142  */
1143  if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple))
1144  {
1145  newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple));
1146  state->growmemtuples = false; /* can't grow any more */
1147  }
1148 
1149  /*
1150  * We need to be sure that we do not cause LACKMEM to become true, else
1151  * the space management algorithm will go nuts. The code above should
1152  * never generate a dangerous request, but to be safe, check explicitly
1153  * that the array growth fits within availMem. (We could still cause
1154  * LACKMEM if the memory chunk overhead associated with the memtuples
1155  * array were to increase. That shouldn't happen because we chose the
1156  * initial array size large enough to ensure that palloc will be treating
1157  * both old and new arrays as separate chunks. But we'll check LACKMEM
1158  * explicitly below just in case.)
1159  */
1160  if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple)))
1161  goto noalloc;
1162 
1163  /* OK, do it */
1164  FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
1165  state->memtupsize = newmemtupsize;
1166  state->memtuples = (SortTuple *)
1167  repalloc_huge(state->memtuples,
1168  state->memtupsize * sizeof(SortTuple));
1169  USEMEM(state, GetMemoryChunkSpace(state->memtuples));
1170  if (LACKMEM(state))
1171  elog(ERROR, "unexpected out-of-memory situation in tuplesort");
1172  return true;
1173 
1174 noalloc:
1175  /* If for any reason we didn't realloc, shut off future attempts */
1176  state->growmemtuples = false;
1177  return false;
1178 }
1179 
1180 /*
1181  * Shared code for tuple and datum cases.
1182  */
1183 void
1184 puttuple_common(Tuplesortstate *state, SortTuple *tuple, bool useAbbrev, Size tuplen)
1185 {
1186  MemoryContext oldcontext = MemoryContextSwitchTo(state->base.sortcontext);
1187 
1188  Assert(!LEADER(state));
1189 
1190  /* Count the size of the out-of-line data */
1191  if (tuple->tuple != NULL)
1192  USEMEM(state, tuplen);
1193
1194  if (!useAbbrev)
1195  {
1196  /*
1197  * Leave ordinary Datum representation, or NULL value. If there is a
1198  * converter it won't expect NULL values, and cost model is not
1199  * required to account for NULL, so in that case we avoid calling
1200  * converter and just set datum1 to zeroed representation (to be
1201  * consistent, and to support cheap inequality tests for NULL
1202  * abbreviated keys).
1203  */
1204  }
1205  else if (!consider_abort_common(state))
1206  {
1207  /* Store abbreviated key representation */
1208  tuple->datum1 = state->base.sortKeys->abbrev_converter(tuple->datum1,
1209  state->base.sortKeys);
1210  }
1211  else
1212  {
1213  /*
1214  * Set state to be consistent with never trying abbreviation.
1215  *
1216  * Alter datum1 representation in already-copied tuples, so as to
1217  * ensure a consistent representation (current tuple was just
1218  * handled). It does not matter if some dumped tuples are already
1219  * sorted on tape, since serialized tuples lack abbreviated keys
1220  * (TSS_BUILDRUNS state prevents control reaching here in any case).
1221  */
1222  REMOVEABBREV(state, state->memtuples, state->memtupcount);
1223  }
1224 
1225  switch (state->status)
1226  {
1227  case TSS_INITIAL:
1228 
1229  /*
1230  * Save the tuple into the unsorted array. First, grow the array
1231  * as needed. Note that we try to grow the array when there is
1232  * still one free slot remaining --- if we fail, there'll still be
1233  * room to store the incoming tuple, and then we'll switch to
1234  * tape-based operation.
1235  */
1236  if (state->memtupcount >= state->memtupsize - 1)
1237  {
1238  (void) grow_memtuples(state);
1239  Assert(state->memtupcount < state->memtupsize);
1240  }
1241  state->memtuples[state->memtupcount++] = *tuple;
1242 
1243  /*
1244  * Check if it's time to switch over to a bounded heapsort. We do
1245  * so if the input tuple count exceeds twice the desired tuple
1246  * count (this is a heuristic for where heapsort becomes cheaper
1247  * than a quicksort), or if we've just filled workMem and have
1248  * enough tuples to meet the bound.
1249  *
1250  * Note that once we enter TSS_BOUNDED state we will always try to
1251  * complete the sort that way. In the worst case, if later input
1252  * tuples are larger than earlier ones, this might cause us to
1253  * exceed workMem significantly.
1254  */
1255  if (state->bounded &&
1256  (state->memtupcount > state->bound * 2 ||
1257  (state->memtupcount > state->bound && LACKMEM(state))))
1258  {
1259 #ifdef TRACE_SORT
1260  if (trace_sort)
1261  elog(LOG, "switching to bounded heapsort at %d tuples: %s",
1262  state->memtupcount,
1263  pg_rusage_show(&state->ru_start));
1264 #endif
1265  make_bounded_heap(state);
1266  MemoryContextSwitchTo(oldcontext);
1267  return;
1268  }
1269 
1270  /*
1271  * Done if we still fit in available memory and have array slots.
1272  */
1273  if (state->memtupcount < state->memtupsize && !LACKMEM(state))
1274  {
1275  MemoryContextSwitchTo(oldcontext);
1276  return;
1277  }
1278 
1279  /*
1280  * Nope; time to switch to tape-based operation.
1281  */
1282  inittapes(state, true);
1283 
1284  /*
1285  * Dump all tuples.
1286  */
1287  dumptuples(state, false);
1288  break;
1289 
1290  case TSS_BOUNDED:
1291 
1292  /*
1293  * We don't want to grow the array here, so check whether the new
1294  * tuple can be discarded before putting it in. This should be a
1295  * good speed optimization, too, since when there are many more
1296  * input tuples than the bound, most input tuples can be discarded
1297  * with just this one comparison. Note that because we currently
1298  * have the sort direction reversed, we must check for <= not >=.
1299  */
1300  if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0)
1301  {
1302  /* new tuple <= top of the heap, so we can discard it */
1303  free_sort_tuple(state, tuple);
1304  CHECK_FOR_INTERRUPTS();
1305  }
1306  else
1307  {
1308  /* discard top of heap, replacing it with the new tuple */
1309  free_sort_tuple(state, &state->memtuples[0]);
1310  tuplesort_heap_replace_top(state, tuple);
1311  }
1312  break;
1313 
1314  case TSS_BUILDRUNS:
1315 
1316  /*
1317  * Save the tuple into the unsorted array (there must be space)
1318  */
1319  state->memtuples[state->memtupcount++] = *tuple;
1320 
1321  /*
1322  * If we are over the memory limit, dump all tuples.
1323  */
1324  dumptuples(state, false);
1325  break;
1326 
1327  default:
1328  elog(ERROR, "invalid tuplesort state");
1329  break;
1330  }
1331  MemoryContextSwitchTo(oldcontext);
1332 }
1333 
1334 static bool
1335 consider_abort_common(Tuplesortstate *state)
1336 {
1337  Assert(state->base.sortKeys[0].abbrev_converter != NULL);
1338  Assert(state->base.sortKeys[0].abbrev_abort != NULL);
1339  Assert(state->base.sortKeys[0].abbrev_full_comparator != NULL);
1340 
1341  /*
1342  * Check effectiveness of abbreviation optimization. Consider aborting
1343  * when still within memory limit.
1344  */
1345  if (state->status == TSS_INITIAL &&
1346  state->memtupcount >= state->abbrevNext)
1347  {
1348  state->abbrevNext *= 2;
1349 
1350  /*
1351  * Check opclass-supplied abbreviation abort routine. It may indicate
1352  * that abbreviation should not proceed.
1353  */
1354  if (!state->base.sortKeys->abbrev_abort(state->memtupcount,
1355  state->base.sortKeys))
1356  return false;
1357 
1358  /*
1359  * Finally, restore authoritative comparator, and indicate that
1360  * abbreviation is not in play by setting abbrev_converter to NULL
1361  */
1362  state->base.sortKeys[0].comparator = state->base.sortKeys[0].abbrev_full_comparator;
1363  state->base.sortKeys[0].abbrev_converter = NULL;
1364  /* Not strictly necessary, but be tidy */
1365  state->base.sortKeys[0].abbrev_abort = NULL;
1366  state->base.sortKeys[0].abbrev_full_comparator = NULL;
1367 
1368  /* Give up - expect original pass-by-value representation */
1369  return true;
1370  }
1371 
1372  return false;
1373 }
1374 
1375 /*
1376  * All tuples have been provided; finish the sort.
1377  */
1378 void
1379 tuplesort_performsort(Tuplesortstate *state)
1380 {
1381  MemoryContext oldcontext = MemoryContextSwitchTo(state->base.sortcontext);
1382 
1383 #ifdef TRACE_SORT
1384  if (trace_sort)
1385  elog(LOG, "performsort of worker %d starting: %s",
1386  state->worker, pg_rusage_show(&state->ru_start));
1387 #endif
1388 
1389  switch (state->status)
1390  {
1391  case TSS_INITIAL:
1392 
1393  /*
1394  * We were able to accumulate all the tuples within the allowed
1395  * amount of memory, or we are the leader, about to take over worker tapes
1396  */
1397  if (SERIAL(state))
1398  {
1399  /* Just qsort 'em and we're done */
1400  tuplesort_sort_memtuples(state);
1401  state->status = TSS_SORTEDINMEM;
1402  }
1403  else if (WORKER(state))
1404  {
1405  /*
1406  * Parallel workers must still dump out tuples to tape. No
1407  * merge is required to produce single output run, though.
1408  */
1409  inittapes(state, false);
1410  dumptuples(state, true);
1411  worker_nomergeruns(state);
1412  state->status = TSS_SORTEDONTAPE;
1413  }
1414  else
1415  {
1416  /*
1417  * Leader will take over worker tapes and merge worker runs.
1418  * Note that mergeruns sets the correct state->status.
1419  */
1420  leader_takeover_tapes(state);
1421  mergeruns(state);
1422  }
1423  state->current = 0;
1424  state->eof_reached = false;
1425  state->markpos_block = 0L;
1426  state->markpos_offset = 0;
1427  state->markpos_eof = false;
1428  break;
1429 
1430  case TSS_BOUNDED:
1431 
1432  /*
1433  * We were able to accumulate all the tuples required for output
1434  * in memory, using a heap to eliminate excess tuples. Now we
1435  * have to transform the heap to a properly-sorted array. Note
1436  * that sort_bounded_heap sets the correct state->status.
1437  */
1438  sort_bounded_heap(state);
1439  state->current = 0;
1440  state->eof_reached = false;
1441  state->markpos_offset = 0;
1442  state->markpos_eof = false;
1443  break;
1444 
1445  case TSS_BUILDRUNS:
1446 
1447  /*
1448  * Finish tape-based sort. First, flush all tuples remaining in
1449  * memory out to tape; then merge until we have a single remaining
1450  * run (or, if !randomAccess and !WORKER(), one run per tape).
1451  * Note that mergeruns sets the correct state->status.
1452  */
1453  dumptuples(state, true);
1454  mergeruns(state);
1455  state->eof_reached = false;
1456  state->markpos_block = 0L;
1457  state->markpos_offset = 0;
1458  state->markpos_eof = false;
1459  break;
1460 
1461  default:
1462  elog(ERROR, "invalid tuplesort state");
1463  break;
1464  }
1465 
1466 #ifdef TRACE_SORT
1467  if (trace_sort)
1468  {
1469  if (state->status == TSS_FINALMERGE)
1470  elog(LOG, "performsort of worker %d done (except %d-way final merge): %s",
1471  state->worker, state->nInputTapes,
1472  pg_rusage_show(&state->ru_start));
1473  else
1474  elog(LOG, "performsort of worker %d done: %s",
1475  state->worker, pg_rusage_show(&state->ru_start));
1476  }
1477 #endif
1478 
1479  MemoryContextSwitchTo(oldcontext);
1480 }
1481 
1482 /*
1483  * Internal routine to fetch the next tuple in either forward or back
1484  * direction into *stup. Returns false if no more tuples.
1485  * Returned tuple belongs to tuplesort memory context, and must not be freed
1486  * by caller. Note that fetched tuple is stored in memory that may be
1487  * recycled by any future fetch.
1488  */
1489 bool
1490 tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
1491  SortTuple *stup)
1492 {
1493  unsigned int tuplen;
1494  size_t nmoved;
1495 
1496  Assert(!WORKER(state));
1497 
1498  switch (state->status)
1499  {
1500  case TSS_SORTEDINMEM:
1501  Assert(forward || state->base.sortopt & TUPLESORT_RANDOMACCESS);
1502  Assert(!state->slabAllocatorUsed);
1503  if (forward)
1504  {
1505  if (state->current < state->memtupcount)
1506  {
1507  *stup = state->memtuples[state->current++];
1508  return true;
1509  }
1510  state->eof_reached = true;
1511 
1512  /*
1513  * Complain if caller tries to retrieve more tuples than
1514  * originally asked for in a bounded sort. This is because
1515  * returning EOF here might be the wrong thing.
1516  */
1517  if (state->bounded && state->current >= state->bound)
1518  elog(ERROR, "retrieved too many tuples in a bounded sort");
1519 
1520  return false;
1521  }
1522  else
1523  {
1524  if (state->current <= 0)
1525  return false;
1526 
1527  /*
1528  * If all tuples have already been fetched, we return the last
1529  * tuple; otherwise, the tuple before the last one returned.
1530  */
1531  if (state->eof_reached)
1532  state->eof_reached = false;
1533  else
1534  {
1535  state->current--; /* last returned tuple */
1536  if (state->current <= 0)
1537  return false;
1538  }
1539  *stup = state->memtuples[state->current - 1];
1540  return true;
1541  }
1542  break;
1543 
1544  case TSS_SORTEDONTAPE:
1545  Assert(forward || state->base.sortopt & TUPLESORT_RANDOMACCESS);
1546  Assert(state->slabAllocatorUsed);
1547 
1548  /*
1549  * The slot that held the tuple that we returned in previous
1550  * gettuple call can now be reused.
1551  */
1552  if (state->lastReturnedTuple)
1553  {
1554  RELEASE_SLAB_SLOT(state, state->lastReturnedTuple);
1555  state->lastReturnedTuple = NULL;
1556  }
1557 
1558  if (forward)
1559  {
1560  if (state->eof_reached)
1561  return false;
1562 
1563  if ((tuplen = getlen(state->result_tape, true)) != 0)
1564  {
1565  READTUP(state, stup, state->result_tape, tuplen);
1566 
1567  /*
1568  * Remember the tuple we return, so that we can recycle
1569  * its memory on next call. (This can be NULL, in the
1570  * !state->tuples case).
1571  */
1572  state->lastReturnedTuple = stup->tuple;
1573 
1574  return true;
1575  }
1576  else
1577  {
1578  state->eof_reached = true;
1579  return false;
1580  }
1581  }
1582 
1583  /*
1584  * Backward.
1585  *
1586  * If all tuples have already been fetched, we return the last tuple;
1587  * otherwise, the tuple before the last one returned.
1588  */
1589  if (state->eof_reached)
1590  {
1591  /*
1592  * Seek position is pointing just past the zero tuplen at the
1593  * end of file; back up to fetch last tuple's ending length
1594  * word. If seek fails we must have a completely empty file.
1595  */
1596  nmoved = LogicalTapeBackspace(state->result_tape,
1597  2 * sizeof(unsigned int));
1598  if (nmoved == 0)
1599  return false;
1600  else if (nmoved != 2 * sizeof(unsigned int))
1601  elog(ERROR, "unexpected tape position");
1602  state->eof_reached = false;
1603  }
1604  else
1605  {
1606  /*
1607  * Back up and fetch previously-returned tuple's ending length
1608  * word. If seek fails, assume we are at start of file.
1609  */
1610  nmoved = LogicalTapeBackspace(state->result_tape,
1611  sizeof(unsigned int));
1612  if (nmoved == 0)
1613  return false;
1614  else if (nmoved != sizeof(unsigned int))
1615  elog(ERROR, "unexpected tape position");
1616  tuplen = getlen(state->result_tape, false);
1617 
1618  /*
1619  * Back up to get ending length word of tuple before it.
1620  */
1621  nmoved = LogicalTapeBackspace(state->result_tape,
1622  tuplen + 2 * sizeof(unsigned int));
1623  if (nmoved == tuplen + sizeof(unsigned int))
1624  {
1625  /*
1626  * We backed up over the previous tuple, but there was no
1627  * ending length word before it. That means that the prev
1628  * tuple is the first tuple in the file. It is now the
1629  * next to read in forward direction (not obviously right,
1630  * but that is what the in-memory case does).
1631  */
1632  return false;
1633  }
1634  else if (nmoved != tuplen + 2 * sizeof(unsigned int))
1635  elog(ERROR, "bogus tuple length in backward scan");
1636  }
1637 
1638  tuplen = getlen(state->result_tape, false);
1639 
1640  /*
1641  * Now we have the length of the prior tuple, back up and read it.
1642  * Note: READTUP expects we are positioned after the initial
1643  * length word of the tuple, so back up to that point.
1644  */
1645  nmoved = LogicalTapeBackspace(state->result_tape,
1646  tuplen);
1647  if (nmoved != tuplen)
1648  elog(ERROR, "bogus tuple length in backward scan");
1649  READTUP(state, stup, state->result_tape, tuplen);
1650 
1651  /*
1652  * Remember the tuple we return, so that we can recycle its memory
1653  * on next call. (This can be NULL, in the Datum case).
1654  */
1655  state->lastReturnedTuple = stup->tuple;
1656 
1657  return true;
1658 
1659  case TSS_FINALMERGE:
1660  Assert(forward);
1661  /* We are managing memory ourselves, with the slab allocator. */
1662  Assert(state->slabAllocatorUsed);
1663 
1664  /*
1665  * The slab slot holding the tuple that we returned in previous
1666  * gettuple call can now be reused.
1667  */
1668  if (state->lastReturnedTuple)
1669  {
1670  RELEASE_SLAB_SLOT(state, state->lastReturnedTuple);
1671  state->lastReturnedTuple = NULL;
1672  }
1673 
1674  /*
1675  * This code should match the inner loop of mergeonerun().
1676  */
1677  if (state->memtupcount > 0)
1678  {
1679  int srcTapeIndex = state->memtuples[0].srctape;
1680  LogicalTape *srcTape = state->inputTapes[srcTapeIndex];
1681  SortTuple newtup;
1682 
1683  *stup = state->memtuples[0];
1684 
1685  /*
1686  * Remember the tuple we return, so that we can recycle its
1687  * memory on next call. (This can be NULL, in the Datum case).
1688  */
1689  state->lastReturnedTuple = stup->tuple;
1690 
1691  /*
1692  * Pull next tuple from tape, and replace the returned tuple
1693  * at top of the heap with it.
1694  */
1695  if (!mergereadnext(state, srcTape, &newtup))
1696  {
1697  /*
1698  * If no more data, we've reached end of run on this tape.
1699  * Remove the top node from the heap.
1700  */
1701  tuplesort_heap_delete_top(state);
1702  state->nInputRuns--;
1703 
1704  /*
1705  * Close the tape. It'd go away at the end of the sort
1706  * anyway, but better to release the memory early.
1707  */
1708  LogicalTapeClose(srcTape);
1709  return true;
1710  }
1711  newtup.srctape = srcTapeIndex;
1712  tuplesort_heap_replace_top(state, &newtup);
1713  return true;
1714  }
1715  return false;
1716 
1717  default:
1718  elog(ERROR, "invalid tuplesort state");
1719  return false; /* keep compiler quiet */
1720  }
1721 }
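To make the backspace arithmetic in the TSS_SORTEDONTAPE backward scan above easier to follow, here is a rough sketch of the on-tape run layout it assumes for a random-access sort. The trailing per-tuple length words are only written when TUPLESORT_RANDOMACCESS is set (by the WRITETUP routines in tuplesortvariants.c); the layout below is an illustration, not a byte-exact specification.

    |len 1|tuple 1 body|len 1|len 2|tuple 2 body|len 2| ... |len n|tuple n body|len n| 0 |

Reading forward, getlen() consumes a leading length word and READTUP() consumes the rest. Reading backward, the code first backs up over the trailing length word(s) (two words when positioned just past the end-of-run zero, one word otherwise), reads a trailing length word to learn the previous tuple's size, and then backs up again so that READTUP() is entered positioned just after that tuple's leading length word, exactly as in the forward case.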
1722 
1723 
1724 /*
1725  * Advance over N tuples in either forward or back direction,
1726  * without returning any data. N==0 is a no-op.
1727  * Returns true if successful, false if ran out of tuples.
1728  */
1729 bool
1730 tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward)
1731 {
1732  MemoryContext oldcontext;
1733 
1734  /*
1735  * We don't actually support backwards skip yet, because no callers need
1736  * it. The API is designed to allow for that later, though.
1737  */
1738  Assert(forward);
1739  Assert(ntuples >= 0);
1740  Assert(!WORKER(state));
1741 
1742  switch (state->status)
1743  {
1744  case TSS_SORTEDINMEM:
1745  if (state->memtupcount - state->current >= ntuples)
1746  {
1747  state->current += ntuples;
1748  return true;
1749  }
1750  state->current = state->memtupcount;
1751  state->eof_reached = true;
1752 
1753  /*
1754  * Complain if caller tries to retrieve more tuples than
1755  * originally asked for in a bounded sort. This is because
1756  * returning EOF here might be the wrong thing.
1757  */
1758  if (state->bounded && state->current >= state->bound)
1759  elog(ERROR, "retrieved too many tuples in a bounded sort");
1760 
1761  return false;
1762 
1763  case TSS_SORTEDONTAPE:
1764  case TSS_FINALMERGE:
1765 
1766  /*
1767  * We could probably optimize these cases better, but for now it's
1768  * not worth the trouble.
1769  */
1770  oldcontext = MemoryContextSwitchTo(state->base.sortcontext);
1771  while (ntuples-- > 0)
1772  {
1773  SortTuple stup;
1774 
1775  if (!tuplesort_gettuple_common(state, forward, &stup))
1776  {
1777  MemoryContextSwitchTo(oldcontext);
1778  return false;
1779  }
1780  CHECK_FOR_INTERRUPTS();
1781  }
1782  MemoryContextSwitchTo(oldcontext);
1783  return true;
1784 
1785  default:
1786  elog(ERROR, "invalid tuplesort state");
1787  return false; /* keep compiler quiet */
1788  }
1789 }
1790 
1791 /*
1792  * tuplesort_merge_order - report merge order we'll use for given memory
1793  * (note: "merge order" just means the number of input tapes in the merge).
1794  *
1795  * This is exported for use by the planner. allowedMem is in bytes.
1796  */
1797 int
1798 tuplesort_merge_order(int64 allowedMem)
1799 {
1800  int mOrder;
1801 
1802  /*----------
1803  * In the merge phase, we need buffer space for each input and output tape.
1804  * Each pass in the balanced merge algorithm reads from M input tapes, and
1805  * writes to N output tapes. Each tape consumes TAPE_BUFFER_OVERHEAD bytes
1806  * of memory. In addition to that, we want MERGE_BUFFER_SIZE workspace per
1807  * input tape.
1808  *
1809  * totalMem = M * (TAPE_BUFFER_OVERHEAD + MERGE_BUFFER_SIZE) +
1810  * N * TAPE_BUFFER_OVERHEAD
1811  *
1812  * Except for the last and next-to-last merge passes, where there can be
1813  * fewer tapes left to process, M = N. We choose M so that we have the
1814  * desired amount of memory available for the input buffers
1815  * (TAPE_BUFFER_OVERHEAD + MERGE_BUFFER_SIZE), given the total memory
1816  * available for the tape buffers (allowedMem).
1817  *
1818  * Note: you might be thinking we need to account for the memtuples[]
1819  * array in this calculation, but we effectively treat that as part of the
1820  * MERGE_BUFFER_SIZE workspace.
1821  *----------
1822  */
1823  mOrder = allowedMem /
1824  (2 * TAPE_BUFFER_OVERHEAD + MERGE_BUFFER_SIZE);
1825 
1826  /*
1827  * Even in minimum memory, use at least a MINORDER merge. On the other
1828  * hand, even when we have lots of memory, do not use more than a MAXORDER
1829  * merge. Tapes are pretty cheap, but they're not entirely free. Each
1830  * additional tape reduces the amount of memory available to build runs,
1831  * which in turn can cause the same sort to need more runs, which makes
1832  * merging slower even if it can still be done in a single pass. Also,
1833  * high order merges are quite slow due to CPU cache effects; it can be
1834  * faster to pay the I/O cost of a multi-pass merge than to perform a
1835  * single merge pass across many hundreds of tapes.
1836  */
1837  mOrder = Max(mOrder, MINORDER);
1838  mOrder = Min(mOrder, MAXORDER);
1839 
1840  return mOrder;
1841 }
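As a worked example of the formula above, assume the usual defaults defined near the top of this file: TAPE_BUFFER_OVERHEAD = BLCKSZ = 8 kB and MERGE_BUFFER_SIZE = 256 kB, so with M = N each merge-order unit costs roughly 2 * 8 kB + 256 kB = 272 kB (these constants can differ between builds; the numbers are illustrative only):

    allowedMem =  1 MB:   1024 kB / 272 kB =    3, clamped up to MINORDER (6)
    allowedMem =  4 MB:   4096 kB / 272 kB =   15 input tapes
    allowedMem = 64 MB:  65536 kB / 272 kB =  240 input tapes
    allowedMem =  1 GB:  ~3855, clamped down to MAXORDER (500)

The merge order therefore scales linearly with allowedMem between the MINORDER and MAXORDER clamps.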
1842 
1843 /*
1844  * Helper function to calculate how much memory to allocate for the read buffer
1845  * of each input tape in a merge pass.
1846  *
1847  * 'avail_mem' is the amount of memory available for the buffers of all the
1848  * tapes, both input and output.
1849  * 'nInputTapes' and 'nInputRuns' are the number of input tapes and runs.
1850  * 'maxOutputTapes' is the max. number of output tapes we should produce.
1851  */
1852 static int64
1853 merge_read_buffer_size(int64 avail_mem, int nInputTapes, int nInputRuns,
1854  int maxOutputTapes)
1855 {
1856  int nOutputRuns;
1857  int nOutputTapes;
1858 
1859  /*
1860  * How many output tapes will we produce in this pass?
1861  *
1862  * This is nInputRuns / nInputTapes, rounded up.
1863  */
1864  nOutputRuns = (nInputRuns + nInputTapes - 1) / nInputTapes;
1865 
1866  nOutputTapes = Min(nOutputRuns, maxOutputTapes);
1867 
1868  /*
1869  * Each output tape consumes TAPE_BUFFER_OVERHEAD bytes of memory. All
1870  * remaining memory is divided evenly between the input tapes.
1871  *
1872  * This also follows from the formula in tuplesort_merge_order, but here
1873  * we derive the input buffer size from the amount of memory available,
1874  * and M and N.
1875  */
1876  return Max((avail_mem - TAPE_BUFFER_OVERHEAD * nOutputTapes) / nInputTapes, 0);
1877 }
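A small worked example of this division, again assuming TAPE_BUFFER_OVERHEAD = 8 kB: with avail_mem = 8 MB, nInputTapes = 6, nInputRuns = 9, and maxOutputTapes = 6, we get nOutputRuns = ceil(9 / 6) = 2 and hence nOutputTapes = 2. The output tapes reserve 2 * 8 kB, and the remaining 8176 kB are divided evenly, giving each of the 6 input tapes a read buffer of roughly 1.3 MB.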
1878 
1879 /*
1880  * inittapes - initialize for tape sorting.
1881  *
1882  * This is called only if we have found we won't sort in memory.
1883  */
1884 static void
1885 inittapes(Tuplesortstate *state, bool mergeruns)
1886 {
1887  Assert(!LEADER(state));
1888 
1889  if (mergeruns)
1890  {
1891  /* Compute number of input tapes to use when merging */
1892  state->maxTapes = tuplesort_merge_order(state->allowedMem);
1893  }
1894  else
1895  {
1896  /* Workers can sometimes produce a single run, output without merge */
1897  Assert(WORKER(state));
1898  state->maxTapes = MINORDER;
1899  }
1900 
1901 #ifdef TRACE_SORT
1902  if (trace_sort)
1903  elog(LOG, "worker %d switching to external sort with %d tapes: %s",
1904  state->worker, state->maxTapes, pg_rusage_show(&state->ru_start));
1905 #endif
1906 
1907  /* Create the tape set */
1908  inittapestate(state, state->maxTapes);
1909  state->tapeset =
1910  LogicalTapeSetCreate(false,
1911  state->shared ? &state->shared->fileset : NULL,
1912  state->worker);
1913 
1914  state->currentRun = 0;
1915 
1916  /*
1917  * Initialize logical tape arrays.
1918  */
1919  state->inputTapes = NULL;
1920  state->nInputTapes = 0;
1921  state->nInputRuns = 0;
1922 
1923  state->outputTapes = palloc0(state->maxTapes * sizeof(LogicalTape *));
1924  state->nOutputTapes = 0;
1925  state->nOutputRuns = 0;
1926 
1927  state->status = TSS_BUILDRUNS;
1928 
1929  selectnewtape(state);
1930 }
1931 
1932 /*
1933  * inittapestate - initialize generic tape management state
1934  */
1935 static void
1936 inittapestate(Tuplesortstate *state, int maxTapes)
1937 {
1938  int64 tapeSpace;
1939 
1940  /*
1941  * Decrease availMem to reflect the space needed for tape buffers; but
1942  * don't decrease it to the point that we have no room for tuples. (That
1943  * case is only likely to occur if sorting pass-by-value Datums; in all
1944  * other scenarios the memtuples[] array is unlikely to occupy more than
1945  * half of allowedMem. In the pass-by-value case it's not important to
1946  * account for tuple space, so we don't care if LACKMEM becomes
1947  * inaccurate.)
1948  */
1949  tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD;
1950 
1951  if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem)
1952  USEMEM(state, tapeSpace);
1953 
1954  /*
1955  * Make sure that the temp file(s) underlying the tape set are created in
1956  * suitable temp tablespaces. For parallel sorts, this should have been
1957  * called already, but it doesn't matter if it is called a second time.
1958  */
1959  PrepareTempTablespaces();
1960 }
1961 
1962 /*
1963  * selectnewtape -- select next tape to output to.
1964  *
1965  * This is called after finishing a run when we know another run
1966  * must be started. This is used both when building the initial
1967  * runs, and during merge passes.
1968  */
1969 static void
1970 selectnewtape(Tuplesortstate *state)
1971 {
1972  /*
1973  * At the beginning of each merge pass, nOutputTapes and nOutputRuns are
1974  * both zero. On each call, we create a new output tape to hold the next
1975  * run, until maxTapes is reached. After that, we assign new runs to the
1976  * existing tapes in a round robin fashion.
1977  */
1978  if (state->nOutputTapes < state->maxTapes)
1979  {
1980  /* Create a new tape to hold the next run */
1981  Assert(state->outputTapes[state->nOutputRuns] == NULL);
1982  Assert(state->nOutputRuns == state->nOutputTapes);
1983  state->destTape = LogicalTapeCreate(state->tapeset);
1984  state->outputTapes[state->nOutputTapes] = state->destTape;
1985  state->nOutputTapes++;
1986  state->nOutputRuns++;
1987  }
1988  else
1989  {
1990  /*
1991  * We have reached the max number of tapes. Append to an existing
1992  * tape.
1993  */
1994  state->destTape = state->outputTapes[state->nOutputRuns % state->nOutputTapes];
1995  state->nOutputRuns++;
1996  }
1997 }
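For example, with maxTapes = 6 the first six calls each create a fresh tape, so runs 1..6 land on outputTapes[0..5]. On the seventh call nOutputTapes has reached maxTapes and nOutputRuns is 6, so run 7 is appended to outputTapes[6 % 6] = outputTapes[0], run 8 to outputTapes[1], and so on, round robin.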
1998 
1999 /*
2000  * Initialize the slab allocation arena, for the given number of slots.
2001  */
2002 static void
2003 init_slab_allocator(Tuplesortstate *state, int numSlots)
2004 {
2005  if (numSlots > 0)
2006  {
2007  char *p;
2008  int i;
2009 
2010  state->slabMemoryBegin = palloc(numSlots * SLAB_SLOT_SIZE);
2011  state->slabMemoryEnd = state->slabMemoryBegin +
2012  numSlots * SLAB_SLOT_SIZE;
2013  state->slabFreeHead = (SlabSlot *) state->slabMemoryBegin;
2014  USEMEM(state, numSlots * SLAB_SLOT_SIZE);
2015 
2016  p = state->slabMemoryBegin;
2017  for (i = 0; i < numSlots - 1; i++)
2018  {
2019  ((SlabSlot *) p)->nextfree = (SlabSlot *) (p + SLAB_SLOT_SIZE);
2020  p += SLAB_SLOT_SIZE;
2021  }
2022  ((SlabSlot *) p)->nextfree = NULL;
2023  }
2024  else
2025  {
2026  state->slabMemoryBegin = state->slabMemoryEnd = NULL;
2027  state->slabFreeHead = NULL;
2028  }
2029  state->slabAllocatorUsed = true;
2030 }
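The RELEASE_SLAB_SLOT() macro used by the merge code is defined earlier in this file. As a simplified, hypothetical sketch of the same idea (the helper name and exact form are ours, not the macro's): a freed tuple goes back on the free list only if it actually lives inside the slab arena; otherwise it was an oversized chunk palloc'd by tuplesort_readtup_alloc() and is simply pfree'd.

static inline void
release_slab_slot_sketch(Tuplesortstate *state, void *tuple)
{
	SlabSlot   *buf = (SlabSlot *) tuple;

	if ((char *) buf >= state->slabMemoryBegin &&
		(char *) buf < state->slabMemoryEnd)
	{
		/* Slot came from the arena: push it onto the singly linked free list. */
		buf->nextfree = state->slabFreeHead;
		state->slabFreeHead = buf;
	}
	else
	{
		/* Oversized tuple was allocated outside the arena; release it normally. */
		pfree(buf);
	}
}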
2031 
2032 /*
2033  * mergeruns -- merge all the completed initial runs.
2034  *
2035  * This implements the Balanced k-Way Merge Algorithm. All input data has
2036  * already been written to initial runs on tape (see dumptuples).
2037  */
2038 static void
2039 mergeruns(Tuplesortstate *state)
2040 {
2041  int tapenum;
2042 
2043  Assert(state->status == TSS_BUILDRUNS);
2044  Assert(state->memtupcount == 0);
2045 
2046  if (state->base.sortKeys != NULL && state->base.sortKeys->abbrev_converter != NULL)
2047  {
2048  /*
2049  * If there are multiple runs to be merged, when we go to read back
2050  * tuples from disk, abbreviated keys will not have been stored, and
2051  * we don't care to regenerate them. Disable abbreviation from this
2052  * point on.
2053  */
2054  state->base.sortKeys->abbrev_converter = NULL;
2055  state->base.sortKeys->comparator = state->base.sortKeys->abbrev_full_comparator;
2056 
2057  /* Not strictly necessary, but be tidy */
2058  state->base.sortKeys->abbrev_abort = NULL;
2059  state->base.sortKeys->abbrev_full_comparator = NULL;
2060  }
2061 
2062  /*
2063  * Reset tuple memory. We've freed all the tuples that we previously
2064  * allocated. We will use the slab allocator from now on.
2065  */
2066  MemoryContextResetOnly(state->base.tuplecontext);
2067 
2068  /*
2069  * We no longer need a large memtuples array. (We will allocate a smaller
2070  * one for the heap later.)
2071  */
2072  FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
2073  pfree(state->memtuples);
2074  state->memtuples = NULL;
2075 
2076  /*
2077  * Initialize the slab allocator. We need one slab slot per input tape,
2078  * for the tuples in the heap, plus one to hold the tuple last returned
2079  * from tuplesort_gettuple. (If we're sorting pass-by-val Datums,
2080  * however, we don't need to allocate anything.)
2081  *
2082  * In a multi-pass merge, we could shrink this allocation for the last
2083  * merge pass, if it has fewer tapes than previous passes, but we don't
2084  * bother.
2085  *
2086  * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism
2087  * to track memory usage of individual tuples.
2088  */
2089  if (state->base.tuples)
2090  init_slab_allocator(state, state->nOutputTapes + 1);
2091  else
2092  init_slab_allocator(state, 0);
2093 
2094  /*
2095  * Allocate a new 'memtuples' array, for the heap. It will hold one tuple
2096  * from each input tape.
2097  *
2098  * We could shrink this, too, between passes in a multi-pass merge, but we
2099  * don't bother. (The initial input tapes are still in outputTapes. The
2100  * number of input tapes will not increase between passes.)
2101  */
2102  state->memtupsize = state->nOutputTapes;
2103  state->memtuples = (SortTuple *) MemoryContextAlloc(state->base.maincontext,
2104  state->nOutputTapes * sizeof(SortTuple));
2105  USEMEM(state, GetMemoryChunkSpace(state->memtuples));
2106 
2107  /*
2108  * Use all the remaining memory we have available for tape buffers among
2109  * all the input tapes. At the beginning of each merge pass, we will
2110  * divide this memory between the input and output tapes in the pass.
2111  */
2112  state->tape_buffer_mem = state->availMem;
2113  USEMEM(state, state->tape_buffer_mem);
2114 #ifdef TRACE_SORT
2115  if (trace_sort)
2116  elog(LOG, "worker %d using %zu KB of memory for tape buffers",
2117  state->worker, state->tape_buffer_mem / 1024);
2118 #endif
2119 
2120  for (;;)
2121  {
2122  /*
2123  * On the first iteration, or if we have read all the runs from the
2124  * input tapes in a multi-pass merge, it's time to start a new pass.
2125  * Rewind all the output tapes, and make them inputs for the next
2126  * pass.
2127  */
2128  if (state->nInputRuns == 0)
2129  {
2130  int64 input_buffer_size;
2131 
2132  /* Close the old, emptied, input tapes */
2133  if (state->nInputTapes > 0)
2134  {
2135  for (tapenum = 0; tapenum < state->nInputTapes; tapenum++)
2136  LogicalTapeClose(state->inputTapes[tapenum]);
2137  pfree(state->inputTapes);
2138  }
2139 
2140  /* Previous pass's outputs become next pass's inputs. */
2141  state->inputTapes = state->outputTapes;
2142  state->nInputTapes = state->nOutputTapes;
2143  state->nInputRuns = state->nOutputRuns;
2144 
2145  /*
2146  * Reset output tape variables. The actual LogicalTapes will be
2147  * created as needed, here we only allocate the array to hold
2148  * them.
2149  */
2150  state->outputTapes = palloc0(state->nInputTapes * sizeof(LogicalTape *));
2151  state->nOutputTapes = 0;
2152  state->nOutputRuns = 0;
2153 
2154  /*
2155  * Redistribute the memory allocated for tape buffers, among the
2156  * new input and output tapes.
2157  */
2158  input_buffer_size = merge_read_buffer_size(state->tape_buffer_mem,
2159  state->nInputTapes,
2160  state->nInputRuns,
2161  state->maxTapes);
2162 
2163 #ifdef TRACE_SORT
2164  if (trace_sort)
2165  elog(LOG, "starting merge pass of %d input runs on %d tapes, " INT64_FORMAT " KB of memory for each input tape: %s",
2166  state->nInputRuns, state->nInputTapes, input_buffer_size / 1024,
2167  pg_rusage_show(&state->ru_start));
2168 #endif
2169 
2170  /* Prepare the new input tapes for merge pass. */
2171  for (tapenum = 0; tapenum < state->nInputTapes; tapenum++)
2172  LogicalTapeRewindForRead(state->inputTapes[tapenum], input_buffer_size);
2173 
2174  /*
2175  * If there's just one run left on each input tape, then only one
2176  * merge pass remains. If we don't have to produce a materialized
2177  * sorted tape, we can stop at this point and do the final merge
2178  * on-the-fly.
2179  */
2180  if ((state->base.sortopt & TUPLESORT_RANDOMACCESS) == 0
2181  && state->nInputRuns <= state->nInputTapes
2182  && !WORKER(state))
2183  {
2184  /* Tell logtape.c we won't be writing anymore */
2185  LogicalTapeSetForgetFreeSpace(state->tapeset);
2186  /* Initialize for the final merge pass */
2187  beginmerge(state);
2188  state->status = TSS_FINALMERGE;
2189  return;
2190  }
2191  }
2192 
2193  /* Select an output tape */
2194  selectnewtape(state);
2195 
2196  /* Merge one run from each input tape. */
2197  mergeonerun(state);
2198 
2199  /*
2200  * If the input tapes are empty, and we output only one output run,
2201  * we're done. The current output tape contains the final result.
2202  */
2203  if (state->nInputRuns == 0 && state->nOutputRuns <= 1)
2204  break;
2205  }
2206 
2207  /*
2208  * Done. The result is on a single run on a single tape.
2209  */
2210  state->result_tape = state->outputTapes[0];
2211  if (!WORKER(state))
2212  LogicalTapeFreeze(state->result_tape, NULL);
2213  else
2214  worker_freeze_result_tape(state);
2215  state->status = TSS_SORTEDONTAPE;
2216 
2217  /* Close all the now-empty input tapes, to release their read buffers. */
2218  for (tapenum = 0; tapenum < state->nInputTapes; tapenum++)
2219  LogicalTapeClose(state->inputTapes[tapenum]);
2220 }
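To make the pass structure concrete, suppose run generation left 20 runs spread round-robin over maxTapes = 6 tapes. The first pass starts with nInputTapes = 6 and nInputRuns = 20; repeated mergeonerun() calls consume up to 6 runs at a time, producing ceil(20 / 6) = 4 output runs on 4 tapes. The next pass then begins with nInputRuns = 4 <= nInputTapes = 4, so a non-random-access, non-worker sort stops there and performs the final 4-way merge on the fly (TSS_FINALMERGE); otherwise one more pass merges those 4 runs onto a single result tape and the sort ends in TSS_SORTEDONTAPE.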
2221 
2222 /*
2223  * Merge one run from each input tape.
2224  */
2225 static void
2226 mergeonerun(Tuplesortstate *state)
2227 {
2228  int srcTapeIndex;
2229  LogicalTape *srcTape;
2230 
2231  /*
2232  * Start the merge by loading one tuple from each active source tape into
2233  * the heap.
2234  */
2235  beginmerge(state);
2236 
2237  Assert(state->slabAllocatorUsed);
2238 
2239  /*
2240  * Execute merge by repeatedly extracting lowest tuple in heap, writing it
2241  * out, and replacing it with next tuple from same tape (if there is
2242  * another one).
2243  */
2244  while (state->memtupcount > 0)
2245  {
2246  SortTuple stup;
2247 
2248  /* write the tuple to destTape */
2249  srcTapeIndex = state->memtuples[0].srctape;
2250  srcTape = state->inputTapes[srcTapeIndex];
2251  WRITETUP(state, state->destTape, &state->memtuples[0]);
2252 
2253  /* recycle the slot of the tuple we just wrote out, for the next read */
2254  if (state->memtuples[0].tuple)
2255  RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple);
2256 
2257  /*
2258  * pull next tuple from the tape, and replace the written-out tuple in
2259  * the heap with it.
2260  */
2261  if (mergereadnext(state, srcTape, &stup))
2262  {
2263  stup.srctape = srcTapeIndex;
2264  tuplesort_heap_replace_top(state, &stup);
2265  }
2266  else
2267  {
2269  state->nInputRuns--;
2270  }
2271  }
2272 
2273  /*
2274  * When the heap empties, we're done. Write an end-of-run marker on the
2275  * output tape.
2276  */
2277  markrunend(state->destTape);
2278 }
2279 
2280 /*
2281  * beginmerge - initialize for a merge pass
2282  *
2283  * Fill the merge heap with the first tuple from each input tape.
2284  */
2285 static void
2286 beginmerge(Tuplesortstate *state)
2287 {
2288  int activeTapes;
2289  int srcTapeIndex;
2290 
2291  /* Heap should be empty here */
2292  Assert(state->memtupcount == 0);
2293 
2294  activeTapes = Min(state->nInputTapes, state->nInputRuns);
2295 
2296  for (srcTapeIndex = 0; srcTapeIndex < activeTapes; srcTapeIndex++)
2297  {
2298  SortTuple tup;
2299 
2300  if (mergereadnext(state, state->inputTapes[srcTapeIndex], &tup))
2301  {
2302  tup.srctape = srcTapeIndex;
2303  tuplesort_heap_insert(state, &tup);
2304  }
2305  }
2306 }
2307 
2308 /*
2309  * mergereadnext - read next tuple from one merge input tape
2310  *
2311  * Returns false on EOF.
2312  */
2313 static bool
2314 mergereadnext(Tuplesortstate *state, LogicalTape *srcTape, SortTuple *stup)
2315 {
2316  unsigned int tuplen;
2317 
2318  /* read next tuple, if any */
2319  if ((tuplen = getlen(srcTape, true)) == 0)
2320  return false;
2321  READTUP(state, stup, srcTape, tuplen);
2322 
2323  return true;
2324 }
2325 
2326 /*
2327  * dumptuples - remove tuples from memtuples and write initial run to tape
2328  *
2329  * When alltuples = true, dump everything currently in memory. (This case is
2330  * only used at end of input data.)
2331  */
2332 static void
2333 dumptuples(Tuplesortstate *state, bool alltuples)
2334 {
2335  int memtupwrite;
2336  int i;
2337 
2338  /*
2339  * Nothing to do if we still fit in available memory and have array slots,
2340  * unless this is the final call during initial run generation.
2341  */
2342  if (state->memtupcount < state->memtupsize && !LACKMEM(state) &&
2343  !alltuples)
2344  return;
2345 
2346  /*
2347  * Final call might require no sorting, in rare cases where we just so
2348  * happen to have previously LACKMEM()'d at the point where exactly all
2349  * remaining tuples are loaded into memory, just before input was
2350  * exhausted. In general, short final runs are quite possible, but avoid
2351  * creating a completely empty run. In a worker, though, we must produce
2352  * at least one tape, even if it's empty.
2353  */
2354  if (state->memtupcount == 0 && state->currentRun > 0)
2355  return;
2356 
2357  Assert(state->status == TSS_BUILDRUNS);
2358 
2359  /*
2360  * It seems unlikely that this limit will ever be exceeded, but take no
2361  * chances
2362  */
2363  if (state->currentRun == INT_MAX)
2364  ereport(ERROR,
2365  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2366  errmsg("cannot have more than %d runs for an external sort",
2367  INT_MAX)));
2368 
2369  if (state->currentRun > 0)
2370  selectnewtape(state);
2371 
2372  state->currentRun++;
2373 
2374 #ifdef TRACE_SORT
2375  if (trace_sort)
2376  elog(LOG, "worker %d starting quicksort of run %d: %s",
2377  state->worker, state->currentRun,
2378  pg_rusage_show(&state->ru_start));
2379 #endif
2380 
2381  /*
2382  * Sort all tuples accumulated within the allowed amount of memory for
2383  * this run using quicksort
2384  */
2385  tuplesort_sort_memtuples(state);
2386 
2387 #ifdef TRACE_SORT
2388  if (trace_sort)
2389  elog(LOG, "worker %d finished quicksort of run %d: %s",
2390  state->worker, state->currentRun,
2391  pg_rusage_show(&state->ru_start));
2392 #endif
2393 
2394  memtupwrite = state->memtupcount;
2395  for (i = 0; i < memtupwrite; i++)
2396  {
2397  SortTuple *stup = &state->memtuples[i];
2398 
2399  WRITETUP(state, state->destTape, stup);
2400 
2401  /*
2402  * Account for freeing the tuple, but no need to do the actual pfree
2403  * since the tuplecontext is being reset after the loop.
2404  */
2405  if (stup->tuple != NULL)
2406  FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
2407  }
2408 
2409  state->memtupcount = 0;
2410 
2411  /*
2412  * Reset tuple memory. We've freed all of the tuples that we previously
2413  * allocated. It's important to avoid fragmentation when there is a stark
2414  * change in the sizes of incoming tuples. Fragmentation due to
2415  * AllocSetFree's bucketing by size class might be particularly bad if
2416  * this step wasn't taken.
2417  */
2418  MemoryContextReset(state->base.tuplecontext);
2419 
2420  markrunend(state->destTape);
2421 
2422 #ifdef TRACE_SORT
2423  if (trace_sort)
2424  elog(LOG, "worker %d finished writing run %d to tape %d: %s",
2425  state->worker, state->currentRun, (state->currentRun - 1) % state->nOutputTapes + 1,
2426  pg_rusage_show(&state->ru_start));
2427 #endif
2428 }
2429 
2430 /*
2431  * tuplesort_rescan - rewind and replay the scan
2432  */
2433 void
2434 tuplesort_rescan(Tuplesortstate *state)
2435 {
2436  MemoryContext oldcontext = MemoryContextSwitchTo(state->base.sortcontext);
2437 
2438  Assert(state->base.sortopt & TUPLESORT_RANDOMACCESS);
2439 
2440  switch (state->status)
2441  {
2442  case TSS_SORTEDINMEM:
2443  state->current = 0;
2444  state->eof_reached = false;
2445  state->markpos_offset = 0;
2446  state->markpos_eof = false;
2447  break;
2448  case TSS_SORTEDONTAPE:
2449  LogicalTapeRewindForRead(state->result_tape, 0);
2450  state->eof_reached = false;
2451  state->markpos_block = 0L;
2452  state->markpos_offset = 0;
2453  state->markpos_eof = false;
2454  break;
2455  default:
2456  elog(ERROR, "invalid tuplesort state");
2457  break;
2458  }
2459 
2460  MemoryContextSwitchTo(oldcontext);
2461 }
2462 
2463 /*
2464  * tuplesort_markpos - saves current position in the merged sort file
2465  */
2466 void
2467 tuplesort_markpos(Tuplesortstate *state)
2468 {
2469  MemoryContext oldcontext = MemoryContextSwitchTo(state->base.sortcontext);
2470 
2471  Assert(state->base.sortopt & TUPLESORT_RANDOMACCESS);
2472 
2473  switch (state->status)
2474  {
2475  case TSS_SORTEDINMEM:
2476  state->markpos_offset = state->current;
2477  state->markpos_eof = state->eof_reached;
2478  break;
2479  case TSS_SORTEDONTAPE:
2480  LogicalTapeTell(state->result_tape,
2481  &state->markpos_block,
2482  &state->markpos_offset);
2483  state->markpos_eof = state->eof_reached;
2484  break;
2485  default:
2486  elog(ERROR, "invalid tuplesort state");
2487  break;
2488  }
2489 
2490  MemoryContextSwitchTo(oldcontext);
2491 }
2492 
2493 /*
2494  * tuplesort_restorepos - restores current position in merged sort file to
2495  * last saved position
2496  */
2497 void
2498 tuplesort_restorepos(Tuplesortstate *state)
2499 {
2500  MemoryContext oldcontext = MemoryContextSwitchTo(state->base.sortcontext);
2501 
2502  Assert(state->base.sortopt & TUPLESORT_RANDOMACCESS);
2503 
2504  switch (state->status)
2505  {
2506  case TSS_SORTEDINMEM:
2507  state->current = state->markpos_offset;
2508  state->eof_reached = state->markpos_eof;
2509  break;
2510  case TSS_SORTEDONTAPE:
2511  LogicalTapeSeek(state->result_tape,
2512  state->markpos_block,
2513  state->markpos_offset);
2514  state->eof_reached = state->markpos_eof;
2515  break;
2516  default:
2517  elog(ERROR, "invalid tuplesort state");
2518  break;
2519  }
2520 
2521  MemoryContextSwitchTo(oldcontext);
2522 }
2523 
2524 /*
2525  * tuplesort_get_stats - extract summary statistics
2526  *
2527  * This can be called after tuplesort_performsort() finishes to obtain
2528  * printable summary information about how the sort was performed.
2529  */
2530 void
2531 tuplesort_get_stats(Tuplesortstate *state,
2532  TuplesortInstrumentation *stats)
2533 {
2534  /*
2535  * Note: it might seem we should provide both memory and disk usage for a
2536  * disk-based sort. However, the current code doesn't track memory space
2537  * accurately once we have begun to return tuples to the caller (since we
2538  * don't account for pfree's the caller is expected to do), so we cannot
2539  * rely on availMem in a disk sort. This does not seem worth the overhead
2540  * to fix. Is it worth creating an API for the memory context code to
2541  * tell us how much is actually used in sortcontext?
2542  */
2543  tuplesort_updatemax(state);
2544 
2545  if (state->isMaxSpaceDisk)
2546  stats->spaceType = SORT_SPACE_TYPE_DISK;
2547  else
2548  stats->spaceType = SORT_SPACE_TYPE_MEMORY;
2549  stats->spaceUsed = (state->maxSpace + 1023) / 1024;
2550 
2551  switch (state->maxSpaceStatus)
2552  {
2553  case TSS_SORTEDINMEM:
2554  if (state->boundUsed)
2555  stats->sortMethod = SORT_TYPE_TOP_N_HEAPSORT;
2556  else
2557  stats->sortMethod = SORT_TYPE_QUICKSORT;
2558  break;
2559  case TSS_SORTEDONTAPE:
2560  stats->sortMethod = SORT_TYPE_EXTERNAL_SORT;
2561  break;
2562  case TSS_FINALMERGE:
2563  stats->sortMethod = SORT_TYPE_EXTERNAL_MERGE;
2564  break;
2565  default:
2566  stats->sortMethod = SORT_TYPE_STILL_IN_PROGRESS;
2567  break;
2568  }
2569 }
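A minimal caller-side sketch of how the statistics routines fit together (a hypothetical call site; the real consumers are the EXPLAIN ANALYZE support elsewhere in the tree):

	TuplesortInstrumentation stats;

	tuplesort_performsort(state);
	tuplesort_get_stats(state, &stats);
	elog(LOG, "Sort Method: %s  %s: " INT64_FORMAT "kB",
		 tuplesort_method_name(stats.sortMethod),
		 tuplesort_space_type_name(stats.spaceType),
		 stats.spaceUsed);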
2570 
2571 /*
2572  * Convert TuplesortMethod to a string.
2573  */
2574 const char *
2575 tuplesort_method_name(TuplesortMethod m)
2576 {
2577  switch (m)
2578  {
2579  case SORT_TYPE_STILL_IN_PROGRESS:
2580  return "still in progress";
2581  case SORT_TYPE_TOP_N_HEAPSORT:
2582  return "top-N heapsort";
2583  case SORT_TYPE_QUICKSORT:
2584  return "quicksort";
2585  case SORT_TYPE_EXTERNAL_SORT:
2586  return "external sort";
2587  case SORT_TYPE_EXTERNAL_MERGE:
2588  return "external merge";
2589  }
2590 
2591  return "unknown";
2592 }
2593 
2594 /*
2595  * Convert TuplesortSpaceType to a string.
2596  */
2597 const char *
2598 tuplesort_space_type_name(TuplesortSpaceType t)
2599 {
2600  Assert(t == SORT_SPACE_TYPE_DISK || t == SORT_SPACE_TYPE_MEMORY);
2601  return t == SORT_SPACE_TYPE_DISK ? "Disk" : "Memory";
2602 }
2603 
2604 
2605 /*
2606  * Heap manipulation routines, per Knuth's Algorithm 5.2.3H.
2607  */
2608 
2609 /*
2610  * Convert the existing unordered array of SortTuples to a bounded heap,
2611  * discarding all but the smallest "state->bound" tuples.
2612  *
2613  * When working with a bounded heap, we want to keep the largest entry
2614  * at the root (array entry zero), instead of the smallest as in the normal
2615  * sort case. This allows us to discard the largest entry cheaply.
2616  * Therefore, we temporarily reverse the sort direction.
2617  */
2618 static void
2619 make_bounded_heap(Tuplesortstate *state)
2620 {
2621  int tupcount = state->memtupcount;
2622  int i;
2623 
2624  Assert(state->status == TSS_INITIAL);
2625  Assert(state->bounded);
2626  Assert(tupcount >= state->bound);
2627  Assert(SERIAL(state));
2628 
2629  /* Reverse sort direction so largest entry will be at root */
2630  reversedirection(state);
2631 
2632  state->memtupcount = 0; /* make the heap empty */
2633  for (i = 0; i < tupcount; i++)
2634  {
2635  if (state->memtupcount < state->bound)
2636  {
2637  /* Insert next tuple into heap */
2638  /* Must copy source tuple to avoid possible overwrite */
2639  SortTuple stup = state->memtuples[i];
2640 
2641  tuplesort_heap_insert(state, &stup);
2642  }
2643  else
2644  {
2645  /*
2646  * The heap is full. Replace the largest entry with the new
2647  * tuple, or just discard it, if it's larger than anything already
2648  * in the heap.
2649  */
2650  if (COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0)
2651  {
2652  free_sort_tuple(state, &state->memtuples[i]);
2653  CHECK_FOR_INTERRUPTS();
2654  }
2655  else
2656  tuplesort_heap_replace_top(state, &state->memtuples[i]);
2657  }
2658  }
2659 
2660  Assert(state->memtupcount == state->bound);
2661  state->status = TSS_BOUNDED;
2662 }
2663 
2664 /*
2665  * Convert the bounded heap to a properly-sorted array
2666  */
2667 static void
2668 sort_bounded_heap(Tuplesortstate *state)
2669 {
2670  int tupcount = state->memtupcount;
2671 
2672  Assert(state->status == TSS_BOUNDED);
2673  Assert(state->bounded);
2674  Assert(tupcount == state->bound);
2675  Assert(SERIAL(state));
2676 
2677  /*
2678  * We can unheapify in place because each delete-top call will remove the
2679  * largest entry, which we can promptly store in the newly freed slot at
2680  * the end. Once we're down to a single-entry heap, we're done.
2681  */
2682  while (state->memtupcount > 1)
2683  {
2684  SortTuple stup = state->memtuples[0];
2685 
2686  /* this sifts-up the next-largest entry and decreases memtupcount */
2687  tuplesort_heap_delete_top(state);
2688  state->memtuples[state->memtupcount] = stup;
2689  }
2690  state->memtupcount = tupcount;
2691 
2692  /*
2693  * Reverse sort direction back to the original state. This is not
2694  * actually necessary but seems like a good idea for tidiness.
2695  */
2696  reversedirection(state);
2697 
2698  state->status = TSS_SORTEDINMEM;
2699  state->boundUsed = true;
2700 }
2701 
2702 /*
2703  * Sort all memtuples using specialized qsort() routines.
2704  *
2705  * Quicksort is used for small in-memory sorts, and external sort runs.
2706  */
2707 static void
2708 tuplesort_sort_memtuples(Tuplesortstate *state)
2709 {
2710  Assert(!LEADER(state));
2711 
2712  if (state->memtupcount > 1)
2713  {
2714  /*
2715  * Do we have the leading column's value or abbreviation in datum1,
2716  * and is there a specialization for its comparator?
2717  */
2718  if (state->base.haveDatum1 && state->base.sortKeys)
2719  {
2720  if (state->base.sortKeys[0].comparator == ssup_datum_unsigned_cmp)
2721  {
2722  qsort_tuple_unsigned(state->memtuples,
2723  state->memtupcount,
2724  state);
2725  return;
2726  }
2727 #if SIZEOF_DATUM >= 8
2728  else if (state->base.sortKeys[0].comparator == ssup_datum_signed_cmp)
2729  {
2730  qsort_tuple_signed(state->memtuples,
2731  state->memtupcount,
2732  state);
2733  return;
2734  }
2735 #endif
2736  else if (state->base.sortKeys[0].comparator == ssup_datum_int32_cmp)
2737  {
2738  qsort_tuple_int32(state->memtuples,
2739  state->memtupcount,
2740  state);
2741  return;
2742  }
2743  }
2744 
2745  /* Can we use the single-key sort function? */
2746  if (state->base.onlyKey != NULL)
2747  {
2748  qsort_ssup(state->memtuples, state->memtupcount,
2749  state->base.onlyKey);
2750  }
2751  else
2752  {
2753  qsort_tuple(state->memtuples,
2754  state->memtupcount,
2755  state->base.comparetup,
2756  state);
2757  }
2758  }
2759 }
2760 
2761 /*
2762  * Insert a new tuple into an empty or existing heap, maintaining the
2763  * heap invariant. Caller is responsible for ensuring there's room.
2764  *
2765  * Note: For some callers, tuple points to a memtuples[] entry above the
2766  * end of the heap. This is safe as long as it's not immediately adjacent
2767  * to the end of the heap (ie, in the [memtupcount] array entry) --- if it
2768  * is, it might get overwritten before being moved into the heap!
2769  */
2770 static void
2771 tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple)
2772 {
2773  SortTuple *memtuples;
2774  int j;
2775 
2776  memtuples = state->memtuples;
2777  Assert(state->memtupcount < state->memtupsize);
2778 
2779  CHECK_FOR_INTERRUPTS();
2780 
2781  /*
2782  * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is
2783  * using 1-based array indexes, not 0-based.
2784  */
2785  j = state->memtupcount++;
2786  while (j > 0)
2787  {
2788  int i = (j - 1) >> 1;
2789 
2790  if (COMPARETUP(state, tuple, &memtuples[i]) >= 0)
2791  break;
2792  memtuples[j] = memtuples[i];
2793  j = i;
2794  }
2795  memtuples[j] = *tuple;
2796 }
2797 
2798 /*
2799  * Remove the tuple at state->memtuples[0] from the heap. Decrement
2800  * memtupcount, and sift up to maintain the heap invariant.
2801  *
2802  * The caller has already free'd the tuple the top node points to,
2803  * if necessary.
2804  */
2805 static void
2806 tuplesort_heap_delete_top(Tuplesortstate *state)
2807 {
2808  SortTuple *memtuples = state->memtuples;
2809  SortTuple *tuple;
2810 
2811  if (--state->memtupcount <= 0)
2812  return;
2813 
2814  /*
2815  * Remove the last tuple in the heap, and re-insert it, by replacing the
2816  * current top node with it.
2817  */
2818  tuple = &memtuples[state->memtupcount];
2819  tuplesort_heap_replace_top(state, tuple);
2820 }
2821 
2822 /*
2823  * Replace the tuple at state->memtuples[0] with a new tuple. Sift up to
2824  * maintain the heap invariant.
2825  *
2826  * This corresponds to Knuth's "sift-up" algorithm (Algorithm 5.2.3H,
2827  * Heapsort, steps H3-H8).
2828  */
2829 static void
2830 tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple)
2831 {
2832  SortTuple *memtuples = state->memtuples;
2833  unsigned int i,
2834  n;
2835 
2836  Assert(state->memtupcount >= 1);
2837 
2838  CHECK_FOR_INTERRUPTS();
2839 
2840  /*
2841  * state->memtupcount is "int", but we use "unsigned int" for i, j, n.
2842  * This prevents overflow in the "2 * i + 1" calculation, since at the top
2843  * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2.
2844  */
2845  n = state->memtupcount;
2846  i = 0; /* i is where the "hole" is */
2847  for (;;)
2848  {
2849  unsigned int j = 2 * i + 1;
2850 
2851  if (j >= n)
2852  break;
2853  if (j + 1 < n &&
2854  COMPARETUP(state, &memtuples[j], &memtuples[j + 1]) > 0)
2855  j++;
2856  if (COMPARETUP(state, tuple, &memtuples[j]) <= 0)
2857  break;
2858  memtuples[i] = memtuples[j];
2859  i = j;
2860  }
2861  memtuples[i] = *tuple;
2862 }
2863 
2864 /*
2865  * Function to reverse the sort direction from its current state
2866  *
2867  * It is not safe to call this when performing hash tuplesorts
2868  */
2869 static void
2870 reversedirection(Tuplesortstate *state)
2871 {
2872  SortSupport sortKey = state->base.sortKeys;
2873  int nkey;
2874 
2875  for (nkey = 0; nkey < state->base.nKeys; nkey++, sortKey++)
2876  {
2877  sortKey->ssup_reverse = !sortKey->ssup_reverse;
2878  sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first;
2879  }
2880 }
2881 
2882 
2883 /*
2884  * Tape interface routines
2885  */
2886 
2887 static unsigned int
2888 getlen(LogicalTape *tape, bool eofOK)
2889 {
2890  unsigned int len;
2891 
2892  if (LogicalTapeRead(tape,
2893  &len, sizeof(len)) != sizeof(len))
2894  elog(ERROR, "unexpected end of tape");
2895  if (len == 0 && !eofOK)
2896  elog(ERROR, "unexpected end of data");
2897  return len;
2898 }
2899 
2900 static void
2901 markrunend(LogicalTape *tape)
2902 {
2903  unsigned int len = 0;
2904 
2905  LogicalTapeWrite(tape, &len, sizeof(len));
2906 }
2907 
2908 /*
2909  * Get memory for tuple from within READTUP() routine.
2910  *
2911  * We use next free slot from the slab allocator, or palloc() if the tuple
2912  * is too large for that.
2913  */
2914 void *
2915 tuplesort_readtup_alloc(Tuplesortstate *state, Size tuplen)
2916 {
2917  SlabSlot *buf;
2918 
2919  /*
2920  * We pre-allocate enough slots in the slab arena that we should never run
2921  * out.
2922  */
2923  Assert(state->slabFreeHead);
2924 
2925  if (tuplen > SLAB_SLOT_SIZE || !state->slabFreeHead)
2926  return MemoryContextAlloc(state->base.sortcontext, tuplen);
2927  else
2928  {
2929  buf = state->slabFreeHead;
2930  /* Reuse this slot */
2931  state->slabFreeHead = buf->nextfree;
2932 
2933  return buf;
2934  }
2935 }
2936 
2937 
2938 /*
2939  * Parallel sort routines
2940  */
2941 
2942 /*
2943  * tuplesort_estimate_shared - estimate required shared memory allocation
2944  *
2945  * nWorkers is an estimate of the number of workers (it's the number that
2946  * will be requested).
2947  */
2948 Size
2949 tuplesort_estimate_shared(int nWorkers)
2950 {
2951  Size tapesSize;
2952 
2953  Assert(nWorkers > 0);
2954 
2955  /* Make sure that BufFile shared state is MAXALIGN'd */
2956  tapesSize = mul_size(sizeof(TapeShare), nWorkers);
2957  tapesSize = MAXALIGN(add_size(tapesSize, offsetof(Sharedsort, tapes)));
2958 
2959  return tapesSize;
2960 }
2961 
2962 /*
2963  * tuplesort_initialize_shared - initialize shared tuplesort state
2964  *
2965  * Must be called from leader process before workers are launched, to
2966  * establish state needed up-front for worker tuplesortstates. nWorkers
2967  * should match the argument passed to tuplesort_estimate_shared().
2968  */
2969 void
2970 tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg)
2971 {
2972  int i;
2973 
2974  Assert(nWorkers > 0);
2975 
2976  SpinLockInit(&shared->mutex);
2977  shared->currentWorker = 0;
2978  shared->workersFinished = 0;
2979  SharedFileSetInit(&shared->fileset, seg);
2980  shared->nTapes = nWorkers;
2981  for (i = 0; i < nWorkers; i++)
2982  {
2983  shared->tapes[i].firstblocknumber = 0L;
2984  }
2985 }
2986 
2987 /*
2988  * tuplesort_attach_shared - attach to shared tuplesort state
2989  *
2990  * Must be called by all worker processes.
2991  */
2992 void
2993 tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg)
2994 {
2995  /* Attach to SharedFileSet */
2996  SharedFileSetAttach(&shared->fileset, seg);
2997 }
2998 
2999 /*
3000  * worker_get_identifier - Assign and return ordinal identifier for worker
3001  *
3002  * The order in which these are assigned is not well defined, and should not
3003  * matter; worker numbers across parallel sort participants need only be
3004  * distinct and gapless. logtape.c requires this.
3005  *
3006  * Note that the identifiers assigned from here have no relation to
3007  * ParallelWorkerNumber number, to avoid making any assumption about
3008  * caller's requirements. However, we do follow the ParallelWorkerNumber
3009  * convention of representing a non-worker with worker number -1. This
3010  * includes the leader, as well as serial Tuplesort processes.
3011  */
3012 static int
3013 worker_get_identifier(Tuplesortstate *state)
3014 {
3015  Sharedsort *shared = state->shared;
3016  int worker;
3017 
3018  Assert(WORKER(state));
3019 
3020  SpinLockAcquire(&shared->mutex);
3021  worker = shared->currentWorker++;
3022  SpinLockRelease(&shared->mutex);
3023 
3024  return worker;
3025 }
3026 
3027 /*
3028  * worker_freeze_result_tape - freeze worker's result tape for leader
3029  *
3030  * This is called by workers just after the result tape has been determined,
3031  * instead of calling LogicalTapeFreeze() directly. They do so because
3032  * workers require a few additional steps over similar serial
3033  * TSS_SORTEDONTAPE external sort cases, which also happen here. The extra
3034  * steps are around freeing now unneeded resources, and representing to
3035  * leader that worker's input run is available for its merge.
3036  *
3037  * There should only be one final output run for each worker, which consists
3038  * of all tuples that were originally input into the worker.
3039  */
3040 static void
3041 worker_freeze_result_tape(Tuplesortstate *state)
3042 {
3043  Sharedsort *shared = state->shared;
3044  TapeShare output;
3045 
3046  Assert(WORKER(state));
3047  Assert(state->result_tape != NULL);
3048  Assert(state->memtupcount == 0);
3049 
3050  /*
3051  * Free most remaining memory, in case caller is sensitive to our holding
3052  * on to it. memtuples may not be a tiny merge heap at this point.
3053  */
3054  pfree(state->memtuples);
3055  /* Be tidy */
3056  state->memtuples = NULL;
3057  state->memtupsize = 0;
3058 
3059  /*
3060  * Parallel worker requires result tape metadata, which is to be stored in
3061  * shared memory for leader
3062  */
3063  LogicalTapeFreeze(state->result_tape, &output);
3064 
3065  /* Store properties of output tape, and update finished worker count */
3066  SpinLockAcquire(&shared->mutex);
3067  shared->tapes[state->worker] = output;
3068  shared->workersFinished++;
3069  SpinLockRelease(&shared->mutex);
3070 }
3071 
3072 /*
3073  * worker_nomergeruns - dump memtuples in worker, without merging
3074  *
3075  * This is called as an alternative to mergeruns() with a worker when no
3076  * merging is required.
3077  */
3078 static void
3079 worker_nomergeruns(Tuplesortstate *state)
3080 {
3081  Assert(WORKER(state));
3082  Assert(state->result_tape == NULL);
3083  Assert(state->nOutputRuns == 1);
3084 
3085  state->result_tape = state->destTape;
3086  worker_freeze_result_tape(state);
3087 }
3088 
3089 /*
3090  * leader_takeover_tapes - create tapeset for leader from worker tapes
3091  *
3092  * So far, leader Tuplesortstate has performed no actual sorting. By now, all
3093  * sorting has occurred in workers, all of which must have already returned
3094  * from tuplesort_performsort().
3095  *
3096  * When this returns, leader process is left in a state that is virtually
3097  * indistinguishable from it having generated runs as a serial external sort
3098  * might have.
3099  */
3100 static void
3101 leader_takeover_tapes(Tuplesortstate *state)
3102 {
3103  Sharedsort *shared = state->shared;
3104  int nParticipants = state->nParticipants;
3105  int workersFinished;
3106  int j;
3107 
3108  Assert(LEADER(state));
3109  Assert(nParticipants >= 1);
3110 
3111  SpinLockAcquire(&shared->mutex);
3112  workersFinished = shared->workersFinished;
3113  SpinLockRelease(&shared->mutex);
3114 
3115  if (nParticipants != workersFinished)
3116  elog(ERROR, "cannot take over tapes before all workers finish");
3117 
3118  /*
3119  * Create the tapeset from worker tapes, including a leader-owned tape at
3120  * the end. Parallel workers are far more expensive than logical tapes,
3121  * so the number of tapes allocated here should never be excessive.
3122  */
3123  inittapestate(state, nParticipants);
3124  state->tapeset = LogicalTapeSetCreate(false, &shared->fileset, -1);
3125 
3126  /*
3127  * Set currentRun to reflect the number of runs we will merge (it's not
3128  * used for anything, this is just pro forma)
3129  */
3130  state->currentRun = nParticipants;
3131 
3132  /*
3133  * Initialize the state to look the same as after building the initial
3134  * runs.
3135  *
3136  * There will always be exactly 1 run per worker, and exactly one input
3137  * tape per run, because workers always output exactly 1 run, even when
3138  * there were no input tuples for workers to sort.
3139  */
3140  state->inputTapes = NULL;
3141  state->nInputTapes = 0;
3142  state->nInputRuns = 0;
3143 
3144  state->outputTapes = palloc0(nParticipants * sizeof(LogicalTape *));
3145  state->nOutputTapes = nParticipants;
3146  state->nOutputRuns = nParticipants;
3147 
3148  for (j = 0; j < nParticipants; j++)
3149  {
3150  state->outputTapes[j] = LogicalTapeImport(state->tapeset, j, &shared->tapes[j]);
3151  }
3152 
3153  state->status = TSS_BUILDRUNS;
3154 }
3155 
3156 /*
3157  * Convenience routine to free a tuple previously loaded into sort memory
3158  */
3159 static void
3160 free_sort_tuple(Tuplesortstate *state, SortTuple *stup)
3161 {
3162  if (stup->tuple)
3163  {
3164  FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
3165  pfree(stup->tuple);
3166  stup->tuple = NULL;
3167  }
3168 }
3169 
3170 int
3171 ssup_datum_unsigned_cmp(Datum x, Datum y, SortSupport ssup)
3172 {
3173  if (x < y)
3174  return -1;
3175  else if (x > y)
3176  return 1;
3177  else
3178  return 0;
3179 }
3180 
3181 #if SIZEOF_DATUM >= 8
3182 int
3183 ssup_datum_signed_cmp(Datum x, Datum y, SortSupport ssup)
3184 {
3185  int64 xx = DatumGetInt64(x);
3186  int64 yy = DatumGetInt64(y);
3187 
3188  if (xx < yy)
3189  return -1;
3190  else if (xx > yy)
3191  return 1;
3192  else
3193  return 0;
3194 }
3195 #endif
3196 
3197 int
3198 ssup_datum_int32_cmp(Datum x, Datum y, SortSupport ssup)
3199 {
3200  int32 xx = DatumGetInt32(x);
3201  int32 yy = DatumGetInt32(y);
3202 
3203  if (xx < yy)
3204  return -1;
3205  else if (xx > yy)
3206  return 1;
3207  else
3208  return 0;
3209 }