tuplesort.c
1 /*-------------------------------------------------------------------------
2  *
3  * tuplesort.c
4  * Generalized tuple sorting routines.
5  *
6  * This module handles sorting of heap tuples, index tuples, or single
7  * Datums (and could easily support other kinds of sortable objects,
8  * if necessary). It works efficiently for both small and large amounts
9  * of data. Small amounts are sorted in-memory using qsort(). Large
10  * amounts are sorted using temporary files and a standard external sort
11  * algorithm.
12  *
13  * See Knuth, volume 3, for more than you want to know about the external
14  * sorting algorithm. Historically, we divided the input into sorted runs
15  * using replacement selection, in the form of a priority tree implemented
16  * as a heap (essentially his Algorithm 5.2.3H), but now we always use
17  * quicksort for run generation. We merge the runs using polyphase merge,
18  * Knuth's Algorithm 5.4.2D. The logical "tapes" used by Algorithm D are
19  * implemented by logtape.c, which avoids space wastage by recycling disk
20  * space as soon as each block is read from its "tape".
21  *
22  * The approximate amount of memory allowed for any one sort operation
23  * is specified in kilobytes by the caller (most pass work_mem). Initially,
24  * we absorb tuples and simply store them in an unsorted array as long as
25  * we haven't exceeded workMem. If we reach the end of the input without
26  * exceeding workMem, we sort the array using qsort() and subsequently return
27  * tuples just by scanning the tuple array sequentially. If we do exceed
28  * workMem, we begin to emit tuples into sorted runs in temporary tapes.
29  * When tuples are dumped in batch after quicksorting, we begin a new run
30  * with a new output tape (selected per Algorithm D). After the end of the
31  * input is reached, we dump out remaining tuples in memory into a final run,
32  * then merge the runs using Algorithm D.
33  *
34  * When merging runs, we use a heap containing just the frontmost tuple from
35  * each source run; we repeatedly output the smallest tuple and replace it
36  * with the next tuple from its source tape (if any). When the heap empties,
37  * the merge is complete. The basic merge algorithm thus needs very little
38  * memory --- only M tuples for an M-way merge, and M is constrained to a
39  * small number. However, we can still make good use of our full workMem
40  * allocation by pre-reading additional blocks from each source tape. Without
41  * prereading, our access pattern to the temporary file would be very erratic;
42  * on average we'd read one block from each of M source tapes during the same
43  * time that we're writing M blocks to the output tape, so there is no
44  * sequentiality of access at all, defeating the read-ahead methods used by
45  * most Unix kernels. Worse, the output tape gets written into a very random
46  * sequence of blocks of the temp file, ensuring that things will be even
47  * worse when it comes time to read that tape. A straightforward merge pass
48  * thus ends up doing a lot of waiting for disk seeks. We can improve matters
49  * by prereading from each source tape sequentially, loading about workMem/M
50  * bytes from each tape in turn, and making the sequential blocks immediately
51  * available for reuse. This approach helps to localize both read and write
52  * accesses. The pre-reading is handled by logtape.c; we just tell it how
53  * much memory to use for the buffers.
54  *
55  * When the caller requests random access to the sort result, we form
56  * the final sorted run on a logical tape which is then "frozen", so
57  * that we can access it randomly. When the caller does not need random
58  * access, we return from tuplesort_performsort() as soon as we are down
59  * to one run per logical tape. The final merge is then performed
60  * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this
61  * saves one cycle of writing all the data out to disk and reading it in.
62  *
63  * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the
64  * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according
65  * to Knuth's figure 70 (section 5.4.2). However, Knuth is assuming that
66  * tape drives are expensive beasts, and in particular that there will always
67  * be many more runs than tape drives. In our implementation a "tape drive"
68  * doesn't cost much more than a few Kb of memory buffers, so we can afford
69  * to have lots of them. In particular, if we can have as many tape drives
70  * as sorted runs, we can eliminate any repeated I/O at all. In the current
71  * code we determine the number of tapes M on the basis of workMem: we want
72  * workMem/M to be large enough that we read a fair amount of data each time
73  * we preread from a tape, so as to maintain the locality of access described
74  * above. Nonetheless, with large workMem we can have many tapes (but not
75  * too many -- see the comments in tuplesort_merge_order).
76  *
77  *
78  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
79  * Portions Copyright (c) 1994, Regents of the University of California
80  *
81  * IDENTIFICATION
82  * src/backend/utils/sort/tuplesort.c
83  *
84  *-------------------------------------------------------------------------
85  */
86 
87 #include "postgres.h"
88 
89 #include <limits.h>
90 
91 #include "access/htup_details.h"
92 #include "access/nbtree.h"
93 #include "access/hash.h"
94 #include "catalog/index.h"
95 #include "catalog/pg_am.h"
96 #include "commands/tablespace.h"
97 #include "executor/executor.h"
98 #include "miscadmin.h"
99 #include "pg_trace.h"
100 #include "utils/datum.h"
101 #include "utils/logtape.h"
102 #include "utils/lsyscache.h"
103 #include "utils/memutils.h"
104 #include "utils/pg_rusage.h"
105 #include "utils/rel.h"
106 #include "utils/sortsupport.h"
107 #include "utils/tuplesort.h"
108 
109 
110 /* sort-type codes for sort__start probes */
111 #define HEAP_SORT 0
112 #define INDEX_SORT 1
113 #define DATUM_SORT 2
114 #define CLUSTER_SORT 3
115 
116 /* GUC variables */
117 #ifdef TRACE_SORT
118 bool trace_sort = false;
119 #endif
120 
121 #ifdef DEBUG_BOUNDED_SORT
122 bool optimize_bounded_sort = true;
123 #endif
124 
125 
126 /*
127  * The objects we actually sort are SortTuple structs. These contain
128  * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple),
129  * which is a separate palloc chunk --- we assume it is just one chunk and
130  * can be freed by a simple pfree() (except during merge, when we use a
131  * simple slab allocator). SortTuples also contain the tuple's first key
132  * column in Datum/nullflag format, and an index integer.
133  *
134  * Storing the first key column lets us save heap_getattr or index_getattr
135  * calls during tuple comparisons. We could extract and save all the key
136  * columns not just the first, but this would increase code complexity and
137  * overhead, and wouldn't actually save any comparison cycles in the common
138  * case where the first key determines the comparison result. Note that
139  * for a pass-by-reference datatype, datum1 points into the "tuple" storage.
140  *
141  * There is one special case: when the sort support infrastructure provides an
142  * "abbreviated key" representation, where the key is (typically) a pass by
143  * value proxy for a pass by reference type. In this case, the abbreviated key
144  * is stored in datum1 in place of the actual first key column.
145  *
146  * When sorting single Datums, the data value is represented directly by
147  * datum1/isnull1 for pass by value types (or null values). If the datatype is
148  * pass-by-reference and isnull1 is false, then "tuple" points to a separately
149  * palloc'd data value, otherwise "tuple" is NULL. The value of datum1 is then
150  * either the same pointer as "tuple", or is an abbreviated key value as
151  * described above. Accordingly, "tuple" is always used in preference to
152  * datum1 as the authoritative value for pass-by-reference cases.
153  *
154  * tupindex holds the input tape number that each tuple in the heap was read
155  * from during merge passes.
156  */
157 typedef struct
158 {
159  void *tuple; /* the tuple itself */
160  Datum datum1; /* value of first key column */
161  bool isnull1; /* is first key column NULL? */
162  int tupindex; /* see notes above */
163 } SortTuple;
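
As an illustrative aside (a sketch modeled on the comparetup_* routines later in this file, not a verbatim excerpt): a comparator typically tries the cached datum1/isnull1 pair first and only falls back to examining the tuple proper when the first keys compare equal.

    /* Sketch: fast path on the cached first key, slow path on the full tuple */
    static int
    comparetup_sketch(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
    {
        SortSupport sortKey = state->sortKeys;
        int         compare;

        /* datum1/isnull1 were filled in by copytup when the tuple arrived */
        compare = ApplySortComparator(a->datum1, a->isnull1,
                                      b->datum1, b->isnull1,
                                      sortKey);
        if (compare != 0)
            return compare;

        /* first keys (or abbreviated keys) tie: examine remaining columns */
        /* ... heap_getattr()/index_getattr() on a->tuple and b->tuple ... */
        return 0;
    }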
164 
165 /*
166  * During merge, we use a pre-allocated set of fixed-size slots to hold
167  * tuples, to avoid palloc/pfree overhead.
168  *
169  * Merge doesn't require a lot of memory, so we can afford to waste some,
170  * by using gratuitously-sized slots. If a tuple is larger than 1 kB, the
171  * palloc() overhead is not significant anymore.
172  *
173  * 'nextfree' is valid when this chunk is in the free list. When in use, the
174  * slot holds a tuple.
175  */
176 #define SLAB_SLOT_SIZE 1024
177 
178 typedef union SlabSlot
179 {
180  union SlabSlot *nextfree;
181  char buffer[SLAB_SLOT_SIZE];
182 } SlabSlot;
183 
184 /*
185  * Possible states of a Tuplesort object. These denote the states that
186  * persist between calls of Tuplesort routines.
187  */
188 typedef enum
189 {
190  TSS_INITIAL, /* Loading tuples; still within memory limit */
191  TSS_BOUNDED, /* Loading tuples into bounded-size heap */
192  TSS_BUILDRUNS, /* Loading tuples; writing to tape */
193  TSS_SORTEDINMEM, /* Sort completed entirely in memory */
194  TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */
195  TSS_FINALMERGE /* Performing final merge on-the-fly */
196 } TupSortStatus;
197 
198 /*
199  * Parameters for calculation of number of tapes to use --- see inittapes()
200  * and tuplesort_merge_order().
201  *
202  * In this calculation we assume that each tape will cost us about one block's
203  * worth of buffer space. This ignores the overhead of all the other data
204  * structures needed for each tape, but it's probably close enough.
205  *
206  * MERGE_BUFFER_SIZE is how much data we'd like to read from each input
207  * tape during a preread cycle (see discussion at top of file).
208  */
209 #define MINORDER 6 /* minimum merge order */
210 #define MAXORDER 500 /* maximum merge order */
211 #define TAPE_BUFFER_OVERHEAD BLCKSZ
212 #define MERGE_BUFFER_SIZE (BLCKSZ * 32)
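
As a rough, simplified sketch of how these constants bound the merge order (the authoritative calculation is in tuplesort_merge_order(), which differs in details): each input tape needs about MERGE_BUFFER_SIZE of preread data plus about TAPE_BUFFER_OVERHEAD of bookkeeping, so the affordable fan-in is roughly workMem divided by that per-tape cost, clamped to [MINORDER, MAXORDER].

    /* Simplified sketch, not the function used by this file */
    static int
    merge_order_sketch(int64 allowedMem)
    {
        /* e.g. 4 MB work_mem with 8 kB blocks: 4 MB / (256 kB + 8 kB) ~= 15 */
        int mOrder = (int) (allowedMem / (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD));

        return Min(Max(mOrder, MINORDER), MAXORDER);
    }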
213 
214 typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b,
215  Tuplesortstate *state);
216 
217 /*
218  * Private state of a Tuplesort operation.
219  */
220 struct Tuplesortstate
221 {
222  TupSortStatus status; /* enumerated value as shown above */
223  int nKeys; /* number of columns in sort key */
224  bool randomAccess; /* did caller request random access? */
225  bool bounded; /* did caller specify a maximum number of
226  * tuples to return? */
227  bool boundUsed; /* true if we made use of a bounded heap */
228  int bound; /* if bounded, the maximum number of tuples */
229  bool tuples; /* Can SortTuple.tuple ever be set? */
230  int64 availMem; /* remaining memory available, in bytes */
231  int64 allowedMem; /* total memory allowed, in bytes */
232  int maxTapes; /* number of tapes (Knuth's T) */
233  int tapeRange; /* maxTapes-1 (Knuth's P) */
234  MemoryContext sortcontext; /* memory context holding most sort data */
235  MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */
236  LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */
237 
238  /*
239  * These function pointers decouple the routines that must know what kind
240  * of tuple we are sorting from the routines that don't need to know it.
241  * They are set up by the tuplesort_begin_xxx routines.
242  *
243  * Function to compare two tuples; result is per qsort() convention, ie:
244  * <0, 0, >0 according as a<b, a=b, a>b. The API must match
245  * qsort_arg_comparator.
246  */
247  SortTupleComparator comparetup;
248 
249  /*
250  * Function to copy a supplied input tuple into palloc'd space and set up
251  * its SortTuple representation (ie, set tuple/datum1/isnull1). Also,
252  * state->availMem must be decreased by the amount of space used for the
253  * tuple copy (note the SortTuple struct itself is not counted).
254  */
255  void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup);
256 
257  /*
258  * Function to write a stored tuple onto tape. The representation of the
259  * tuple on tape need not be the same as it is in memory; requirements on
260  * the tape representation are given below. Unless the slab allocator is
261  * used, after writing the tuple, pfree() the out-of-line data (not the
262  * SortTuple struct!), and increase state->availMem by the amount of
263  * memory space thereby released.
264  */
265  void (*writetup) (Tuplesortstate *state, int tapenum,
266  SortTuple *stup);
267 
268  /*
269  * Function to read a stored tuple from tape back into memory. 'len' is
270  * the already-read length of the stored tuple. The tuple is allocated
271  * from the slab memory arena, or is palloc'd, see readtup_alloc().
272  */
273  void (*readtup) (Tuplesortstate *state, SortTuple *stup,
274  int tapenum, unsigned int len);
275 
276  /*
277  * This array holds the tuples now in sort memory. If we are in state
278  * INITIAL, the tuples are in no particular order; if we are in state
279  * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS
280  * and FINALMERGE, the tuples are organized in "heap" order per Algorithm
281  * H. In state SORTEDONTAPE, the array is not used.
282  */
283  SortTuple *memtuples; /* array of SortTuple structs */
284  int memtupcount; /* number of tuples currently present */
285  int memtupsize; /* allocated length of memtuples array */
286  bool growmemtuples; /* memtuples' growth still underway? */
287 
288  /*
289  * Memory for tuples is sometimes allocated using a simple slab allocator,
290  * rather than with palloc(). Currently, we switch to slab allocation
291  * when we start merging. Merging only needs to keep a small, fixed
292  * number of tuples in memory at any time, so we can avoid the
293  * palloc/pfree overhead by recycling a fixed number of fixed-size slots
294  * to hold the tuples.
295  *
296  * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE
297  * slots. The allocation is sized to have one slot per tape, plus one
298  * additional slot. We need that many slots to hold all the tuples kept
299  * in the heap during merge, plus the one we have last returned from the
300  * sort, with tuplesort_gettuple.
301  *
302  * Initially, all the slots are kept in a linked list of free slots. When
303  * a tuple is read from a tape, it is put to the next available slot, if
304  * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd
305  * instead.
306  *
307  * When we're done processing a tuple, we return the slot back to the free
308  * list, or pfree() if it was palloc'd. We know that a tuple was
309  * allocated from the slab, if its pointer value is between
310  * slabMemoryBegin and -End.
311  *
312  * When the slab allocator is used, the USEMEM/LACKMEM mechanism of
313  * tracking memory usage is not used.
314  */
315  bool slabAllocatorUsed;
316 
317  char *slabMemoryBegin; /* beginning of slab memory arena */
318  char *slabMemoryEnd; /* end of slab memory arena */
319  SlabSlot *slabFreeHead; /* head of free list */
320 
321  /* Buffer size to use for reading input tapes, during merge. */
322  size_t read_buffer_size;
323 
324  /*
325  * When we return a tuple to the caller in tuplesort_gettuple_XXX, that
326  * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE
327  * modes), we remember the tuple in 'lastReturnedTuple', so that we can
328  * recycle the memory on next gettuple call.
329  */
330  void *lastReturnedTuple;
331 
332  /*
333  * While building initial runs, this is the current output run number.
334  * Afterwards, it is the number of initial runs we made.
335  */
336  int currentRun;
337 
338  /*
339  * Unless otherwise noted, all pointer variables below are pointers to
340  * arrays of length maxTapes, holding per-tape data.
341  */
342 
343  /*
344  * This variable is only used during merge passes. mergeactive[i] is true
345  * if we are reading an input run from (actual) tape number i and have not
346  * yet exhausted that run.
347  */
348  bool *mergeactive; /* active input run source? */
349 
350  /*
351  * Variables for Algorithm D. Note that destTape is a "logical" tape
352  * number, ie, an index into the tp_xxx[] arrays. Be careful to keep
353  * "logical" and "actual" tape numbers straight!
354  */
355  int Level; /* Knuth's l */
356  int destTape; /* current output tape (Knuth's j, less 1) */
357  int *tp_fib; /* Target Fibonacci run counts (A[]) */
358  int *tp_runs; /* # of real runs on each tape */
359  int *tp_dummy; /* # of dummy runs for each tape (D[]) */
360  int *tp_tapenum; /* Actual tape numbers (TAPE[]) */
361  int activeTapes; /* # of active input tapes in merge pass */
362 
363  /*
364  * These variables are used after completion of sorting to keep track of
365  * the next tuple to return. (In the tape case, the tape's current read
366  * position is also critical state.)
367  */
368  int result_tape; /* actual tape number of finished output */
369  int current; /* array index (only used if SORTEDINMEM) */
370  bool eof_reached; /* reached EOF (needed for cursors) */
371 
372  /* markpos_xxx holds marked position for mark and restore */
373  long markpos_block; /* tape block# (only used if SORTEDONTAPE) */
374  int markpos_offset; /* saved "current", or offset in tape block */
375  bool markpos_eof; /* saved "eof_reached" */
376 
377  /*
378  * The sortKeys variable is used by every case other than the hash index
379  * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the
380  * MinimalTuple and CLUSTER routines, though.
381  */
382  TupleDesc tupDesc;
383  SortSupport sortKeys; /* array of length nKeys */
384 
385  /*
386  * This variable is shared by the single-key MinimalTuple case and the
387  * Datum case (which both use qsort_ssup()). Otherwise it's NULL.
388  */
389  SortSupport onlyKey;
390 
391  /*
392  * Additional state for managing "abbreviated key" sortsupport routines
393  * (which currently may be used by all cases except the hash index case).
394  * Tracks the intervals at which the optimization's effectiveness is
395  * tested.
396  */
397  int64 abbrevNext; /* Tuple # at which to next check
398  * applicability */
399 
400  /*
401  * These variables are specific to the CLUSTER case; they are set by
402  * tuplesort_begin_cluster.
403  */
404  IndexInfo *indexInfo; /* info about index being used for reference */
405  EState *estate; /* for evaluating index expressions */
406 
407  /*
408  * These variables are specific to the IndexTuple case; they are set by
409  * tuplesort_begin_index_xxx and used only by the IndexTuple routines.
410  */
411  Relation heapRel; /* table the index is being built on */
412  Relation indexRel; /* index being built */
413 
414  /* These are specific to the index_btree subcase: */
415  bool enforceUnique; /* complain if we find duplicate tuples */
416 
417  /* These are specific to the index_hash subcase: */
418  uint32 high_mask; /* masks for sortable part of hash code */
419  uint32 low_mask;
420  uint32 max_buckets;
421 
422  /*
423  * These variables are specific to the Datum case; they are set by
424  * tuplesort_begin_datum and used only by the DatumTuple routines.
425  */
426  Oid datumType;
427  /* we need typelen in order to know how to copy the Datums. */
428  int datumTypeLen;
429 
430  /*
431  * Resource snapshot for time of sort start.
432  */
433 #ifdef TRACE_SORT
434  PGRUsage ru_start;
435 #endif
436 };
437 
438 /*
439  * Is the given tuple allocated from the slab memory arena?
440  */
441 #define IS_SLAB_SLOT(state, tuple) \
442  ((char *) (tuple) >= (state)->slabMemoryBegin && \
443  (char *) (tuple) < (state)->slabMemoryEnd)
444 
445 /*
446  * Return the given tuple to the slab memory free list, or free it
447  * if it was palloc'd.
448  */
449 #define RELEASE_SLAB_SLOT(state, tuple) \
450  do { \
451  SlabSlot *buf = (SlabSlot *) tuple; \
452  \
453  if (IS_SLAB_SLOT((state), buf)) \
454  { \
455  buf->nextfree = (state)->slabFreeHead; \
456  (state)->slabFreeHead = buf; \
457  } else \
458  pfree(buf); \
459  } while(0)
460 
461 #define COMPARETUP(state,a,b) ((*(state)->comparetup) (a, b, state))
462 #define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup))
463 #define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup))
464 #define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len))
465 #define LACKMEM(state) ((state)->availMem < 0 && !(state)->slabAllocatorUsed)
466 #define USEMEM(state,amt) ((state)->availMem -= (amt))
467 #define FREEMEM(state,amt) ((state)->availMem += (amt))
468 
469 /*
470  * NOTES about on-tape representation of tuples:
471  *
472  * We require the first "unsigned int" of a stored tuple to be the total size
473  * on-tape of the tuple, including itself (so it is never zero; an all-zero
474  * unsigned int is used to delimit runs). The remainder of the stored tuple
475  * may or may not match the in-memory representation of the tuple ---
476  * any conversion needed is the job of the writetup and readtup routines.
477  *
478  * If state->randomAccess is true, then the stored representation of the
479  * tuple must be followed by another "unsigned int" that is a copy of the
480  * length --- so the total tape space used is actually sizeof(unsigned int)
481  * more than the stored length value. This allows read-backwards. When
482  * randomAccess is not true, the write/read routines may omit the extra
483  * length word.
484  *
485  * writetup is expected to write both length words as well as the tuple
486  * data. When readtup is called, the tape is positioned just after the
487  * front length word; readtup must read the tuple data and advance past
488  * the back length word (if present).
489  *
490  * The write/read routines can make use of the tuple description data
491  * stored in the Tuplesortstate record, if needed. They are also expected
492  * to adjust state->availMem by the amount of memory space (not tape space!)
493  * released or consumed. There is no error return from either writetup
494  * or readtup; they should ereport() on failure.
495  *
496  *
497  * NOTES about memory consumption calculations:
498  *
499  * We count space allocated for tuples against the workMem limit, plus
500  * the space used by the variable-size memtuples array. Fixed-size space
501  * is not counted; it's small enough to not be interesting.
502  *
503  * Note that we count actual space used (as shown by GetMemoryChunkSpace)
504  * rather than the originally-requested size. This is important since
505  * palloc can add substantial overhead. It's not a complete answer since
506  * we won't count any wasted space in palloc allocation blocks, but it's
507  * a lot better than what we were doing before 7.3. As of 9.6, a
508  * separate memory context is used for caller passed tuples. Resetting
509  * it at certain key increments significantly ameliorates fragmentation.
510  * Note that this places a responsibility on readtup and copytup routines
511  * to use the right memory context for these tuples (and to not use the
512  * reset context for anything whose lifetime needs to span multiple
513  * external sort runs).
514  */
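
To make the length-word convention concrete, here is a hedged sketch of a matching writetup/readtup pair (modeled on the writetup_*/readtup_* routines later in this file; the my_tuple_body()/my_tuple_body_size() helpers are hypothetical placeholders, and the availMem bookkeeping and pfree of the written tuple are omitted):

    static void
    writetup_sketch(Tuplesortstate *state, int tapenum, SortTuple *stup)
    {
        /* total on-tape size includes the leading length word itself */
        unsigned int tuplen = my_tuple_body_size(stup) + sizeof(unsigned int);

        LogicalTapeWrite(state->tapeset, tapenum, &tuplen, sizeof(tuplen));
        LogicalTapeWrite(state->tapeset, tapenum, my_tuple_body(stup),
                         tuplen - sizeof(unsigned int));
        if (state->randomAccess)    /* trailing copy allows reading backwards */
            LogicalTapeWrite(state->tapeset, tapenum, &tuplen, sizeof(tuplen));
    }

    static void
    readtup_sketch(Tuplesortstate *state, SortTuple *stup,
                   int tapenum, unsigned int len)
    {
        /* 'len' is the leading length word, already consumed by getlen() */
        unsigned int bodylen = len - sizeof(unsigned int);
        void        *body = readtup_alloc(state, bodylen);

        LogicalTapeReadExact(state->tapeset, tapenum, body, bodylen);
        stup->tuple = body;
        if (state->randomAccess)    /* skip the trailing length word */
            LogicalTapeReadExact(state->tapeset, tapenum, &len, sizeof(len));
    }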
515 
516 /* When using this macro, beware of double evaluation of len */
517 #define LogicalTapeReadExact(tapeset, tapenum, ptr, len) \
518  do { \
519  if (LogicalTapeRead(tapeset, tapenum, ptr, len) != (size_t) (len)) \
520  elog(ERROR, "unexpected end of data"); \
521  } while(0)
522 
523 
524 static Tuplesortstate *tuplesort_begin_common(int workMem, bool randomAccess);
525 static void puttuple_common(Tuplesortstate *state, SortTuple *tuple);
526 static bool consider_abort_common(Tuplesortstate *state);
527 static void inittapes(Tuplesortstate *state);
528 static void selectnewtape(Tuplesortstate *state);
529 static void init_slab_allocator(Tuplesortstate *state, int numSlots);
530 static void mergeruns(Tuplesortstate *state);
531 static void mergeonerun(Tuplesortstate *state);
532 static void beginmerge(Tuplesortstate *state);
533 static bool mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup);
534 static void dumptuples(Tuplesortstate *state, bool alltuples);
542 static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK);
543 static void markrunend(Tuplesortstate *state, int tapenum);
544 static void *readtup_alloc(Tuplesortstate *state, Size tuplen);
545 static int comparetup_heap(const SortTuple *a, const SortTuple *b,
546  Tuplesortstate *state);
547 static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup);
548 static void writetup_heap(Tuplesortstate *state, int tapenum,
549  SortTuple *stup);
550 static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
551  int tapenum, unsigned int len);
552 static int comparetup_cluster(const SortTuple *a, const SortTuple *b,
553  Tuplesortstate *state);
554 static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup);
555 static void writetup_cluster(Tuplesortstate *state, int tapenum,
556  SortTuple *stup);
557 static void readtup_cluster(Tuplesortstate *state, SortTuple *stup,
558  int tapenum, unsigned int len);
559 static int comparetup_index_btree(const SortTuple *a, const SortTuple *b,
560  Tuplesortstate *state);
561 static int comparetup_index_hash(const SortTuple *a, const SortTuple *b,
562  Tuplesortstate *state);
563 static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup);
564 static void writetup_index(Tuplesortstate *state, int tapenum,
565  SortTuple *stup);
566 static void readtup_index(Tuplesortstate *state, SortTuple *stup,
567  int tapenum, unsigned int len);
568 static int comparetup_datum(const SortTuple *a, const SortTuple *b,
569  Tuplesortstate *state);
570 static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup);
571 static void writetup_datum(Tuplesortstate *state, int tapenum,
572  SortTuple *stup);
573 static void readtup_datum(Tuplesortstate *state, SortTuple *stup,
574  int tapenum, unsigned int len);
575 static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup);
576 
577 /*
578  * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts
579  * any variant of SortTuples, using the appropriate comparetup function.
580  * qsort_ssup() is specialized for the case where the comparetup function
581  * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts
582  * and Datum sorts.
583  */
584 #include "qsort_tuple.c"
585 
586 
587 /*
588  * tuplesort_begin_xxx
589  *
590  * Initialize for a tuple sort operation.
591  *
592  * After calling tuplesort_begin, the caller should call tuplesort_putXXX
593  * zero or more times, then call tuplesort_performsort when all the tuples
594  * have been supplied. After performsort, retrieve the tuples in sorted
595  * order by calling tuplesort_getXXX until it returns false/NULL. (If random
596  * access was requested, rescan, markpos, and restorepos can also be called.)
597  * Call tuplesort_end to terminate the operation and release memory/disk space.
598  *
599  * Each variant of tuplesort_begin has a workMem parameter specifying the
600  * maximum number of kilobytes of RAM to use before spilling data to disk.
601  * (The normal value of this parameter is work_mem, but some callers use
602  * other values.) Each variant also has a randomAccess parameter specifying
603  * whether the caller needs non-sequential access to the sort result.
604  */
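
A minimal caller-side sketch of that protocol (illustrative only; fetch_next_slot() is a hypothetical input source, and error handling is omitted):

    Tuplesortstate *sortstate;
    TupleTableSlot *slot;

    sortstate = tuplesort_begin_heap(tupDesc, nkeys, attNums,
                                     sortOperators, sortCollations,
                                     nullsFirstFlags, work_mem,
                                     false);    /* no random access needed */

    while ((slot = fetch_next_slot()) != NULL)
        tuplesort_puttupleslot(sortstate, slot);

    tuplesort_performsort(sortstate);

    /* then fetch results with tuplesort_gettupleslot() until it returns false */

    tuplesort_end(sortstate);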
605 
606 static Tuplesortstate *
607 tuplesort_begin_common(int workMem, bool randomAccess)
608 {
609  Tuplesortstate *state;
610  MemoryContext sortcontext;
611  MemoryContext tuplecontext;
612  MemoryContext oldcontext;
613 
614  /*
615  * Create a working memory context for this sort operation. All data
616  * needed by the sort will live inside this context.
617  */
618  sortcontext = AllocSetContextCreate(CurrentMemoryContext,
619  "TupleSort main",
620  ALLOCSET_DEFAULT_SIZES);
621 
622  /*
623  * Caller tuple (e.g. IndexTuple) memory context.
624  *
625  * A dedicated child context used exclusively for caller passed tuples
626  * eases memory management. Resetting at key points reduces
627  * fragmentation. Note that the memtuples array of SortTuples is allocated
628  * in the parent context, not this context, because there is no need to
629  * free memtuples early.
630  */
631  tuplecontext = AllocSetContextCreate(sortcontext,
632  "Caller tuples",
633  ALLOCSET_DEFAULT_SIZES);
634 
635  /*
636  * Make the Tuplesortstate within the per-sort context. This way, we
637  * don't need a separate pfree() operation for it at shutdown.
638  */
639  oldcontext = MemoryContextSwitchTo(sortcontext);
640 
641  state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate));
642 
643 #ifdef TRACE_SORT
644  if (trace_sort)
645  pg_rusage_init(&state->ru_start);
646 #endif
647 
648  state->status = TSS_INITIAL;
649  state->randomAccess = randomAccess;
650  state->bounded = false;
651  state->tuples = true;
652  state->boundUsed = false;
653  state->allowedMem = workMem * (int64) 1024;
654  state->availMem = state->allowedMem;
655  state->sortcontext = sortcontext;
656  state->tuplecontext = tuplecontext;
657  state->tapeset = NULL;
658 
659  state->memtupcount = 0;
660 
661  /*
662  * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
663  * see comments in grow_memtuples().
664  */
665  state->memtupsize = Max(1024,
666  ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1);
667 
668  state->growmemtuples = true;
669  state->slabAllocatorUsed = false;
670  state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple));
671 
672  USEMEM(state, GetMemoryChunkSpace(state->memtuples));
673 
674  /* workMem must be large enough for the minimal memtuples array */
675  if (LACKMEM(state))
676  elog(ERROR, "insufficient memory allowed for sort");
677 
678  state->currentRun = 0;
679 
680  /*
681  * maxTapes, tapeRange, and Algorithm D variables will be initialized by
682  * inittapes(), if needed
683  */
684 
685  state->result_tape = -1; /* flag that result tape has not been formed */
686 
687  MemoryContextSwitchTo(oldcontext);
688 
689  return state;
690 }
691 
692 Tuplesortstate *
693 tuplesort_begin_heap(TupleDesc tupDesc,
694  int nkeys, AttrNumber *attNums,
695  Oid *sortOperators, Oid *sortCollations,
696  bool *nullsFirstFlags,
697  int workMem, bool randomAccess)
698 {
699  Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
700  MemoryContext oldcontext;
701  int i;
702 
703  oldcontext = MemoryContextSwitchTo(state->sortcontext);
704 
705  AssertArg(nkeys > 0);
706 
707 #ifdef TRACE_SORT
708  if (trace_sort)
709  elog(LOG,
710  "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
711  nkeys, workMem, randomAccess ? 't' : 'f');
712 #endif
713 
714  state->nKeys = nkeys;
715 
716  TRACE_POSTGRESQL_SORT_START(HEAP_SORT,
717  false, /* no unique check */
718  nkeys,
719  workMem,
720  randomAccess);
721 
722  state->comparetup = comparetup_heap;
723  state->copytup = copytup_heap;
724  state->writetup = writetup_heap;
725  state->readtup = readtup_heap;
726 
727  state->tupDesc = tupDesc; /* assume we need not copy tupDesc */
728  state->abbrevNext = 10;
729 
730  /* Prepare SortSupport data for each column */
731  state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));
732 
733  for (i = 0; i < nkeys; i++)
734  {
735  SortSupport sortKey = state->sortKeys + i;
736 
737  AssertArg(attNums[i] != 0);
738  AssertArg(sortOperators[i] != 0);
739 
740  sortKey->ssup_cxt = CurrentMemoryContext;
741  sortKey->ssup_collation = sortCollations[i];
742  sortKey->ssup_nulls_first = nullsFirstFlags[i];
743  sortKey->ssup_attno = attNums[i];
744  /* Convey if abbreviation optimization is applicable in principle */
745  sortKey->abbreviate = (i == 0);
746 
747  PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
748  }
749 
750  /*
751  * The "onlyKey" optimization cannot be used with abbreviated keys, since
752  * tie-breaker comparisons may be required. Typically, the optimization
753  * is only of value to pass-by-value types anyway, whereas abbreviated
754  * keys are typically only of value to pass-by-reference types.
755  */
756  if (nkeys == 1 && !state->sortKeys->abbrev_converter)
757  state->onlyKey = state->sortKeys;
758 
759  MemoryContextSwitchTo(oldcontext);
760 
761  return state;
762 }
763 
764 Tuplesortstate *
765 tuplesort_begin_cluster(TupleDesc tupDesc,
766  Relation indexRel,
767  int workMem, bool randomAccess)
768 {
769  Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
770  ScanKey indexScanKey;
771  MemoryContext oldcontext;
772  int i;
773 
774  Assert(indexRel->rd_rel->relam == BTREE_AM_OID);
775 
776  oldcontext = MemoryContextSwitchTo(state->sortcontext);
777 
778 #ifdef TRACE_SORT
779  if (trace_sort)
780  elog(LOG,
781  "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
782  RelationGetNumberOfAttributes(indexRel),
783  workMem, randomAccess ? 't' : 'f');
784 #endif
785 
786  state->nKeys = RelationGetNumberOfAttributes(indexRel);
787 
788  TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT,
789  false, /* no unique check */
790  state->nKeys,
791  workMem,
792  randomAccess);
793 
794  state->comparetup = comparetup_cluster;
795  state->copytup = copytup_cluster;
796  state->writetup = writetup_cluster;
797  state->readtup = readtup_cluster;
798  state->abbrevNext = 10;
799 
800  state->indexInfo = BuildIndexInfo(indexRel);
801 
802  state->tupDesc = tupDesc; /* assume we need not copy tupDesc */
803 
804  indexScanKey = _bt_mkscankey_nodata(indexRel);
805 
806  if (state->indexInfo->ii_Expressions != NULL)
807  {
808  TupleTableSlot *slot;
809  ExprContext *econtext;
810 
811  /*
812  * We will need to use FormIndexDatum to evaluate the index
813  * expressions. To do that, we need an EState, as well as a
814  * TupleTableSlot to put the table tuples into. The econtext's
815  * scantuple has to point to that slot, too.
816  */
817  state->estate = CreateExecutorState();
818  slot = MakeSingleTupleTableSlot(tupDesc);
819  econtext = GetPerTupleExprContext(state->estate);
820  econtext->ecxt_scantuple = slot;
821  }
822 
823  /* Prepare SortSupport data for each column */
824  state->sortKeys = (SortSupport) palloc0(state->nKeys *
825  sizeof(SortSupportData));
826 
827  for (i = 0; i < state->nKeys; i++)
828  {
829  SortSupport sortKey = state->sortKeys + i;
830  ScanKey scanKey = indexScanKey + i;
831  int16 strategy;
832 
833  sortKey->ssup_cxt = CurrentMemoryContext;
834  sortKey->ssup_collation = scanKey->sk_collation;
835  sortKey->ssup_nulls_first =
836  (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
837  sortKey->ssup_attno = scanKey->sk_attno;
838  /* Convey if abbreviation optimization is applicable in principle */
839  sortKey->abbreviate = (i == 0);
840 
841  AssertState(sortKey->ssup_attno != 0);
842 
843  strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
844  BTGreaterStrategyNumber : BTLessStrategyNumber;
845 
846  PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey);
847  }
848 
849  _bt_freeskey(indexScanKey);
850 
851  MemoryContextSwitchTo(oldcontext);
852 
853  return state;
854 }
855 
856 Tuplesortstate *
857 tuplesort_begin_index_btree(Relation heapRel,
858  Relation indexRel,
859  bool enforceUnique,
860  int workMem, bool randomAccess)
861 {
862  Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
863  ScanKey indexScanKey;
864  MemoryContext oldcontext;
865  int i;
866 
867  oldcontext = MemoryContextSwitchTo(state->sortcontext);
868 
869 #ifdef TRACE_SORT
870  if (trace_sort)
871  elog(LOG,
872  "begin index sort: unique = %c, workMem = %d, randomAccess = %c",
873  enforceUnique ? 't' : 'f',
874  workMem, randomAccess ? 't' : 'f');
875 #endif
876 
877  state->nKeys = RelationGetNumberOfAttributes(indexRel);
878 
879  TRACE_POSTGRESQL_SORT_START(INDEX_SORT,
880  enforceUnique,
881  state->nKeys,
882  workMem,
883  randomAccess);
884 
885  state->comparetup = comparetup_index_btree;
886  state->copytup = copytup_index;
887  state->writetup = writetup_index;
888  state->readtup = readtup_index;
889  state->abbrevNext = 10;
890 
891  state->heapRel = heapRel;
892  state->indexRel = indexRel;
893  state->enforceUnique = enforceUnique;
894 
895  indexScanKey = _bt_mkscankey_nodata(indexRel);
896  state->nKeys = RelationGetNumberOfAttributes(indexRel);
897 
898  /* Prepare SortSupport data for each column */
899  state->sortKeys = (SortSupport) palloc0(state->nKeys *
900  sizeof(SortSupportData));
901 
902  for (i = 0; i < state->nKeys; i++)
903  {
904  SortSupport sortKey = state->sortKeys + i;
905  ScanKey scanKey = indexScanKey + i;
906  int16 strategy;
907 
908  sortKey->ssup_cxt = CurrentMemoryContext;
909  sortKey->ssup_collation = scanKey->sk_collation;
910  sortKey->ssup_nulls_first =
911  (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
912  sortKey->ssup_attno = scanKey->sk_attno;
913  /* Convey if abbreviation optimization is applicable in principle */
914  sortKey->abbreviate = (i == 0);
915 
916  AssertState(sortKey->ssup_attno != 0);
917 
918  strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
919  BTGreaterStrategyNumber : BTLessStrategyNumber;
920 
921  PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey);
922  }
923 
924  _bt_freeskey(indexScanKey);
925 
926  MemoryContextSwitchTo(oldcontext);
927 
928  return state;
929 }
930 
931 Tuplesortstate *
932 tuplesort_begin_index_hash(Relation heapRel,
933  Relation indexRel,
934  uint32 high_mask,
935  uint32 low_mask,
936  uint32 max_buckets,
937  int workMem, bool randomAccess)
938 {
939  Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
940  MemoryContext oldcontext;
941 
942  oldcontext = MemoryContextSwitchTo(state->sortcontext);
943 
944 #ifdef TRACE_SORT
945  if (trace_sort)
946  elog(LOG,
947  "begin index sort: high_mask = 0x%x, low_mask = 0x%x, "
948  "max_buckets = 0x%x, workMem = %d, randomAccess = %c",
949  high_mask,
950  low_mask,
951  max_buckets,
952  workMem, randomAccess ? 't' : 'f');
953 #endif
954 
955  state->nKeys = 1; /* Only one sort column, the hash code */
956 
957  state->comparetup = comparetup_index_hash;
958  state->copytup = copytup_index;
959  state->writetup = writetup_index;
960  state->readtup = readtup_index;
961 
962  state->heapRel = heapRel;
963  state->indexRel = indexRel;
964 
965  state->high_mask = high_mask;
966  state->low_mask = low_mask;
967  state->max_buckets = max_buckets;
968 
969  MemoryContextSwitchTo(oldcontext);
970 
971  return state;
972 }
973 
974 Tuplesortstate *
975 tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation,
976  bool nullsFirstFlag,
977  int workMem, bool randomAccess)
978 {
979  Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
980  MemoryContext oldcontext;
981  int16 typlen;
982  bool typbyval;
983 
984  oldcontext = MemoryContextSwitchTo(state->sortcontext);
985 
986 #ifdef TRACE_SORT
987  if (trace_sort)
988  elog(LOG,
989  "begin datum sort: workMem = %d, randomAccess = %c",
990  workMem, randomAccess ? 't' : 'f');
991 #endif
992 
993  state->nKeys = 1; /* always a one-column sort */
994 
995  TRACE_POSTGRESQL_SORT_START(DATUM_SORT,
996  false, /* no unique check */
997  1,
998  workMem,
999  randomAccess);
1000 
1001  state->comparetup = comparetup_datum;
1002  state->copytup = copytup_datum;
1003  state->writetup = writetup_datum;
1004  state->readtup = readtup_datum;
1005  state->abbrevNext = 10;
1006 
1007  state->datumType = datumType;
1008 
1009  /* lookup necessary attributes of the datum type */
1010  get_typlenbyval(datumType, &typlen, &typbyval);
1011  state->datumTypeLen = typlen;
1012  state->tuples = !typbyval;
1013 
1014  /* Prepare SortSupport data */
1015  state->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData));
1016 
1017  state->sortKeys->ssup_cxt = CurrentMemoryContext;
1018  state->sortKeys->ssup_collation = sortCollation;
1019  state->sortKeys->ssup_nulls_first = nullsFirstFlag;
1020 
1021  /*
1022  * Abbreviation is possible here only for by-reference types. In theory,
1023  * a pass-by-value datatype could have an abbreviated form that is cheaper
1024  * to compare. In a tuple sort, we could support that, because we can
1025  * always extract the original datum from the tuple as needed. Here, we
1026  * can't, because a datum sort only stores a single copy of the datum; the
1027  * "tuple" field of each sortTuple is NULL.
1028  */
1029  state->sortKeys->abbreviate = !typbyval;
1030 
1031  PrepareSortSupportFromOrderingOp(sortOperator, state->sortKeys);
1032 
1033  /*
1034  * The "onlyKey" optimization cannot be used with abbreviated keys, since
1035  * tie-breaker comparisons may be required. Typically, the optimization
1036  * is only of value to pass-by-value types anyway, whereas abbreviated
1037  * keys are typically only of value to pass-by-reference types.
1038  */
1039  if (!state->sortKeys->abbrev_converter)
1040  state->onlyKey = state->sortKeys;
1041 
1042  MemoryContextSwitchTo(oldcontext);
1043 
1044  return state;
1045 }
1046 
1047 /*
1048  * tuplesort_set_bound
1049  *
1050  * Advise tuplesort that at most the first N result tuples are required.
1051  *
1052  * Must be called before inserting any tuples. (Actually, we could allow it
1053  * as long as the sort hasn't spilled to disk, but there seems no need for
1054  * delayed calls at the moment.)
1055  *
1056  * This is a hint only. The tuplesort may still return more tuples than
1057  * requested.
1058  */
1059 void
1060 tuplesort_set_bound(Tuplesortstate *state, int64 bound)
1061 {
1062  /* Assert we're called before loading any tuples */
1063  Assert(state->status == TSS_INITIAL);
1064  Assert(state->memtupcount == 0);
1065  Assert(!state->bounded);
1066 
1067 #ifdef DEBUG_BOUNDED_SORT
1068  /* Honor GUC setting that disables the feature (for easy testing) */
1069  if (!optimize_bounded_sort)
1070  return;
1071 #endif
1072 
1073  /* We want to be able to compute bound * 2, so limit the setting */
1074  if (bound > (int64) (INT_MAX / 2))
1075  return;
1076 
1077  state->bounded = true;
1078  state->bound = (int) bound;
1079 
1080  /*
1081  * Bounded sorts are not an effective target for abbreviated key
1082  * optimization. Disable by setting state to be consistent with no
1083  * abbreviation support.
1084  */
1085  state->sortKeys->abbrev_converter = NULL;
1086  if (state->sortKeys->abbrev_full_comparator)
1087  state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
1088 
1089  /* Not strictly necessary, but be tidy */
1090  state->sortKeys->abbrev_abort = NULL;
1091  state->sortKeys->abbrev_full_comparator = NULL;
1092 }
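
For example (a sketch; 'limit' is a hypothetical caller variable), a caller that only needs the first few tuples of an ORDER BY ... LIMIT query would hint the sort right after creating it:

    sortstate = tuplesort_begin_heap(tupDesc, nkeys, attNums,
                                     sortOperators, sortCollations,
                                     nullsFirstFlags, work_mem, false);
    tuplesort_set_bound(sortstate, limit);  /* before any tuplesort_put* call */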
1093 
1094 /*
1095  * tuplesort_end
1096  *
1097  * Release resources and clean up.
1098  *
1099  * NOTE: after calling this, any pointers returned by tuplesort_getXXX are
1100  * pointing to garbage. Be careful not to attempt to use or free such
1101  * pointers afterwards!
1102  */
1103 void
1104 tuplesort_end(Tuplesortstate *state)
1105 {
1106  /* context swap probably not needed, but let's be safe */
1107  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1108 
1109 #ifdef TRACE_SORT
1110  long spaceUsed;
1111 
1112  if (state->tapeset)
1113  spaceUsed = LogicalTapeSetBlocks(state->tapeset);
1114  else
1115  spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024;
1116 #endif
1117 
1118  /*
1119  * Delete temporary "tape" files, if any.
1120  *
1121  * Note: want to include this in reported total cost of sort, hence need
1122  * for two #ifdef TRACE_SORT sections.
1123  */
1124  if (state->tapeset)
1125  LogicalTapeSetClose(state->tapeset);
1126 
1127 #ifdef TRACE_SORT
1128  if (trace_sort)
1129  {
1130  if (state->tapeset)
1131  elog(LOG, "external sort ended, %ld disk blocks used: %s",
1132  spaceUsed, pg_rusage_show(&state->ru_start));
1133  else
1134  elog(LOG, "internal sort ended, %ld KB used: %s",
1135  spaceUsed, pg_rusage_show(&state->ru_start));
1136  }
1137 
1138  TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed);
1139 #else
1140 
1141  /*
1142  * If you disabled TRACE_SORT, you can still probe sort__done, but you
1143  * ain't getting space-used stats.
1144  */
1145  TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L);
1146 #endif
1147 
1148  /* Free any execution state created for CLUSTER case */
1149  if (state->estate != NULL)
1150  {
1151  ExprContext *econtext = GetPerTupleExprContext(state->estate);
1152 
1153  ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple);
1154  FreeExecutorState(state->estate);
1155  }
1156 
1157  MemoryContextSwitchTo(oldcontext);
1158 
1159  /*
1160  * Free the per-sort memory context, thereby releasing all working memory,
1161  * including the Tuplesortstate struct itself.
1162  */
1163  MemoryContextDelete(state->sortcontext);
1164 }
1165 
1166 /*
1167  * Grow the memtuples[] array, if possible within our memory constraint. We
1168  * must not exceed INT_MAX tuples in memory or the caller-provided memory
1169  * limit. Return true if we were able to enlarge the array, false if not.
1170  *
1171  * Normally, at each increment we double the size of the array. When doing
1172  * that would exceed a limit, we attempt one last, smaller increase (and then
1173  * clear the growmemtuples flag so we don't try any more). That allows us to
1174  * use memory as fully as permitted; sticking to the pure doubling rule could
1175  * result in almost half going unused. Because availMem moves around with
1176  * tuple addition/removal, we need some rule to prevent making repeated small
1177  * increases in memtupsize, which would just be useless thrashing. The
1178  * growmemtuples flag accomplishes that and also prevents useless
1179  * recalculations in this function.
1180  */
1181 static bool
1182 grow_memtuples(Tuplesortstate *state)
1183 {
1184  int newmemtupsize;
1185  int memtupsize = state->memtupsize;
1186  int64 memNowUsed = state->allowedMem - state->availMem;
1187 
1188  /* Forget it if we've already maxed out memtuples, per comment above */
1189  if (!state->growmemtuples)
1190  return false;
1191 
1192  /* Select new value of memtupsize */
1193  if (memNowUsed <= state->availMem)
1194  {
1195  /*
1196  * We've used no more than half of allowedMem; double our usage,
1197  * clamping at INT_MAX tuples.
1198  */
1199  if (memtupsize < INT_MAX / 2)
1200  newmemtupsize = memtupsize * 2;
1201  else
1202  {
1203  newmemtupsize = INT_MAX;
1204  state->growmemtuples = false;
1205  }
1206  }
1207  else
1208  {
1209  /*
1210  * This will be the last increment of memtupsize. Abandon doubling
1211  * strategy and instead increase as much as we safely can.
1212  *
1213  * To stay within allowedMem, we can't increase memtupsize by more
1214  * than availMem / sizeof(SortTuple) elements. In practice, we want
1215  * to increase it by considerably less, because we need to leave some
1216  * space for the tuples to which the new array slots will refer. We
1217  * assume the new tuples will be about the same size as the tuples
1218  * we've already seen, and thus we can extrapolate from the space
1219  * consumption so far to estimate an appropriate new size for the
1220  * memtuples array. The optimal value might be higher or lower than
1221  * this estimate, but it's hard to know that in advance. We again
1222  * clamp at INT_MAX tuples.
1223  *
1224  * This calculation is safe against enlarging the array so much that
1225  * LACKMEM becomes true, because the memory currently used includes
1226  * the present array; thus, there would be enough allowedMem for the
1227  * new array elements even if no other memory were currently used.
1228  *
1229  * We do the arithmetic in float8, because otherwise the product of
1230  * memtupsize and allowedMem could overflow. Any inaccuracy in the
1231  * result should be insignificant; but even if we computed a
1232  * completely insane result, the checks below will prevent anything
1233  * really bad from happening.
1234  */
1235  double grow_ratio;
1236 
1237  grow_ratio = (double) state->allowedMem / (double) memNowUsed;
1238  if (memtupsize * grow_ratio < INT_MAX)
1239  newmemtupsize = (int) (memtupsize * grow_ratio);
1240  else
1241  newmemtupsize = INT_MAX;
1242 
1243  /* We won't make any further enlargement attempts */
1244  state->growmemtuples = false;
1245  }
1246 
1247  /* Must enlarge array by at least one element, else report failure */
1248  if (newmemtupsize <= memtupsize)
1249  goto noalloc;
1250 
1251  /*
1252  * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize. Clamp
1253  * to ensure our request won't be rejected. Note that we can easily
1254  * exhaust address space before facing this outcome. (This is presently
1255  * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but
1256  * don't rely on that at this distance.)
1257  */
1258  if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple))
1259  {
1260  newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple));
1261  state->growmemtuples = false; /* can't grow any more */
1262  }
1263 
1264  /*
1265  * We need to be sure that we do not cause LACKMEM to become true, else
1266  * the space management algorithm will go nuts. The code above should
1267  * never generate a dangerous request, but to be safe, check explicitly
1268  * that the array growth fits within availMem. (We could still cause
1269  * LACKMEM if the memory chunk overhead associated with the memtuples
1270  * array were to increase. That shouldn't happen because we chose the
1271  * initial array size large enough to ensure that palloc will be treating
1272  * both old and new arrays as separate chunks. But we'll check LACKMEM
1273  * explicitly below just in case.)
1274  */
1275  if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple)))
1276  goto noalloc;
1277 
1278  /* OK, do it */
1279  FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
1280  state->memtupsize = newmemtupsize;
1281  state->memtuples = (SortTuple *)
1282  repalloc_huge(state->memtuples,
1283  state->memtupsize * sizeof(SortTuple));
1284  USEMEM(state, GetMemoryChunkSpace(state->memtuples));
1285  if (LACKMEM(state))
1286  elog(ERROR, "unexpected out-of-memory situation in tuplesort");
1287  return true;
1288 
1289 noalloc:
1290  /* If for any reason we didn't realloc, shut off future attempts */
1291  state->growmemtuples = false;
1292  return false;
1293 }
1294 
1295 /*
1296  * Accept one tuple while collecting input data for sort.
1297  *
1298  * Note that the input data is always copied; the caller need not save it.
1299  */
1300 void
1301 tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot)
1302 {
1303  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1304  SortTuple stup;
1305 
1306  /*
1307  * Copy the given tuple into memory we control, and decrease availMem.
1308  * Then call the common code.
1309  */
1310  COPYTUP(state, &stup, (void *) slot);
1311 
1312  puttuple_common(state, &stup);
1313 
1314  MemoryContextSwitchTo(oldcontext);
1315 }
1316 
1317 /*
1318  * Accept one tuple while collecting input data for sort.
1319  *
1320  * Note that the input data is always copied; the caller need not save it.
1321  */
1322 void
1323 tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
1324 {
1325  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1326  SortTuple stup;
1327 
1328  /*
1329  * Copy the given tuple into memory we control, and decrease availMem.
1330  * Then call the common code.
1331  */
1332  COPYTUP(state, &stup, (void *) tup);
1333 
1334  puttuple_common(state, &stup);
1335 
1336  MemoryContextSwitchTo(oldcontext);
1337 }
1338 
1339 /*
1340  * Collect one index tuple while collecting input data for sort, building
1341  * it from caller-supplied values.
1342  */
1343 void
1344 tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel,
1345  ItemPointer self, Datum *values,
1346  bool *isnull)
1347 {
1348  MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
1349  SortTuple stup;
1350  Datum original;
1351  IndexTuple tuple;
1352 
1353  stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull);
1354  tuple = ((IndexTuple) stup.tuple);
1355  tuple->t_tid = *self;
1356  USEMEM(state, GetMemoryChunkSpace(stup.tuple));
1357  /* set up first-column key value */
1358  original = index_getattr(tuple,
1359  1,
1360  RelationGetDescr(state->indexRel),
1361  &stup.isnull1);
1362 
1363  MemoryContextSwitchTo(state->sortcontext);
1364 
1365  if (!state->sortKeys || !state->sortKeys->abbrev_converter || stup.isnull1)
1366  {
1367  /*
1368  * Store ordinary Datum representation, or NULL value. If there is a
1369  * converter it won't expect NULL values, and cost model is not
1370  * required to account for NULL, so in that case we avoid calling
1371  * converter and just set datum1 to zeroed representation (to be
1372  * consistent, and to support cheap inequality tests for NULL
1373  * abbreviated keys).
1374  */
1375  stup.datum1 = original;
1376  }
1377  else if (!consider_abort_common(state))
1378  {
1379  /* Store abbreviated key representation */
1380  stup.datum1 = state->sortKeys->abbrev_converter(original,
1381  state->sortKeys);
1382  }
1383  else
1384  {
1385  /* Abort abbreviation */
1386  int i;
1387 
1388  stup.datum1 = original;
1389 
1390  /*
1391  * Set state to be consistent with never trying abbreviation.
1392  *
1393  * Alter datum1 representation in already-copied tuples, so as to
1394  * ensure a consistent representation (current tuple was just
1395  * handled). It does not matter if some dumped tuples are already
1396  * sorted on tape, since serialized tuples lack abbreviated keys
1397  * (TSS_BUILDRUNS state prevents control reaching here in any case).
1398  */
1399  for (i = 0; i < state->memtupcount; i++)
1400  {
1401  SortTuple *mtup = &state->memtuples[i];
1402 
1403  tuple = mtup->tuple;
1404  mtup->datum1 = index_getattr(tuple,
1405  1,
1406  RelationGetDescr(state->indexRel),
1407  &mtup->isnull1);
1408  }
1409  }
1410 
1411  puttuple_common(state, &stup);
1412 
1413  MemoryContextSwitchTo(oldcontext);
1414 }
1415 
1416 /*
1417  * Accept one Datum while collecting input data for sort.
1418  *
1419  * If the Datum is pass-by-ref type, the value will be copied.
1420  */
1421 void
1422 tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull)
1423 {
1424  MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
1425  SortTuple stup;
1426 
1427  /*
1428  * Pass-by-value types or null values are just stored directly in
1429  * stup.datum1 (and stup.tuple is not used and set to NULL).
1430  *
1431  * Non-null pass-by-reference values need to be copied into memory we
1432  * control, and possibly abbreviated. The copied value is pointed to by
1433  * stup.tuple and is treated as the canonical copy (e.g. to return via
1434  * tuplesort_getdatum or when writing to tape); stup.datum1 gets the
1435  * abbreviated value if abbreviation is happening, otherwise it's
1436  * identical to stup.tuple.
1437  */
1438 
1439  if (isNull || !state->tuples)
1440  {
1441  /*
1442  * Set datum1 to zeroed representation for NULLs (to be consistent,
1443  * and to support cheap inequality tests for NULL abbreviated keys).
1444  */
1445  stup.datum1 = !isNull ? val : (Datum) 0;
1446  stup.isnull1 = isNull;
1447  stup.tuple = NULL; /* no separate storage */
1448  MemoryContextSwitchTo(state->sortcontext);
1449  }
1450  else
1451  {
1452  Datum original = datumCopy(val, false, state->datumTypeLen);
1453 
1454  stup.isnull1 = false;
1455  stup.tuple = DatumGetPointer(original);
1456  USEMEM(state, GetMemoryChunkSpace(stup.tuple));
1457  MemoryContextSwitchTo(state->sortcontext);
1458 
1459  if (!state->sortKeys->abbrev_converter)
1460  {
1461  stup.datum1 = original;
1462  }
1463  else if (!consider_abort_common(state))
1464  {
1465  /* Store abbreviated key representation */
1466  stup.datum1 = state->sortKeys->abbrev_converter(original,
1467  state->sortKeys);
1468  }
1469  else
1470  {
1471  /* Abort abbreviation */
1472  int i;
1473 
1474  stup.datum1 = original;
1475 
1476  /*
1477  * Set state to be consistent with never trying abbreviation.
1478  *
1479  * Alter datum1 representation in already-copied tuples, so as to
1480  * ensure a consistent representation (current tuple was just
1481  * handled). It does not matter if some dumped tuples are already
1482  * sorted on tape, since serialized tuples lack abbreviated keys
1483  * (TSS_BUILDRUNS state prevents control reaching here in any
1484  * case).
1485  */
1486  for (i = 0; i < state->memtupcount; i++)
1487  {
1488  SortTuple *mtup = &state->memtuples[i];
1489 
1490  mtup->datum1 = PointerGetDatum(mtup->tuple);
1491  }
1492  }
1493  }
1494 
1495  puttuple_common(state, &stup);
1496 
1497  MemoryContextSwitchTo(oldcontext);
1498 }
1499 
1500 /*
1501  * Shared code for tuple and datum cases.
1502  */
1503 static void
1504 puttuple_common(Tuplesortstate *state, SortTuple *tuple)
1505 {
1506  switch (state->status)
1507  {
1508  case TSS_INITIAL:
1509 
1510  /*
1511  * Save the tuple into the unsorted array. First, grow the array
1512  * as needed. Note that we try to grow the array when there is
1513  * still one free slot remaining --- if we fail, there'll still be
1514  * room to store the incoming tuple, and then we'll switch to
1515  * tape-based operation.
1516  */
1517  if (state->memtupcount >= state->memtupsize - 1)
1518  {
1519  (void) grow_memtuples(state);
1520  Assert(state->memtupcount < state->memtupsize);
1521  }
1522  state->memtuples[state->memtupcount++] = *tuple;
1523 
1524  /*
1525  * Check if it's time to switch over to a bounded heapsort. We do
1526  * so if the input tuple count exceeds twice the desired tuple
1527  * count (this is a heuristic for where heapsort becomes cheaper
1528  * than a quicksort), or if we've just filled workMem and have
1529  * enough tuples to meet the bound.
1530  *
1531  * Note that once we enter TSS_BOUNDED state we will always try to
1532  * complete the sort that way. In the worst case, if later input
1533  * tuples are larger than earlier ones, this might cause us to
1534  * exceed workMem significantly.
1535  */
1536  if (state->bounded &&
1537  (state->memtupcount > state->bound * 2 ||
1538  (state->memtupcount > state->bound && LACKMEM(state))))
1539  {
1540 #ifdef TRACE_SORT
1541  if (trace_sort)
1542  elog(LOG, "switching to bounded heapsort at %d tuples: %s",
1543  state->memtupcount,
1544  pg_rusage_show(&state->ru_start));
1545 #endif
1546  make_bounded_heap(state);
1547  return;
1548  }
1549 
1550  /*
1551  * Done if we still fit in available memory and have array slots.
1552  */
1553  if (state->memtupcount < state->memtupsize && !LACKMEM(state))
1554  return;
1555 
1556  /*
1557  * Nope; time to switch to tape-based operation.
1558  */
1559  inittapes(state);
1560 
1561  /*
1562  * Dump all tuples.
1563  */
1564  dumptuples(state, false);
1565  break;
1566 
1567  case TSS_BOUNDED:
1568 
1569  /*
1570  * We don't want to grow the array here, so check whether the new
1571  * tuple can be discarded before putting it in. This should be a
1572  * good speed optimization, too, since when there are many more
1573  * input tuples than the bound, most input tuples can be discarded
1574  * with just this one comparison. Note that because we currently
1575  * have the sort direction reversed, we must check for <= not >=.
1576  */
1577  if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0)
1578  {
1579  /* new tuple <= top of the heap, so we can discard it */
1580  free_sort_tuple(state, tuple);
1581  CHECK_FOR_INTERRUPTS();
1582  }
1583  else
1584  {
1585  /* discard top of heap, replacing it with the new tuple */
1586  free_sort_tuple(state, &state->memtuples[0]);
1587  tuplesort_heap_replace_top(state, tuple);
1588  }
1589  break;
1590 
1591  case TSS_BUILDRUNS:
1592 
1593  /*
1594  * Save the tuple into the unsorted array (there must be space)
1595  */
1596  state->memtuples[state->memtupcount++] = *tuple;
1597 
1598  /*
1599  * If we are over the memory limit, dump all tuples.
1600  */
1601  dumptuples(state, false);
1602  break;
1603 
1604  default:
1605  elog(ERROR, "invalid tuplesort state");
1606  break;
1607  }
1608 }
1609 
1610 static bool
1611 consider_abort_common(Tuplesortstate *state)
1612 {
1613  Assert(state->sortKeys[0].abbrev_converter != NULL);
1614  Assert(state->sortKeys[0].abbrev_abort != NULL);
1615  Assert(state->sortKeys[0].abbrev_full_comparator != NULL);
1616 
1617  /*
1618  * Check effectiveness of abbreviation optimization. Consider aborting
1619  * when still within memory limit.
1620  */
1621  if (state->status == TSS_INITIAL &&
1622  state->memtupcount >= state->abbrevNext)
1623  {
1624  state->abbrevNext *= 2;
1625 
1626  /*
1627  * Check opclass-supplied abbreviation abort routine. It may indicate
1628  * that abbreviation should not proceed.
1629  */
1630  if (!state->sortKeys->abbrev_abort(state->memtupcount,
1631  state->sortKeys))
1632  return false;
1633 
1634  /*
1635  * Finally, restore authoritative comparator, and indicate that
1636  * abbreviation is not in play by setting abbrev_converter to NULL
1637  */
1638  state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator;
1639  state->sortKeys[0].abbrev_converter = NULL;
1640  /* Not strictly necessary, but be tidy */
1641  state->sortKeys[0].abbrev_abort = NULL;
1642  state->sortKeys[0].abbrev_full_comparator = NULL;
1643 
1644  /* Give up - expect original pass-by-value representation */
1645  return true;
1646  }
1647 
1648  return false;
1649 }
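/*
 * Editor's illustrative sketch (not part of tuplesort.c): the abbrevNext
 * doubling above means the effectiveness check runs at geometrically spaced
 * tuple counts, so its cost stays negligible however many tuples arrive.
 * The starting threshold and the printf stand in for the real cost model.
 */
#include <stdio.h>

int
main(void)
{
	long		next_check = 10;	/* assumed initial checkpoint */
	long		count;

	for (count = 1; count <= 1000; count++)
	{
		if (count >= next_check)
		{
			next_check *= 2;
			/* a real implementation would consult a cost model here */
			printf("would re-evaluate abbreviation at %ld tuples\n", count);
		}
	}
	return 0;
}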
1650 
1651 /*
1652  * All tuples have been provided; finish the sort.
1653  */
1654 void
1655 tuplesort_performsort(Tuplesortstate *state)
1656 {
1657  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1658 
1659 #ifdef TRACE_SORT
1660  if (trace_sort)
1661  elog(LOG, "performsort starting: %s",
1662  pg_rusage_show(&state->ru_start));
1663 #endif
1664 
1665  switch (state->status)
1666  {
1667  case TSS_INITIAL:
1668 
1669  /*
1670  * We were able to accumulate all the tuples within the allowed
1671  * amount of memory. Just qsort 'em and we're done.
1672  */
1673  tuplesort_sort_memtuples(state);
1674  state->current = 0;
1675  state->eof_reached = false;
1676  state->markpos_offset = 0;
1677  state->markpos_eof = false;
1678  state->status = TSS_SORTEDINMEM;
1679  break;
1680 
1681  case TSS_BOUNDED:
1682 
1683  /*
1684  * We were able to accumulate all the tuples required for output
1685  * in memory, using a heap to eliminate excess tuples. Now we
1686  * have to transform the heap to a properly-sorted array.
1687  */
1688  sort_bounded_heap(state);
1689  state->current = 0;
1690  state->eof_reached = false;
1691  state->markpos_offset = 0;
1692  state->markpos_eof = false;
1693  state->status = TSS_SORTEDINMEM;
1694  break;
1695 
1696  case TSS_BUILDRUNS:
1697 
1698  /*
1699  * Finish tape-based sort. First, flush all tuples remaining in
1700  * memory out to tape; then merge until we have a single remaining
1701  * run (or, if !randomAccess, one run per tape). Note that
1702  * mergeruns sets the correct state->status.
1703  */
1704  dumptuples(state, true);
1705  mergeruns(state);
1706  state->eof_reached = false;
1707  state->markpos_block = 0L;
1708  state->markpos_offset = 0;
1709  state->markpos_eof = false;
1710  break;
1711 
1712  default:
1713  elog(ERROR, "invalid tuplesort state");
1714  break;
1715  }
1716 
1717 #ifdef TRACE_SORT
1718  if (trace_sort)
1719  {
1720  if (state->status == TSS_FINALMERGE)
1721  elog(LOG, "performsort done (except %d-way final merge): %s",
1722  state->activeTapes,
1723  pg_rusage_show(&state->ru_start));
1724  else
1725  elog(LOG, "performsort done: %s",
1726  pg_rusage_show(&state->ru_start));
1727  }
1728 #endif
1729 
1730  MemoryContextSwitchTo(oldcontext);
1731 }
1732 
1733 /*
1734  * Internal routine to fetch the next tuple in either forward or back
1735  * direction into *stup. Returns false if no more tuples.
1736  * Returned tuple belongs to tuplesort memory context, and must not be freed
1737  * by caller. Note that fetched tuple is stored in memory that may be
1738  * recycled by any future fetch.
1739  */
1740 static bool
1741 tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
1742  SortTuple *stup)
1743 {
1744  unsigned int tuplen;
1745  size_t nmoved;
1746 
1747  switch (state->status)
1748  {
1749  case TSS_SORTEDINMEM:
1750  Assert(forward || state->randomAccess);
1751  Assert(!state->slabAllocatorUsed);
1752  if (forward)
1753  {
1754  if (state->current < state->memtupcount)
1755  {
1756  *stup = state->memtuples[state->current++];
1757  return true;
1758  }
1759  state->eof_reached = true;
1760 
1761  /*
1762  * Complain if caller tries to retrieve more tuples than
1763  * originally asked for in a bounded sort. This is because
1764  * returning EOF here might be the wrong thing.
1765  */
1766  if (state->bounded && state->current >= state->bound)
1767  elog(ERROR, "retrieved too many tuples in a bounded sort");
1768 
1769  return false;
1770  }
1771  else
1772  {
1773  if (state->current <= 0)
1774  return false;
1775 
1776  /*
1777  * If all tuples have already been fetched, return the last
1778  * tuple; otherwise, return the tuple before the last one returned.
1779  */
1780  if (state->eof_reached)
1781  state->eof_reached = false;
1782  else
1783  {
1784  state->current--; /* last returned tuple */
1785  if (state->current <= 0)
1786  return false;
1787  }
1788  *stup = state->memtuples[state->current - 1];
1789  return true;
1790  }
1791  break;
1792 
1793  case TSS_SORTEDONTAPE:
1794  Assert(forward || state->randomAccess);
1795  Assert(state->slabAllocatorUsed);
1796 
1797  /*
1798  * The slot that held the tuple that we returned in previous
1799  * gettuple call can now be reused.
1800  */
1801  if (state->lastReturnedTuple)
1802  {
1803  RELEASE_SLAB_SLOT(state, state->lastReturnedTuple);
1804  state->lastReturnedTuple = NULL;
1805  }
1806 
1807  if (forward)
1808  {
1809  if (state->eof_reached)
1810  return false;
1811 
1812  if ((tuplen = getlen(state, state->result_tape, true)) != 0)
1813  {
1814  READTUP(state, stup, state->result_tape, tuplen);
1815 
1816  /*
1817  * Remember the tuple we return, so that we can recycle
1818  * its memory on next call. (This can be NULL, in the
1819  * !state->tuples case).
1820  */
1821  state->lastReturnedTuple = stup->tuple;
1822 
1823  return true;
1824  }
1825  else
1826  {
1827  state->eof_reached = true;
1828  return false;
1829  }
1830  }
1831 
1832  /*
1833  * Backward.
1834  *
1835  * If all tuples have already been fetched, return the last tuple;
1836  * otherwise, return the tuple before the last one returned.
1837  */
1838  if (state->eof_reached)
1839  {
1840  /*
1841  * Seek position is pointing just past the zero tuplen at the
1842  * end of file; back up to fetch last tuple's ending length
1843  * word. If seek fails we must have a completely empty file.
1844  */
1845  nmoved = LogicalTapeBackspace(state->tapeset,
1846  state->result_tape,
1847  2 * sizeof(unsigned int));
1848  if (nmoved == 0)
1849  return false;
1850  else if (nmoved != 2 * sizeof(unsigned int))
1851  elog(ERROR, "unexpected tape position");
1852  state->eof_reached = false;
1853  }
1854  else
1855  {
1856  /*
1857  * Back up and fetch previously-returned tuple's ending length
1858  * word. If seek fails, assume we are at start of file.
1859  */
1860  nmoved = LogicalTapeBackspace(state->tapeset,
1861  state->result_tape,
1862  sizeof(unsigned int));
1863  if (nmoved == 0)
1864  return false;
1865  else if (nmoved != sizeof(unsigned int))
1866  elog(ERROR, "unexpected tape position");
1867  tuplen = getlen(state, state->result_tape, false);
1868 
1869  /*
1870  * Back up to get ending length word of tuple before it.
1871  */
1872  nmoved = LogicalTapeBackspace(state->tapeset,
1873  state->result_tape,
1874  tuplen + 2 * sizeof(unsigned int));
1875  if (nmoved == tuplen + sizeof(unsigned int))
1876  {
1877  /*
1878  * We backed up over the previous tuple, but there was no
1879  * ending length word before it. That means that the prev
1880  * tuple is the first tuple in the file. It is now the
1881  * next to read in forward direction (not obviously right,
1882  * but that is what the in-memory case does).
1883  */
1884  return false;
1885  }
1886  else if (nmoved != tuplen + 2 * sizeof(unsigned int))
1887  elog(ERROR, "bogus tuple length in backward scan");
1888  }
1889 
1890  tuplen = getlen(state, state->result_tape, false);
1891 
1892  /*
1893  * Now we have the length of the prior tuple, back up and read it.
1894  * Note: READTUP expects we are positioned after the initial
1895  * length word of the tuple, so back up to that point.
1896  */
1897  nmoved = LogicalTapeBackspace(state->tapeset,
1898  state->result_tape,
1899  tuplen);
1900  if (nmoved != tuplen)
1901  elog(ERROR, "bogus tuple length in backward scan");
1902  READTUP(state, stup, state->result_tape, tuplen);
1903 
1904  /*
1905  * Remember the tuple we return, so that we can recycle its memory
1906  * on next call. (This can be NULL, in the Datum case).
1907  */
1908  state->lastReturnedTuple = stup->tuple;
1909 
1910  return true;
1911 
1912  case TSS_FINALMERGE:
1913  Assert(forward);
1914  /* We are managing memory ourselves, with the slab allocator. */
1915  Assert(state->slabAllocatorUsed);
1916 
1917  /*
1918  * The slab slot holding the tuple that we returned in previous
1919  * gettuple call can now be reused.
1920  */
1921  if (state->lastReturnedTuple)
1922  {
1923  RELEASE_SLAB_SLOT(state, state->lastReturnedTuple);
1924  state->lastReturnedTuple = NULL;
1925  }
1926 
1927  /*
1928  * This code should match the inner loop of mergeonerun().
1929  */
1930  if (state->memtupcount > 0)
1931  {
1932  int srcTape = state->memtuples[0].tupindex;
1933  SortTuple newtup;
1934 
1935  *stup = state->memtuples[0];
1936 
1937  /*
1938  * Remember the tuple we return, so that we can recycle its
1939  * memory on next call. (This can be NULL, in the Datum case).
1940  */
1941  state->lastReturnedTuple = stup->tuple;
1942 
1943  /*
1944  * Pull next tuple from tape, and replace the returned tuple
1945  * at top of the heap with it.
1946  */
1947  if (!mergereadnext(state, srcTape, &newtup))
1948  {
1949  /*
1950  * If no more data, we've reached end of run on this tape.
1951  * Remove the top node from the heap.
1952  */
1953  tuplesort_heap_delete_top(state);
1954 
1955  /*
1956  * Rewind to free the read buffer. It'd go away at the
1957  * end of the sort anyway, but better to release the
1958  * memory early.
1959  */
1960  LogicalTapeRewindForWrite(state->tapeset, srcTape);
1961  return true;
1962  }
1963  newtup.tupindex = srcTape;
1964  tuplesort_heap_replace_top(state, &newtup);
1965  return true;
1966  }
1967  return false;
1968 
1969  default:
1970  elog(ERROR, "invalid tuplesort state");
1971  return false; /* keep compiler quiet */
1972  }
1973 }
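/*
 * Editor's illustrative sketch (not part of tuplesort.c): the backward scan
 * above depends on each tuple being written as a leading length word, the
 * tuple body, and (for random-access sorts) a trailing length word, so the
 * reader can back up over a record without knowing its size in advance.
 * This standalone program mimics that layout with an in-memory buffer
 * standing in for a logical tape, assuming every record has both words.
 */
#include <stdio.h>
#include <string.h>

#define TAPE_CAP 1024

static unsigned char tape[TAPE_CAP];
static size_t tape_len = 0;

/* Append one record: leading length word, body, trailing length word. */
static void
tape_write(const char *body)
{
	unsigned int len = (unsigned int) strlen(body);

	memcpy(tape + tape_len, &len, sizeof(len));
	tape_len += sizeof(len);
	memcpy(tape + tape_len, body, len);
	tape_len += len;
	memcpy(tape + tape_len, &len, sizeof(len));
	tape_len += sizeof(len);
}

int
main(void)
{
	size_t		pos;

	tape_write("apple");
	tape_write("banana");
	tape_write("cherry");

	/* Scan backward: back up over the trailing length word, then the body. */
	pos = tape_len;
	while (pos > 0)
	{
		unsigned int len;
		char		body[64];

		pos -= sizeof(len);
		memcpy(&len, tape + pos, sizeof(len));
		pos -= len;
		memcpy(body, tape + pos, len);
		body[len] = '\0';
		pos -= sizeof(len);		/* also skip the leading length word */
		printf("read backward: %s\n", body);
	}
	return 0;
}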
1974 
1975 /*
1976  * Fetch the next tuple in either forward or back direction.
1977  * If successful, put tuple in slot and return true; else, clear the slot
1978  * and return false.
1979  *
1980  * Caller may optionally be passed back abbreviated value (on true return
1981  * value) when abbreviation was used, which can be used to cheaply avoid
1982  * equality checks that might otherwise be required. Caller can safely make a
1983  * determination of "non-equal tuple" based on simple binary inequality. A
1984  * NULL value in leading attribute will set abbreviated value to zeroed
1985  * representation, which caller may rely on in abbreviated inequality check.
1986  *
1987  * If copy is true, the slot receives a copied tuple that will stay valid
1988  * regardless of future manipulations of the tuplesort's state. Memory is
1989  * owned by the caller. If copy is false, the slot will just receive a
1990  * pointer to a tuple held within the tuplesort, which is more efficient, but
1991  * only safe for callers that are prepared to have any subsequent manipulation
1992  * of the tuplesort's state invalidate slot contents.
1993  */
1994 bool
1995 tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy,
1996  TupleTableSlot *slot, Datum *abbrev)
1997 {
1998  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1999  SortTuple stup;
2000 
2001  if (!tuplesort_gettuple_common(state, forward, &stup))
2002  stup.tuple = NULL;
2003 
2004  MemoryContextSwitchTo(oldcontext);
2005 
2006  if (stup.tuple)
2007  {
2008  /* Record abbreviated key for caller */
2009  if (state->sortKeys->abbrev_converter && abbrev)
2010  *abbrev = stup.datum1;
2011 
2012  if (copy)
2013  stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple);
2014 
2015  ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, copy);
2016  return true;
2017  }
2018  else
2019  {
2020  ExecClearTuple(slot);
2021  return false;
2022  }
2023 }
2024 
2025 /*
2026  * Fetch the next tuple in either forward or back direction.
2027  * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory
2028  * context, and must not be freed by caller. Caller may not rely on tuple
2029  * remaining valid after any further manipulation of tuplesort.
2030  */
2031 HeapTuple
2032 tuplesort_getheaptuple(Tuplesortstate *state, bool forward)
2033 {
2034  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2035  SortTuple stup;
2036 
2037  if (!tuplesort_gettuple_common(state, forward, &stup))
2038  stup.tuple = NULL;
2039 
2040  MemoryContextSwitchTo(oldcontext);
2041 
2042  return stup.tuple;
2043 }
2044 
2045 /*
2046  * Fetch the next index tuple in either forward or back direction.
2047  * Returns NULL if no more tuples. Returned tuple belongs to tuplesort memory
2048  * context, and must not be freed by caller. Caller may not rely on tuple
2049  * remaining valid after any further manipulation of tuplesort.
2050  */
2051 IndexTuple
2052 tuplesort_getindextuple(Tuplesortstate *state, bool forward)
2053 {
2054  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2055  SortTuple stup;
2056 
2057  if (!tuplesort_gettuple_common(state, forward, &stup))
2058  stup.tuple = NULL;
2059 
2060  MemoryContextSwitchTo(oldcontext);
2061 
2062  return (IndexTuple) stup.tuple;
2063 }
2064 
2065 /*
2066  * Fetch the next Datum in either forward or back direction.
2067  * Returns false if no more datums.
2068  *
2069  * If the Datum is pass-by-ref type, the returned value is freshly palloc'd
2070  * and is now owned by the caller (this differs from similar routines for
2071  * other types of tuplesorts).
2072  *
2073  * Caller may optionally be passed back abbreviated value (on true return
2074  * value) when abbreviation was used, which can be used to cheaply avoid
2075  * equality checks that might otherwise be required. Caller can safely make a
2076  * determination of "non-equal tuple" based on simple binary inequality. A
2077  * NULL value will have a zeroed abbreviated value representation, which caller
2078  * may rely on in abbreviated inequality check.
2079  */
2080 bool
2081 tuplesort_getdatum(Tuplesortstate *state, bool forward,
2082  Datum *val, bool *isNull, Datum *abbrev)
2083 {
2084  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2085  SortTuple stup;
2086 
2087  if (!tuplesort_gettuple_common(state, forward, &stup))
2088  {
2089  MemoryContextSwitchTo(oldcontext);
2090  return false;
2091  }
2092 
2093  /* Record abbreviated key for caller */
2094  if (state->sortKeys->abbrev_converter && abbrev)
2095  *abbrev = stup.datum1;
2096 
2097  if (stup.isnull1 || !state->tuples)
2098  {
2099  *val = stup.datum1;
2100  *isNull = stup.isnull1;
2101  }
2102  else
2103  {
2104  /* use stup.tuple because stup.datum1 may be an abbreviation */
2105  *val = datumCopy(PointerGetDatum(stup.tuple), false, state->datumTypeLen);
2106  *isNull = false;
2107  }
2108 
2109  MemoryContextSwitchTo(oldcontext);
2110 
2111  return true;
2112 }
2113 
2114 /*
2115  * Advance over N tuples in either forward or back direction,
2116  * without returning any data. N==0 is a no-op.
2117  * Returns true if successful, false if ran out of tuples.
2118  */
2119 bool
2120 tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward)
2121 {
2122  MemoryContext oldcontext;
2123 
2124  /*
2125  * We don't actually support backwards skip yet, because no callers need
2126  * it. The API is designed to allow for that later, though.
2127  */
2128  Assert(forward);
2129  Assert(ntuples >= 0);
2130 
2131  switch (state->status)
2132  {
2133  case TSS_SORTEDINMEM:
2134  if (state->memtupcount - state->current >= ntuples)
2135  {
2136  state->current += ntuples;
2137  return true;
2138  }
2139  state->current = state->memtupcount;
2140  state->eof_reached = true;
2141 
2142  /*
2143  * Complain if caller tries to retrieve more tuples than
2144  * originally asked for in a bounded sort. This is because
2145  * returning EOF here might be the wrong thing.
2146  */
2147  if (state->bounded && state->current >= state->bound)
2148  elog(ERROR, "retrieved too many tuples in a bounded sort");
2149 
2150  return false;
2151 
2152  case TSS_SORTEDONTAPE:
2153  case TSS_FINALMERGE:
2154 
2155  /*
2156  * We could probably optimize these cases better, but for now it's
2157  * not worth the trouble.
2158  */
2159  oldcontext = MemoryContextSwitchTo(state->sortcontext);
2160  while (ntuples-- > 0)
2161  {
2162  SortTuple stup;
2163 
2164  if (!tuplesort_gettuple_common(state, forward, &stup))
2165  {
2166  MemoryContextSwitchTo(oldcontext);
2167  return false;
2168  }
2169  CHECK_FOR_INTERRUPTS();
2170  }
2171  MemoryContextSwitchTo(oldcontext);
2172  return true;
2173 
2174  default:
2175  elog(ERROR, "invalid tuplesort state");
2176  return false; /* keep compiler quiet */
2177  }
2178 }
2179 
2180 /*
2181  * tuplesort_merge_order - report merge order we'll use for given memory
2182  * (note: "merge order" just means the number of input tapes in the merge).
2183  *
2184  * This is exported for use by the planner. allowedMem is in bytes.
2185  */
2186 int
2187 tuplesort_merge_order(int64 allowedMem)
2188 {
2189  int mOrder;
2190 
2191  /*
2192  * We need one tape for each merge input, plus another one for the output,
2193  * and each of these tapes needs buffer space. In addition we want
2194  * MERGE_BUFFER_SIZE workspace per input tape (but the output tape doesn't
2195  * count).
2196  *
2197  * Note: you might be thinking we need to account for the memtuples[]
2198  * array in this calculation, but we effectively treat that as part of the
2199  * MERGE_BUFFER_SIZE workspace.
2200  */
2201  mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) /
2202  (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD);
2203 
2204  /*
2205  * Even in minimum memory, use at least a MINORDER merge. On the other
2206  * hand, even when we have lots of memory, do not use more than a MAXORDER
2207  * merge. Tapes are pretty cheap, but they're not entirely free. Each
2208  * additional tape reduces the amount of memory available to build runs,
2209  * which in turn can cause the same sort to need more runs, which makes
2210  * merging slower even if it can still be done in a single pass. Also,
2211  * high order merges are quite slow due to CPU cache effects; it can be
2212  * faster to pay the I/O cost of a polyphase merge than to perform a
2213  * single merge pass across many hundreds of tapes.
2214  */
2215  mOrder = Max(mOrder, MINORDER);
2216  mOrder = Min(mOrder, MAXORDER);
2217 
2218  return mOrder;
2219 }
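/*
 * Editor's illustrative sketch (not part of tuplesort.c) of the arithmetic
 * above: one output tape's buffer comes off the top of allowedMem, then each
 * input tape needs a tape buffer plus merge workspace, and the result is
 * clamped to a minimum and maximum order.  The EX_* constants are assumed
 * stand-ins for the real TAPE_BUFFER_OVERHEAD, MERGE_BUFFER_SIZE, MINORDER
 * and MAXORDER values defined elsewhere in tuplesort.c.
 */
#include <stdio.h>
#include <stdint.h>

#define EX_TAPE_BUFFER_OVERHEAD (256 * 1024)	/* assumed, for illustration */
#define EX_MERGE_BUFFER_SIZE	(256 * 1024)	/* assumed, for illustration */
#define EX_MINORDER				6
#define EX_MAXORDER				500

static int
example_merge_order(int64_t allowedMem)
{
	int64_t		mOrder;

	mOrder = (allowedMem - EX_TAPE_BUFFER_OVERHEAD) /
		(EX_MERGE_BUFFER_SIZE + EX_TAPE_BUFFER_OVERHEAD);

	if (mOrder < EX_MINORDER)
		mOrder = EX_MINORDER;
	if (mOrder > EX_MAXORDER)
		mOrder = EX_MAXORDER;
	return (int) mOrder;
}

int
main(void)
{
	printf("order at 4MB:   %d\n", example_merge_order((int64_t) 4 * 1024 * 1024));
	printf("order at 256MB: %d\n", example_merge_order((int64_t) 256 * 1024 * 1024));
	return 0;
}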
2220 
2221 /*
2222  * inittapes - initialize for tape sorting.
2223  *
2224  * This is called only if we have found we don't have room to sort in memory.
2225  */
2226 static void
2227 inittapes(Tuplesortstate *state)
2228 {
2229  int maxTapes,
2230  j;
2231  int64 tapeSpace;
2232 
2233  /* Compute number of tapes to use: merge order plus 1 */
2234  maxTapes = tuplesort_merge_order(state->allowedMem) + 1;
2235 
2236  state->maxTapes = maxTapes;
2237  state->tapeRange = maxTapes - 1;
2238 
2239 #ifdef TRACE_SORT
2240  if (trace_sort)
2241  elog(LOG, "switching to external sort with %d tapes: %s",
2242  maxTapes, pg_rusage_show(&state->ru_start));
2243 #endif
2244 
2245  /*
2246  * Decrease availMem to reflect the space needed for tape buffers, when
2247  * writing the initial runs; but don't decrease it to the point that we
2248  * have no room for tuples. (That case is only likely to occur if sorting
2249  * pass-by-value Datums; in all other scenarios the memtuples[] array is
2250  * unlikely to occupy more than half of allowedMem. In the pass-by-value
2251  * case it's not important to account for tuple space, so we don't care if
2252  * LACKMEM becomes inaccurate.)
2253  */
2254  tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD;
2255 
2256  if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem)
2257  USEMEM(state, tapeSpace);
2258 
2259  /*
2260  * Make sure that the temp file(s) underlying the tape set are created in
2261  * suitable temp tablespaces.
2262  */
2263  PrepareTempTablespaces();
2264 
2265  /*
2266  * Create the tape set and allocate the per-tape data arrays.
2267  */
2268  state->tapeset = LogicalTapeSetCreate(maxTapes);
2269 
2270  state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool));
2271  state->tp_fib = (int *) palloc0(maxTapes * sizeof(int));
2272  state->tp_runs = (int *) palloc0(maxTapes * sizeof(int));
2273  state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int));
2274  state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int));
2275 
2276  state->currentRun = 0;
2277 
2278  /*
2279  * Initialize variables of Algorithm D (step D1).
2280  */
2281  for (j = 0; j < maxTapes; j++)
2282  {
2283  state->tp_fib[j] = 1;
2284  state->tp_runs[j] = 0;
2285  state->tp_dummy[j] = 1;
2286  state->tp_tapenum[j] = j;
2287  }
2288  state->tp_fib[state->tapeRange] = 0;
2289  state->tp_dummy[state->tapeRange] = 0;
2290 
2291  state->Level = 1;
2292  state->destTape = 0;
2293 
2294  state->status = TSS_BUILDRUNS;
2295 }
2296 
2297 /*
2298  * selectnewtape -- select new tape for new initial run.
2299  *
2300  * This is called after finishing a run when we know another run
2301  * must be started. This implements steps D3, D4 of Algorithm D.
2302  */
2303 static void
2304 selectnewtape(Tuplesortstate *state)
2305 {
2306  int j;
2307  int a;
2308 
2309  /* Step D3: advance j (destTape) */
2310  if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1])
2311  {
2312  state->destTape++;
2313  return;
2314  }
2315  if (state->tp_dummy[state->destTape] != 0)
2316  {
2317  state->destTape = 0;
2318  return;
2319  }
2320 
2321  /* Step D4: increase level */
2322  state->Level++;
2323  a = state->tp_fib[0];
2324  for (j = 0; j < state->tapeRange; j++)
2325  {
2326  state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j];
2327  state->tp_fib[j] = a + state->tp_fib[j + 1];
2328  }
2329  state->destTape = 0;
2330 }
2331 
2332 /*
2333  * Initialize the slab allocation arena, for the given number of slots.
2334  */
2335 static void
2336 init_slab_allocator(Tuplesortstate *state, int numSlots)
2337 {
2338  if (numSlots > 0)
2339  {
2340  char *p;
2341  int i;
2342 
2343  state->slabMemoryBegin = palloc(numSlots * SLAB_SLOT_SIZE);
2344  state->slabMemoryEnd = state->slabMemoryBegin +
2345  numSlots * SLAB_SLOT_SIZE;
2346  state->slabFreeHead = (SlabSlot *) state->slabMemoryBegin;
2347  USEMEM(state, numSlots * SLAB_SLOT_SIZE);
2348 
2349  p = state->slabMemoryBegin;
2350  for (i = 0; i < numSlots - 1; i++)
2351  {
2352  ((SlabSlot *) p)->nextfree = (SlabSlot *) (p + SLAB_SLOT_SIZE);
2353  p += SLAB_SLOT_SIZE;
2354  }
2355  ((SlabSlot *) p)->nextfree = NULL;
2356  }
2357  else
2358  {
2359  state->slabMemoryBegin = state->slabMemoryEnd = NULL;
2360  state->slabFreeHead = NULL;
2361  }
2362  state->slabAllocatorUsed = true;
2363 }
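/*
 * Editor's illustrative sketch (not part of tuplesort.c) of the free-list
 * technique used above: fixed-size slots are carved out of one big
 * allocation and threaded together through their first bytes; allocation
 * pops the list head and release pushes a slot back.  EX_SLOT_SIZE and the
 * ex_* names are assumptions made for this example only.
 */
#include <stdio.h>
#include <stdlib.h>

#define EX_SLOT_SIZE 1024		/* assumed slot size, for illustration */

typedef union ExSlot
{
	union ExSlot *nextfree;
	char		space[EX_SLOT_SIZE];
} ExSlot;

static char *arena;
static ExSlot *freehead;

static void
ex_init(int numSlots)
{
	char	   *p;
	int			i;

	arena = malloc((size_t) numSlots * EX_SLOT_SIZE);
	freehead = (ExSlot *) arena;
	p = arena;
	for (i = 0; i < numSlots - 1; i++)
	{
		((ExSlot *) p)->nextfree = (ExSlot *) (p + EX_SLOT_SIZE);
		p += EX_SLOT_SIZE;
	}
	((ExSlot *) p)->nextfree = NULL;
}

static void *
ex_alloc(void)
{
	ExSlot	   *slot = freehead;

	if (slot == NULL)
		return NULL;			/* arena exhausted */
	freehead = slot->nextfree;
	return slot;
}

static void
ex_release(void *ptr)
{
	ExSlot	   *slot = (ExSlot *) ptr;

	slot->nextfree = freehead;
	freehead = slot;
}

int
main(void)
{
	void	   *a,
			   *b;

	ex_init(4);
	a = ex_alloc();
	b = ex_alloc();
	ex_release(a);
	printf("recycled slot reused: %s\n", ex_alloc() == a ? "yes" : "no");
	ex_release(b);
	free(arena);
	return 0;
}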
2364 
2365 /*
2366  * mergeruns -- merge all the completed initial runs.
2367  *
2368  * This implements steps D5, D6 of Algorithm D. All input data has
2369  * already been written to initial runs on tape (see dumptuples).
2370  */
2371 static void
2372 mergeruns(Tuplesortstate *state)
2373 {
2374  int tapenum,
2375  svTape,
2376  svRuns,
2377  svDummy;
2378  int numTapes;
2379  int numInputTapes;
2380 
2381  Assert(state->status == TSS_BUILDRUNS);
2382  Assert(state->memtupcount == 0);
2383 
2384  if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL)
2385  {
2386  /*
2387  * If there are multiple runs to be merged, when we go to read back
2388  * tuples from disk, abbreviated keys will not have been stored, and
2389  * we don't care to regenerate them. Disable abbreviation from this
2390  * point on.
2391  */
2392  state->sortKeys->abbrev_converter = NULL;
2393  state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
2394 
2395  /* Not strictly necessary, but be tidy */
2396  state->sortKeys->abbrev_abort = NULL;
2397  state->sortKeys->abbrev_full_comparator = NULL;
2398  }
2399 
2400  /*
2401  * Reset tuple memory. We've freed all the tuples that we previously
2402  * allocated. We will use the slab allocator from now on.
2403  */
2404  MemoryContextDelete(state->tuplecontext);
2405  state->tuplecontext = NULL;
2406 
2407  /*
2408  * We no longer need a large memtuples array. (We will allocate a smaller
2409  * one for the heap later.)
2410  */
2411  FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
2412  pfree(state->memtuples);
2413  state->memtuples = NULL;
2414 
2415  /*
2416  * If we had fewer runs than tapes, refund the memory that we imagined we
2417  * would need for the tape buffers of the unused tapes.
2418  *
2419  * numTapes and numInputTapes reflect the actual number of tapes we will
2420  * use. Note that the output tape's tape number is maxTapes - 1, so the
2421  * tape numbers of the used tapes are not consecutive, and you cannot just
2422  * loop from 0 to numTapes to visit all used tapes!
2423  */
2424  if (state->Level == 1)
2425  {
2426  numInputTapes = state->currentRun;
2427  numTapes = numInputTapes + 1;
2428  FREEMEM(state, (state->maxTapes - numTapes) * TAPE_BUFFER_OVERHEAD);
2429  }
2430  else
2431  {
2432  numInputTapes = state->tapeRange;
2433  numTapes = state->maxTapes;
2434  }
2435 
2436  /*
2437  * Initialize the slab allocator. We need one slab slot per input tape,
2438  * for the tuples in the heap, plus one to hold the tuple last returned
2439  * from tuplesort_gettuple. (If we're sorting pass-by-val Datums,
2440  * however, we don't need to allocate anything.)
2441  *
2442  * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism
2443  * to track memory usage of individual tuples.
2444  */
2445  if (state->tuples)
2446  init_slab_allocator(state, numInputTapes + 1);
2447  else
2448  init_slab_allocator(state, 0);
2449 
2450  /*
2451  * Allocate a new 'memtuples' array, for the heap. It will hold one tuple
2452  * from each input tape.
2453  */
2454  state->memtupsize = numInputTapes;
2455  state->memtuples = (SortTuple *) palloc(numInputTapes * sizeof(SortTuple));
2456  USEMEM(state, GetMemoryChunkSpace(state->memtuples));
2457 
2458  /*
2459  * Use all the remaining memory we have available for read buffers among
2460  * the input tapes.
2461  *
2462  * We don't try to "rebalance" the memory among tapes, when we start a new
2463  * merge phase, even if some tapes are inactive in the new phase. That
2464  * would be hard, because logtape.c doesn't know where one run ends and
2465  * another begins. When a new merge phase begins, and a tape doesn't
2466  * participate in it, its buffer nevertheless already contains tuples from
2467  * the next run on the same tape, so we cannot release the buffer. That's OK
2468  * in practice; merge performance isn't that sensitive to the amount of
2469  * buffer space used, and most merge phases use all or almost all tapes,
2470  * anyway.
2471  */
2472 #ifdef TRACE_SORT
2473  if (trace_sort)
2474  elog(LOG, "using " INT64_FORMAT " KB of memory for read buffers among %d input tapes",
2475  (state->availMem) / 1024, numInputTapes);
2476 #endif
2477 
2478  state->read_buffer_size = Max(state->availMem / numInputTapes, 0);
2479  USEMEM(state, state->read_buffer_size * numInputTapes);
2480 
2481  /* End of step D2: rewind all output tapes to prepare for merging */
2482  for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2483  LogicalTapeRewindForRead(state->tapeset, tapenum, state->read_buffer_size);
2484 
2485  for (;;)
2486  {
2487  /*
2488  * At this point we know that tape[T] is empty. If there's just one
2489  * (real or dummy) run left on each input tape, then only one merge
2490  * pass remains. If we don't have to produce a materialized sorted
2491  * tape, we can stop at this point and do the final merge on-the-fly.
2492  */
2493  if (!state->randomAccess)
2494  {
2495  bool allOneRun = true;
2496 
2497  Assert(state->tp_runs[state->tapeRange] == 0);
2498  for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2499  {
2500  if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1)
2501  {
2502  allOneRun = false;
2503  break;
2504  }
2505  }
2506  if (allOneRun)
2507  {
2508  /* Tell logtape.c we won't be writing anymore */
2509  LogicalTapeSetForgetFreeSpace(state->tapeset);
2510  /* Initialize for the final merge pass */
2511  beginmerge(state);
2512  state->status = TSS_FINALMERGE;
2513  return;
2514  }
2515  }
2516 
2517  /* Step D5: merge runs onto tape[T] until tape[P] is empty */
2518  while (state->tp_runs[state->tapeRange - 1] ||
2519  state->tp_dummy[state->tapeRange - 1])
2520  {
2521  bool allDummy = true;
2522 
2523  for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2524  {
2525  if (state->tp_dummy[tapenum] == 0)
2526  {
2527  allDummy = false;
2528  break;
2529  }
2530  }
2531 
2532  if (allDummy)
2533  {
2534  state->tp_dummy[state->tapeRange]++;
2535  for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2536  state->tp_dummy[tapenum]--;
2537  }
2538  else
2539  mergeonerun(state);
2540  }
2541 
2542  /* Step D6: decrease level */
2543  if (--state->Level == 0)
2544  break;
2545  /* rewind output tape T to use as new input */
2546  LogicalTapeRewindForRead(state->tapeset, state->tp_tapenum[state->tapeRange],
2547  state->read_buffer_size);
2548  /* rewind used-up input tape P, and prepare it for write pass */
2549  LogicalTapeRewindForWrite(state->tapeset, state->tp_tapenum[state->tapeRange - 1]);
2550  state->tp_runs[state->tapeRange - 1] = 0;
2551 
2552  /*
2553  * reassign tape units per step D6; note we no longer care about A[]
2554  */
2555  svTape = state->tp_tapenum[state->tapeRange];
2556  svDummy = state->tp_dummy[state->tapeRange];
2557  svRuns = state->tp_runs[state->tapeRange];
2558  for (tapenum = state->tapeRange; tapenum > 0; tapenum--)
2559  {
2560  state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1];
2561  state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 1];
2562  state->tp_runs[tapenum] = state->tp_runs[tapenum - 1];
2563  }
2564  state->tp_tapenum[0] = svTape;
2565  state->tp_dummy[0] = svDummy;
2566  state->tp_runs[0] = svRuns;
2567  }
2568 
2569  /*
2570  * Done. Knuth says that the result is on TAPE[1], but since we exited
2571  * the loop without performing the last iteration of step D6, we have not
2572  * rearranged the tape unit assignment, and therefore the result is on
2573  * TAPE[T]. We need to do it this way so that we can freeze the final
2574  * output tape while rewinding it. The last iteration of step D6 would be
2575  * a waste of cycles anyway...
2576  */
2577  state->result_tape = state->tp_tapenum[state->tapeRange];
2578  LogicalTapeFreeze(state->tapeset, state->result_tape);
2579  state->status = TSS_SORTEDONTAPE;
2580 
2581  /* Release the read buffers of all the other tapes, by rewinding them. */
2582  for (tapenum = 0; tapenum < state->maxTapes; tapenum++)
2583  {
2584  if (tapenum != state->result_tape)
2585  LogicalTapeRewindForWrite(state->tapeset, tapenum);
2586  }
2587 }
2588 
2589 /*
2590  * Merge one run from each input tape, except ones with dummy runs.
2591  *
2592  * This is the inner loop of Algorithm D step D5. We know that the
2593  * output tape is TAPE[T].
2594  */
2595 static void
2596 mergeonerun(Tuplesortstate *state)
2597 {
2598  int destTape = state->tp_tapenum[state->tapeRange];
2599  int srcTape;
2600 
2601  /*
2602  * Start the merge by loading one tuple from each active source tape into
2603  * the heap. We can also decrease the input run/dummy run counts.
2604  */
2605  beginmerge(state);
2606 
2607  /*
2608  * Execute merge by repeatedly extracting lowest tuple in heap, writing it
2609  * out, and replacing it with next tuple from same tape (if there is
2610  * another one).
2611  */
2612  while (state->memtupcount > 0)
2613  {
2614  SortTuple stup;
2615 
2616  /* write the tuple to destTape */
2617  srcTape = state->memtuples[0].tupindex;
2618  WRITETUP(state, destTape, &state->memtuples[0]);
2619 
2620  /* recycle the slot of the tuple we just wrote out, for the next read */
2621  if (state->memtuples[0].tuple)
2622  RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple);
2623 
2624  /*
2625  * pull next tuple from the tape, and replace the written-out tuple in
2626  * the heap with it.
2627  */
2628  if (mergereadnext(state, srcTape, &stup))
2629  {
2630  stup.tupindex = srcTape;
2631  tuplesort_heap_replace_top(state, &stup);
2632 
2633  }
2634  else
2635  tuplesort_heap_delete_top(state);
2636  }
2637 
2638  /*
2639  * When the heap empties, we're done. Write an end-of-run marker on the
2640  * output tape, and increment its count of real runs.
2641  */
2642  markrunend(state, destTape);
2643  state->tp_runs[state->tapeRange]++;
2644 
2645 #ifdef TRACE_SORT
2646  if (trace_sort)
2647  elog(LOG, "finished %d-way merge step: %s", state->activeTapes,
2648  pg_rusage_show(&state->ru_start));
2649 #endif
2650 }
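/*
 * Editor's illustrative sketch (not part of tuplesort.c): a toy k-way merge
 * over sorted integer arrays, written to mirror the loop above (pop the
 * smallest element, write it out, and refill that heap slot from the same
 * source).  Plain arrays stand in for the "tapes"; all names here are
 * invented for the example.
 */
#include <stdio.h>

#define NSRC 3
#define RUNLEN 4

typedef struct
{
	int			value;			/* current frontmost value */
	int			src;			/* which source it came from */
} HeapEntry;

static const int source[NSRC][RUNLEN] = {
	{1, 4, 7, 10},
	{2, 5, 8, 11},
	{3, 6, 9, 12}
};
static int	srcpos[NSRC];

/* Sift the entry at slot i down to restore the min-heap property. */
static void
sift_down(HeapEntry *heap, int n, int i)
{
	for (;;)
	{
		int			j = 2 * i + 1;
		HeapEntry	tmp;

		if (j >= n)
			break;
		if (j + 1 < n && heap[j + 1].value < heap[j].value)
			j++;
		if (heap[i].value <= heap[j].value)
			break;
		tmp = heap[i];
		heap[i] = heap[j];
		heap[j] = tmp;
		i = j;
	}
}

int
main(void)
{
	HeapEntry	heap[NSRC];
	int			n = 0;
	int			s;

	/* load the heap with the frontmost value from each source */
	for (s = 0; s < NSRC; s++)
	{
		heap[n].value = source[s][srcpos[s]++];
		heap[n].src = s;
		n++;
	}
	for (s = n / 2 - 1; s >= 0; s--)
		sift_down(heap, n, s);

	while (n > 0)
	{
		int			from = heap[0].src;

		printf("%d ", heap[0].value);	/* "write" the smallest value */
		if (srcpos[from] < RUNLEN)
			heap[0].value = source[from][srcpos[from]++];	/* refill from same source */
		else
			heap[0] = heap[--n];	/* source exhausted: shrink the heap */
		sift_down(heap, n, 0);
	}
	printf("\n");
	return 0;
}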
2651 
2652 /*
2653  * beginmerge - initialize for a merge pass
2654  *
2655  * We decrease the counts of real and dummy runs for each tape, and mark
2656  * which tapes contain active input runs in mergeactive[]. Then, fill the
2657  * merge heap with the first tuple from each active tape.
2658  */
2659 static void
2660 beginmerge(Tuplesortstate *state)
2661 {
2662  int activeTapes;
2663  int tapenum;
2664  int srcTape;
2665 
2666  /* Heap should be empty here */
2667  Assert(state->memtupcount == 0);
2668 
2669  /* Adjust run counts and mark the active tapes */
2670  memset(state->mergeactive, 0,
2671  state->maxTapes * sizeof(*state->mergeactive));
2672  activeTapes = 0;
2673  for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2674  {
2675  if (state->tp_dummy[tapenum] > 0)
2676  state->tp_dummy[tapenum]--;
2677  else
2678  {
2679  Assert(state->tp_runs[tapenum] > 0);
2680  state->tp_runs[tapenum]--;
2681  srcTape = state->tp_tapenum[tapenum];
2682  state->mergeactive[srcTape] = true;
2683  activeTapes++;
2684  }
2685  }
2686  Assert(activeTapes > 0);
2687  state->activeTapes = activeTapes;
2688 
2689  /* Load the merge heap with the first tuple from each input tape */
2690  for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
2691  {
2692  SortTuple tup;
2693 
2694  if (mergereadnext(state, srcTape, &tup))
2695  {
2696  tup.tupindex = srcTape;
2697  tuplesort_heap_insert(state, &tup);
2698  }
2699  }
2700 }
2701 
2702 /*
2703  * mergereadnext - read next tuple from one merge input tape
2704  *
2705  * Returns false on EOF.
2706  */
2707 static bool
2708 mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup)
2709 {
2710  unsigned int tuplen;
2711 
2712  if (!state->mergeactive[srcTape])
2713  return false; /* tape's run is already exhausted */
2714 
2715  /* read next tuple, if any */
2716  if ((tuplen = getlen(state, srcTape, true)) == 0)
2717  {
2718  state->mergeactive[srcTape] = false;
2719  return false;
2720  }
2721  READTUP(state, stup, srcTape, tuplen);
2722 
2723  return true;
2724 }
2725 
2726 /*
2727  * dumptuples - remove tuples from memtuples and write initial run to tape
2728  *
2729  * When alltuples = true, dump everything currently in memory. (This case is
2730  * only used at end of input data.)
2731  */
2732 static void
2733 dumptuples(Tuplesortstate *state, bool alltuples)
2734 {
2735  int memtupwrite;
2736  int i;
2737 
2738  /*
2739  * Nothing to do if we still fit in available memory and have array slots,
2740  * unless this is the final call during initial run generation.
2741  */
2742  if (state->memtupcount < state->memtupsize && !LACKMEM(state) &&
2743  !alltuples)
2744  return;
2745 
2746  /*
2747  * Final call might require no sorting, in rare cases where we just so
2748  * happen to have previously LACKMEM()'d at the point where exactly all
2749  * remaining tuples are loaded into memory, just before input was
2750  * exhausted.
2751  *
2752  * In general, short final runs are quite possible. Rather than allowing
2753  * a special case where there was a superfluous selectnewtape() call (i.e.
2754  * a call with no subsequent run actually written to destTape), we prefer
2755  * to write out a 0 tuple run.
2756  *
2757  * mergereadnext() is prepared for 0 tuple runs, and will reliably mark
2758  * the tape inactive for the merge when called from beginmerge(). This
2759  * case is therefore similar to the case where mergeonerun() finds a dummy
2760  * run for the tape, and so doesn't need to merge a run from the tape (or
2761  * conceptually "merges" the dummy run, if you prefer). According to
2762  * Knuth, Algorithm D "isn't strictly optimal" in its method of
2763  * distribution and dummy run assignment; this edge case seems very
2764  * unlikely to make that appreciably worse.
2765  */
2766  Assert(state->status == TSS_BUILDRUNS);
2767 
2768  /*
2769  * It seems unlikely that this limit will ever be exceeded, but take no
2770  * chances
2771  */
2772  if (state->currentRun == INT_MAX)
2773  ereport(ERROR,
2774  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2775  errmsg("cannot have more than %d runs for an external sort",
2776  INT_MAX)));
2777 
2778  state->currentRun++;
2779 
2780 #ifdef TRACE_SORT
2781  if (trace_sort)
2782  elog(LOG, "starting quicksort of run %d: %s",
2783  state->currentRun, pg_rusage_show(&state->ru_start));
2784 #endif
2785 
2786  /*
2787  * Sort all tuples accumulated within the allowed amount of memory for
2788  * this run using quicksort
2789  */
2790  tuplesort_sort_memtuples(state);
2791 
2792 #ifdef TRACE_SORT
2793  if (trace_sort)
2794  elog(LOG, "finished quicksort of run %d: %s",
2795  state->currentRun, pg_rusage_show(&state->ru_start));
2796 #endif
2797 
2798  memtupwrite = state->memtupcount;
2799  for (i = 0; i < memtupwrite; i++)
2800  {
2801  WRITETUP(state, state->tp_tapenum[state->destTape],
2802  &state->memtuples[i]);
2803  state->memtupcount--;
2804  }
2805 
2806  /*
2807  * Reset tuple memory. We've freed all of the tuples that we previously
2808  * allocated. It's important to avoid fragmentation when there is a stark
2809  * change in the sizes of incoming tuples. Fragmentation due to
2810  * AllocSetFree's bucketing by size class might be particularly bad if
2811  * this step wasn't taken.
2812  */
2813  MemoryContextReset(state->tuplecontext);
2814 
2815  markrunend(state, state->tp_tapenum[state->destTape]);
2816  state->tp_runs[state->destTape]++;
2817  state->tp_dummy[state->destTape]--; /* per Alg D step D2 */
2818 
2819 #ifdef TRACE_SORT
2820  if (trace_sort)
2821  elog(LOG, "finished writing run %d to tape %d: %s",
2822  state->currentRun, state->destTape,
2823  pg_rusage_show(&state->ru_start));
2824 #endif
2825 
2826  if (!alltuples)
2827  selectnewtape(state);
2828 }
2829 
2830 /*
2831  * tuplesort_rescan - rewind and replay the scan
2832  */
2833 void
2834 tuplesort_rescan(Tuplesortstate *state)
2835 {
2836  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2837 
2838  Assert(state->randomAccess);
2839 
2840  switch (state->status)
2841  {
2842  case TSS_SORTEDINMEM:
2843  state->current = 0;
2844  state->eof_reached = false;
2845  state->markpos_offset = 0;
2846  state->markpos_eof = false;
2847  break;
2848  case TSS_SORTEDONTAPE:
2849  LogicalTapeRewindForRead(state->tapeset,
2850  state->result_tape,
2851  0);
2852  state->eof_reached = false;
2853  state->markpos_block = 0L;
2854  state->markpos_offset = 0;
2855  state->markpos_eof = false;
2856  break;
2857  default:
2858  elog(ERROR, "invalid tuplesort state");
2859  break;
2860  }
2861 
2862  MemoryContextSwitchTo(oldcontext);
2863 }
2864 
2865 /*
2866  * tuplesort_markpos - saves current position in the merged sort file
2867  */
2868 void
2869 tuplesort_markpos(Tuplesortstate *state)
2870 {
2871  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2872 
2873  Assert(state->randomAccess);
2874 
2875  switch (state->status)
2876  {
2877  case TSS_SORTEDINMEM:
2878  state->markpos_offset = state->current;
2879  state->markpos_eof = state->eof_reached;
2880  break;
2881  case TSS_SORTEDONTAPE:
2882  LogicalTapeTell(state->tapeset,
2883  state->result_tape,
2884  &state->markpos_block,
2885  &state->markpos_offset);
2886  state->markpos_eof = state->eof_reached;
2887  break;
2888  default:
2889  elog(ERROR, "invalid tuplesort state");
2890  break;
2891  }
2892 
2893  MemoryContextSwitchTo(oldcontext);
2894 }
2895 
2896 /*
2897  * tuplesort_restorepos - restores current position in merged sort file to
2898  * last saved position
2899  */
2900 void
2901 tuplesort_restorepos(Tuplesortstate *state)
2902 {
2903  MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2904 
2905  Assert(state->randomAccess);
2906 
2907  switch (state->status)
2908  {
2909  case TSS_SORTEDINMEM:
2910  state->current = state->markpos_offset;
2911  state->eof_reached = state->markpos_eof;
2912  break;
2913  case TSS_SORTEDONTAPE:
2914  LogicalTapeSeek(state->tapeset,
2915  state->result_tape,
2916  state->markpos_block,
2917  state->markpos_offset);
2918  state->eof_reached = state->markpos_eof;
2919  break;
2920  default:
2921  elog(ERROR, "invalid tuplesort state");
2922  break;
2923  }
2924 
2925  MemoryContextSwitchTo(oldcontext);
2926 }
2927 
2928 /*
2929  * tuplesort_get_stats - extract summary statistics
2930  *
2931  * This can be called after tuplesort_performsort() finishes to obtain
2932  * printable summary information about how the sort was performed.
2933  */
2934 void
2935 tuplesort_get_stats(Tuplesortstate *state,
2936  TuplesortInstrumentation *stats)
2937 {
2938  /*
2939  * Note: it might seem we should provide both memory and disk usage for a
2940  * disk-based sort. However, the current code doesn't track memory space
2941  * accurately once we have begun to return tuples to the caller (since we
2942  * don't account for pfree's the caller is expected to do), so we cannot
2943  * rely on availMem in a disk sort. This does not seem worth the overhead
2944  * to fix. Is it worth creating an API for the memory context code to
2945  * tell us how much is actually used in sortcontext?
2946  */
2947  if (state->tapeset)
2948  {
2949  stats->spaceType = SORT_SPACE_TYPE_DISK;
2950  stats->spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024);
2951  }
2952  else
2953  {
2954  stats->spaceType = SORT_SPACE_TYPE_MEMORY;
2955  stats->spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024;
2956  }
2957 
2958  switch (state->status)
2959  {
2960  case TSS_SORTEDINMEM:
2961  if (state->boundUsed)
2962  stats->sortMethod = SORT_TYPE_TOP_N_HEAPSORT;
2963  else
2964  stats->sortMethod = SORT_TYPE_QUICKSORT;
2965  break;
2966  case TSS_SORTEDONTAPE:
2967  stats->sortMethod = SORT_TYPE_EXTERNAL_SORT;
2968  break;
2969  case TSS_FINALMERGE:
2970  stats->sortMethod = SORT_TYPE_EXTERNAL_MERGE;
2971  break;
2972  default:
2973  stats->sortMethod = SORT_TYPE_STILL_IN_PROGRESS;
2974  break;
2975  }
2976 }
2977 
2978 /*
2979  * Convert TuplesortMethod to a string.
2980  */
2981 const char *
2982 tuplesort_method_name(TuplesortMethod m)
2983 {
2984  switch (m)
2985  {
2986  case SORT_TYPE_STILL_IN_PROGRESS:
2987  return "still in progress";
2988  case SORT_TYPE_TOP_N_HEAPSORT:
2989  return "top-N heapsort";
2990  case SORT_TYPE_QUICKSORT:
2991  return "quicksort";
2992  case SORT_TYPE_EXTERNAL_SORT:
2993  return "external sort";
2994  case SORT_TYPE_EXTERNAL_MERGE:
2995  return "external merge";
2996  }
2997 
2998  return "unknown";
2999 }
3000 
3001 /*
3002  * Convert TuplesortSpaceType to a string.
3003  */
3004 const char *
3005 tuplesort_space_type_name(TuplesortSpaceType t)
3006 {
3007  Assert(t == SORT_SPACE_TYPE_DISK || t == SORT_SPACE_TYPE_MEMORY);
3008  return t == SORT_SPACE_TYPE_DISK ? "Disk" : "Memory";
3009 }
3010 
3011 
3012 /*
3013  * Heap manipulation routines, per Knuth's Algorithm 5.2.3H.
3014  */
3015 
3016 /*
3017  * Convert the existing unordered array of SortTuples to a bounded heap,
3018  * discarding all but the smallest "state->bound" tuples.
3019  *
3020  * When working with a bounded heap, we want to keep the largest entry
3021  * at the root (array entry zero), instead of the smallest as in the normal
3022  * sort case. This allows us to discard the largest entry cheaply.
3023  * Therefore, we temporarily reverse the sort direction.
3024  */
3025 static void
3026 make_bounded_heap(Tuplesortstate *state)
3027 {
3028  int tupcount = state->memtupcount;
3029  int i;
3030 
3031  Assert(state->status == TSS_INITIAL);
3032  Assert(state->bounded);
3033  Assert(tupcount >= state->bound);
3034 
3035  /* Reverse sort direction so largest entry will be at root */
3036  reversedirection(state);
3037 
3038  state->memtupcount = 0; /* make the heap empty */
3039  for (i = 0; i < tupcount; i++)
3040  {
3041  if (state->memtupcount < state->bound)
3042  {
3043  /* Insert next tuple into heap */
3044  /* Must copy source tuple to avoid possible overwrite */
3045  SortTuple stup = state->memtuples[i];
3046 
3047  tuplesort_heap_insert(state, &stup);
3048  }
3049  else
3050  {
3051  /*
3052  * The heap is full. Replace the largest entry with the new
3053  * tuple, or just discard it, if it's larger than anything already
3054  * in the heap.
3055  */
3056  if (COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0)
3057  {
3058  free_sort_tuple(state, &state->memtuples[i]);
3059  CHECK_FOR_INTERRUPTS();
3060  }
3061  else
3062  tuplesort_heap_replace_top(state, &state->memtuples[i]);
3063  }
3064  }
3065 
3066  Assert(state->memtupcount == state->bound);
3067  state->status = TSS_BOUNDED;
3068 }
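/*
 * Editor's illustrative sketch (not part of tuplesort.c) of the bounded
 * (top-N smallest) idea above, using a plain max-heap of ints.  Keeping the
 * largest retained value at the root makes "discard the largest" a cheap
 * replace-top, which is why the real code temporarily reverses the sort
 * direction.  BOUND and the input values are invented for the example.
 */
#include <stdio.h>

#define BOUND 3

static int	heap[BOUND];
static int	heapcount;

/* Replace the root with val and sift it down to restore max-heap order. */
static void
replace_top(int val)
{
	int			i = 0;

	heap[0] = val;
	for (;;)
	{
		int			j = 2 * i + 1;
		int			tmp;

		if (j >= heapcount)
			break;
		if (j + 1 < heapcount && heap[j + 1] > heap[j])
			j++;
		if (heap[i] >= heap[j])
			break;
		tmp = heap[i];
		heap[i] = heap[j];
		heap[j] = tmp;
		i = j;
	}
}

/* Sift-up insert into the max-heap. */
static void
insert(int val)
{
	int			j = heapcount++;

	while (j > 0)
	{
		int			i = (j - 1) / 2;

		if (heap[i] >= val)
			break;
		heap[j] = heap[i];
		j = i;
	}
	heap[j] = val;
}

int
main(void)
{
	const int	input[] = {42, 7, 19, 3, 88, 5, 61};
	int			i;

	for (i = 0; i < 7; i++)
	{
		if (heapcount < BOUND)
			insert(input[i]);
		else if (input[i] < heap[0])
			replace_top(input[i]);	/* beats the current largest kept value */
		/* else: discard, it cannot be among the BOUND smallest */
	}
	printf("the %d smallest values (heap order): %d %d %d\n",
		   BOUND, heap[0], heap[1], heap[2]);
	return 0;
}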
3069 
3070 /*
3071  * Convert the bounded heap to a properly-sorted array
3072  */
3073 static void
3074 sort_bounded_heap(Tuplesortstate *state)
3075 {
3076  int tupcount = state->memtupcount;
3077 
3078  Assert(state->status == TSS_BOUNDED);
3079  Assert(state->bounded);
3080  Assert(tupcount == state->bound);
3081 
3082  /*
3083  * We can unheapify in place because each delete-top call will remove the
3084  * largest entry, which we can promptly store in the newly freed slot at
3085  * the end. Once we're down to a single-entry heap, we're done.
3086  */
3087  while (state->memtupcount > 1)
3088  {
3089  SortTuple stup = state->memtuples[0];
3090 
3091  /* this sifts-up the next-largest entry and decreases memtupcount */
3092  tuplesort_heap_delete_top(state);
3093  state->memtuples[state->memtupcount] = stup;
3094  }
3095  state->memtupcount = tupcount;
3096 
3097  /*
3098  * Reverse sort direction back to the original state. This is not
3099  * actually necessary but seems like a good idea for tidiness.
3100  */
3101  reversedirection(state);
3102 
3103  state->status = TSS_SORTEDINMEM;
3104  state->boundUsed = true;
3105 }
3106 
3107 /*
3108  * Sort all memtuples using specialized qsort() routines.
3109  *
3110  * Quicksort is used for small in-memory sorts, and external sort runs.
3111  */
3112 static void
3113 tuplesort_sort_memtuples(Tuplesortstate *state)
3114 {
3115  if (state->memtupcount > 1)
3116  {
3117  /* Can we use the single-key sort function? */
3118  if (state->onlyKey != NULL)
3119  qsort_ssup(state->memtuples, state->memtupcount,
3120  state->onlyKey);
3121  else
3122  qsort_tuple(state->memtuples,
3123  state->memtupcount,
3124  state->comparetup,
3125  state);
3126  }
3127 }
3128 
3129 /*
3130  * Insert a new tuple into an empty or existing heap, maintaining the
3131  * heap invariant. Caller is responsible for ensuring there's room.
3132  *
3133  * Note: For some callers, tuple points to a memtuples[] entry above the
3134  * end of the heap. This is safe as long as it's not immediately adjacent
3135  * to the end of the heap (ie, in the [memtupcount] array entry) --- if it
3136  * is, it might get overwritten before being moved into the heap!
3137  */
3138 static void
3139 tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple)
3140 {
3141  SortTuple *memtuples;
3142  int j;
3143 
3144  memtuples = state->memtuples;
3145  Assert(state->memtupcount < state->memtupsize);
3146 
3147  CHECK_FOR_INTERRUPTS();
3148 
3149  /*
3150  * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is
3151  * using 1-based array indexes, not 0-based.
3152  */
3153  j = state->memtupcount++;
3154  while (j > 0)
3155  {
3156  int i = (j - 1) >> 1;
3157 
3158  if (COMPARETUP(state, tuple, &memtuples[i]) >= 0)
3159  break;
3160  memtuples[j] = memtuples[i];
3161  j = i;
3162  }
3163  memtuples[j] = *tuple;
3164 }
3165 
3166 /*
3167  * Remove the tuple at state->memtuples[0] from the heap. Decrement
3168  * memtupcount, and sift up to maintain the heap invariant.
3169  *
3170  * The caller has already free'd the tuple the top node points to,
3171  * if necessary.
3172  */
3173 static void
3174 tuplesort_heap_delete_top(Tuplesortstate *state)
3175 {
3176  SortTuple *memtuples = state->memtuples;
3177  SortTuple *tuple;
3178 
3179  if (--state->memtupcount <= 0)
3180  return;
3181 
3182  /*
3183  * Remove the last tuple in the heap, and re-insert it, by replacing the
3184  * current top node with it.
3185  */
3186  tuple = &memtuples[state->memtupcount];
3187  tuplesort_heap_replace_top(state, tuple);
3188 }
3189 
3190 /*
3191  * Replace the tuple at state->memtuples[0] with a new tuple. Sift up to
3192  * maintain the heap invariant.
3193  *
3194  * This corresponds to Knuth's "sift-up" algorithm (Algorithm 5.2.3H,
3195  * Heapsort, steps H3-H8).
3196  */
3197 static void
3198 tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple)
3199 {
3200  SortTuple *memtuples = state->memtuples;
3201  unsigned int i,
3202  n;
3203 
3204  Assert(state->memtupcount >= 1);
3205 
3206  CHECK_FOR_INTERRUPTS();
3207 
3208  /*
3209  * state->memtupcount is "int", but we use "unsigned int" for i, j, n.
3210  * This prevents overflow in the "2 * i + 1" calculation, since at the top
3211  * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2.
3212  */
3213  n = state->memtupcount;
3214  i = 0; /* i is where the "hole" is */
3215  for (;;)
3216  {
3217  unsigned int j = 2 * i + 1;
3218 
3219  if (j >= n)
3220  break;
3221  if (j + 1 < n &&
3222  COMPARETUP(state, &memtuples[j], &memtuples[j + 1]) > 0)
3223  j++;
3224  if (COMPARETUP(state, tuple, &memtuples[j]) <= 0)
3225  break;
3226  memtuples[i] = memtuples[j];
3227  i = j;
3228  }
3229  memtuples[i] = *tuple;
3230 }
3231 
3232 /*
3233  * Function to reverse the sort direction from its current state
3234  *
3235  * It is not safe to call this when performing hash tuplesorts
3236  */
3237 static void
3238 reversedirection(Tuplesortstate *state)
3239 {
3240  SortSupport sortKey = state->sortKeys;
3241  int nkey;
3242 
3243  for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++)
3244  {
3245  sortKey->ssup_reverse = !sortKey->ssup_reverse;
3246  sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first;
3247  }
3248 }
3249 
3250 
3251 /*
3252  * Tape interface routines
3253  */
3254 
3255 static unsigned int
3256 getlen(Tuplesortstate *state, int tapenum, bool eofOK)
3257 {
3258  unsigned int len;
3259 
3260  if (LogicalTapeRead(state->tapeset, tapenum,
3261  &len, sizeof(len)) != sizeof(len))
3262  elog(ERROR, "unexpected end of tape");
3263  if (len == 0 && !eofOK)
3264  elog(ERROR, "unexpected end of data");
3265  return len;
3266 }
3267 
3268 static void
3269 markrunend(Tuplesortstate *state, int tapenum)
3270 {
3271  unsigned int len = 0;
3272 
3273  LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len));
3274 }
3275 
3276 /*
3277  * Get memory for tuple from within READTUP() routine.
3278  *
3279  * We use next free slot from the slab allocator, or palloc() if the tuple
3280  * is too large for that.
3281  */
3282 static void *
3283 readtup_alloc(Tuplesortstate *state, Size tuplen)
3284 {
3285  SlabSlot *buf;
3286 
3287  /*
3288  * We pre-allocate enough slots in the slab arena that we should never run
3289  * out.
3290  */
3291  Assert(state->slabFreeHead);
3292 
3293  if (tuplen > SLAB_SLOT_SIZE || !state->slabFreeHead)
3294  return MemoryContextAlloc(state->sortcontext, tuplen);
3295  else
3296  {
3297  buf = state->slabFreeHead;
3298  /* Reuse this slot */
3299  state->slabFreeHead = buf->nextfree;
3300 
3301  return buf;
3302  }
3303 }
3304 
3305 
3306 /*
3307  * Routines specialized for HeapTuple (actually MinimalTuple) case
3308  */
3309 
3310 static int
3311 comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
3312 {
3313  SortSupport sortKey = state->sortKeys;
3314  HeapTupleData ltup;
3315  HeapTupleData rtup;
3316  TupleDesc tupDesc;
3317  int nkey;
3318  int32 compare;
3319  AttrNumber attno;
3320  Datum datum1,
3321  datum2;
3322  bool isnull1,
3323  isnull2;
3324 
3325 
3326  /* Compare the leading sort key */
3327  compare = ApplySortComparator(a->datum1, a->isnull1,
3328  b->datum1, b->isnull1,
3329  sortKey);
3330  if (compare != 0)
3331  return compare;
3332 
3333  /* Compare additional sort keys */
3334  ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
3335  ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET);
3336  rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
3337  rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET);
3338  tupDesc = state->tupDesc;
3339 
3340  if (sortKey->abbrev_converter)
3341  {
3342  attno = sortKey->ssup_attno;
3343 
3344  datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
3345  datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
3346 
3347  compare = ApplySortAbbrevFullComparator(datum1, isnull1,
3348  datum2, isnull2,
3349  sortKey);
3350  if (compare != 0)
3351  return compare;
3352  }
3353 
3354  sortKey++;
3355  for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++)
3356  {
3357  attno = sortKey->ssup_attno;
3358 
3359  datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
3360  datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
3361 
3362  compare = ApplySortComparator(datum1, isnull1,
3363  datum2, isnull2,
3364  sortKey);
3365  if (compare != 0)
3366  return compare;
3367  }
3368 
3369  return 0;
3370 }
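/*
 * Editor's illustrative sketch (not part of tuplesort.c) of the "compare
 * datum1 first, fall back to the full value" pattern used above.  Here the
 * abbreviated key is simply the first bytes of a string packed into an
 * unsigned integer, so binary comparison of abbreviations gives the same
 * order as strcmp() except on ties, which are resolved with the
 * authoritative comparison.  All names here are invented for the example.
 */
#include <stdio.h>
#include <string.h>

typedef struct
{
	unsigned long abbrev;		/* abbreviated key: leading bytes, big-endian */
	const char *full;			/* the authoritative value */
} ExSortTuple;

static unsigned long
ex_abbreviate(const char *s)
{
	unsigned long a = 0;
	size_t		len = strlen(s);
	size_t		i;

	for (i = 0; i < sizeof(a); i++)
		a = (a << 8) | (i < len ? (unsigned char) s[i] : 0);
	return a;
}

static int
ex_compare(const ExSortTuple *a, const ExSortTuple *b)
{
	/* cheap comparison on the abbreviated keys first */
	if (a->abbrev < b->abbrev)
		return -1;
	if (a->abbrev > b->abbrev)
		return 1;
	/* abbreviations tie: fall back to the full comparator */
	return strcmp(a->full, b->full);
}

int
main(void)
{
	ExSortTuple x = {0, "abbreviation"};
	ExSortTuple y = {0, "abbreviated"};

	x.abbrev = ex_abbreviate(x.full);
	y.abbrev = ex_abbreviate(y.full);

	/* both strings share the same leading bytes, so the tie-break runs */
	printf("compare: %d\n", ex_compare(&x, &y));
	return 0;
}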
3371 
3372 static void
3373 copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup)
3374 {
3375  /*
3376  * We expect the passed "tup" to be a TupleTableSlot, and form a
3377  * MinimalTuple using the exported interface for that.
3378  */
3379  TupleTableSlot *slot = (TupleTableSlot *) tup;
3380  Datum original;
3381  MinimalTuple tuple;
3382  HeapTupleData htup;
3383  MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
3384 
3385  /* copy the tuple into sort storage */
3386  tuple = ExecCopySlotMinimalTuple(slot);
3387  stup->tuple = (void *) tuple;
3388  USEMEM(state, GetMemoryChunkSpace(tuple));
3389  /* set up first-column key value */
3390  htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
3391  htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
3392  original = heap_getattr(&htup,
3393  state->sortKeys[0].ssup_attno,
3394  state->tupDesc,
3395  &stup->isnull1);
3396 
3397  MemoryContextSwitchTo(oldcontext);
3398 
3399  if (!state->sortKeys->abbrev_converter || stup->isnull1)
3400  {
3401  /*
3402  * Store ordinary Datum representation, or NULL value. If there is a
3403  * converter it won't expect NULL values, and cost model is not
3404  * required to account for NULL, so in that case we avoid calling
3405  * converter and just set datum1 to zeroed representation (to be
3406  * consistent, and to support cheap inequality tests for NULL
3407  * abbreviated keys).
3408  */
3409  stup->datum1 = original;
3410  }
3411  else if (!consider_abort_common(state))
3412  {
3413  /* Store abbreviated key representation */
3414  stup->datum1 = state->sortKeys->abbrev_converter(original,
3415  state->sortKeys);
3416  }
3417  else
3418  {
3419  /* Abort abbreviation */
3420  int i;
3421 
3422  stup->datum1 = original;
3423 
3424  /*
3425  * Set state to be consistent with never trying abbreviation.
3426  *
3427  * Alter datum1 representation in already-copied tuples, so as to
3428  * ensure a consistent representation (current tuple was just
3429  * handled). It does not matter if some dumped tuples are already
3430  * sorted on tape, since serialized tuples lack abbreviated keys
3431  * (TSS_BUILDRUNS state prevents control reaching here in any case).
3432  */
3433  for (i = 0; i < state->memtupcount; i++)
3434  {
3435  SortTuple *mtup = &state->memtuples[i];
3436 
3437  htup.t_len = ((MinimalTuple) mtup->tuple)->t_len +
3438  MINIMAL_TUPLE_OFFSET;
3439  htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple -
3440  MINIMAL_TUPLE_OFFSET);
3441 
3442  mtup->datum1 = heap_getattr(&htup,
3443  state->sortKeys[0].ssup_attno,
3444  state->tupDesc,
3445  &mtup->isnull1);
3446  }
3447  }
3448 }
3449 
3450 static void
3451 writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup)
3452 {
3453  MinimalTuple tuple = (MinimalTuple) stup->tuple;
3454 
3455  /* the part of the MinimalTuple we'll write: */
3456  char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
3457  unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET;
3458 
3459  /* total on-disk footprint: */
3460  unsigned int tuplen = tupbodylen + sizeof(int);
3461 
3462  LogicalTapeWrite(state->tapeset, tapenum,
3463  (void *) &tuplen, sizeof(tuplen));
3464  LogicalTapeWrite(state->tapeset, tapenum,
3465  (void *) tupbody, tupbodylen);
3466  if (state->randomAccess) /* need trailing length word? */
3467  LogicalTapeWrite(state->tapeset, tapenum,
3468  (void *) &tuplen, sizeof(tuplen));
3469 
3470  if (!state->slabAllocatorUsed)
3471  {
3472  FREEMEM(state, GetMemoryChunkSpace(tuple));
3473  heap_free_minimal_tuple(tuple);
3474  }
3475 }
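
For reference, the on-tape record format emitted above and consumed by getlen() plus readtup_heap() below, summarized as a sketch:

/*
 * Sketch of the serialized record:
 *
 *   unsigned int tuplen;              length word, counts itself plus the body
 *   char         body[tupbodylen];    MinimalTuple from MINIMAL_TUPLE_DATA_OFFSET on
 *   unsigned int tuplen;              repeated only if state->randomAccess, so the
 *                                     tape can also be read backwards
 *
 * hence tuplen == tupbodylen + sizeof(int), and readtup_heap() recovers
 * tupbodylen as len - sizeof(int).
 */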
3476 
3477 static void
3478 readtup_heap(Tuplesortstate *state, SortTuple *stup,
3479  int tapenum, unsigned int len)
3480 {
3481  unsigned int tupbodylen = len - sizeof(int);
3482  unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET;
3483  MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tuplen);
3484  char *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
3485  HeapTupleData htup;
3486 
3487  /* read in the tuple proper */
3488  tuple->t_len = tuplen;
3489  LogicalTapeReadExact(state->tapeset, tapenum,
3490  tupbody, tupbodylen);
3491  if (state->randomAccess) /* need trailing length word? */
3492  LogicalTapeReadExact(state->tapeset, tapenum,
3493  &tuplen, sizeof(tuplen));
3494  stup->tuple = (void *) tuple;
3495  /* set up first-column key value */
3496  htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
3497  htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
3498  stup->datum1 = heap_getattr(&htup,
3499  state->sortKeys[0].ssup_attno,
3500  state->tupDesc,
3501  &stup->isnull1);
3502 }
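
A caller-side sketch of how the heap-tuple routines above are driven through the tuplesort API; the fetch/consume helpers are hypothetical, while the tuplesort_* calls are the entry points declared in utils/tuplesort.h.

#include "postgres.h"
#include "miscadmin.h"              /* work_mem */
#include "utils/tuplesort.h"

extern bool fetch_next_slot(TupleTableSlot *slot);   /* hypothetical input */
extern void consume_slot(TupleTableSlot *slot);      /* hypothetical consumer */

static void
heap_sort_sketch(TupleDesc tupDesc, int nkeys, AttrNumber *attNums,
                 Oid *sortOperators, Oid *sortCollations, bool *nullsFirst,
                 TupleTableSlot *slot)
{
    Tuplesortstate *ts;

    ts = tuplesort_begin_heap(tupDesc, nkeys, attNums,
                              sortOperators, sortCollations, nullsFirst,
                              work_mem, false);

    while (fetch_next_slot(slot))
        tuplesort_puttupleslot(ts, slot);       /* ends up in copytup_heap() */

    tuplesort_performsort(ts);

    while (tuplesort_gettupleslot(ts, true, false, slot, NULL))
        consume_slot(slot);

    tuplesort_end(ts);
}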
3503 
3504 /*
3505  * Routines specialized for the CLUSTER case (HeapTuple data, with
3506  * comparisons per a btree index definition)
3507  */
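
Before the routines themselves, a caller-side sketch of how this case is driven; the scan and rewrite helpers are hypothetical stand-ins for the caller's logic, while the tuplesort_* calls and maintenance_work_mem are real (assumed includes: postgres.h, miscadmin.h, utils/tuplesort.h).

extern HeapTuple next_heap_tuple(void);      /* hypothetical scan */
extern void rewrite_tuple(HeapTuple tup);    /* hypothetical consumer */

static void
cluster_sort_sketch(TupleDesc oldHeapDesc, Relation oldIndex)
{
    Tuplesortstate *ts = tuplesort_begin_cluster(oldHeapDesc, oldIndex,
                                                 maintenance_work_mem, false);
    HeapTuple   tup;

    while ((tup = next_heap_tuple()) != NULL)
        tuplesort_putheaptuple(ts, tup);     /* ends up in copytup_cluster() */

    tuplesort_performsort(ts);

    while ((tup = tuplesort_getheaptuple(ts, true)) != NULL)
        rewrite_tuple(tup);

    tuplesort_end(ts);
}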
3508 
3509 static int
3510 comparetup_cluster(const SortTuple *a, const SortTuple *b,
3511  Tuplesortstate *state)
3512 {
3513  SortSupport sortKey = state->sortKeys;
3514  HeapTuple ltup;
3515  HeapTuple rtup;
3516  TupleDesc tupDesc;
3517  int nkey;
3518  int32 compare;
3519  Datum datum1,
3520  datum2;
3521  bool isnull1,
3522  isnull2;
3523  AttrNumber leading = state->indexInfo->ii_KeyAttrNumbers[0];
3524 
3525  /* Be prepared to compare additional sort keys */
3526  ltup = (HeapTuple) a->tuple;
3527  rtup = (HeapTuple) b->tuple;
3528  tupDesc = state->tupDesc;
3529 
3530  /* Compare the leading sort key, if it's simple */
3531  if (leading != 0)
3532  {
3533  compare = ApplySortComparator(a->datum1, a->isnull1,
3534  b->datum1, b->isnull1,
3535  sortKey);
3536  if (compare != 0)
3537  return compare;
3538 
3539  if (sortKey->abbrev_converter)
3540  {
3541  datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1);
3542  datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2);
3543 
3544  compare = ApplySortAbbrevFullComparator(datum1, isnull1,
3545  datum2, isnull2,
3546  sortKey);
3547  }
3548  if (compare != 0 || state->nKeys == 1)
3549  return compare;
3550  /* Compare additional columns the hard way */
3551  sortKey++;
3552  nkey = 1;
3553  }
3554  else
3555  {
3556  /* Must compare all keys the hard way */
3557  nkey = 0;
3558  }
3559 
3560  if (state->indexInfo->ii_Expressions == NULL)
3561  {
3562  /* If not expression index, just compare the proper heap attrs */
3563 
3564  for (; nkey < state->nKeys; nkey++, sortKey++)
3565  {
3566  AttrNumber attno = state->indexInfo->ii_KeyAttrNumbers[nkey];
3567 
3568  datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1);
3569  datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2);
3570 
3571  compare = ApplySortComparator(datum1, isnull1,
3572  datum2, isnull2,
3573  sortKey);
3574  if (compare != 0)
3575  return compare;
3576  }
3577  }
3578  else
3579  {
3580  /*
3581  * In the expression index case, compute the whole index tuple and
3582  * then compare values. It would perhaps be faster to compute only as
3583  * many columns as we need to compare, but that would require
3584  * duplicating all the logic in FormIndexDatum.
3585  */
3586  Datum l_index_values[INDEX_MAX_KEYS];
3587  bool l_index_isnull[INDEX_MAX_KEYS];
3588  Datum r_index_values[INDEX_MAX_KEYS];
3589  bool r_index_isnull[INDEX_MAX_KEYS];
3590  TupleTableSlot *ecxt_scantuple;
3591 
3592  /* Reset context each time to prevent memory leakage */
3593  ResetPerTupleExprContext(state->estate);
3594 
3595  ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple;
3596 
3597  ExecStoreTuple(ltup, ecxt_scantuple, InvalidBuffer, false);
3598  FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
3599  l_index_values, l_index_isnull);
3600 
3601  ExecStoreTuple(rtup, ecxt_scantuple, InvalidBuffer, false);
3602  FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
3603  r_index_values, r_index_isnull);
3604 
3605  for (; nkey < state->nKeys; nkey++, sortKey++)
3606  {
3607  compare = ApplySortComparator(l_index_values[nkey],
3608  l_index_isnull[nkey],
3609  r_index_values[nkey],
3610  r_index_isnull[nkey],
3611  sortKey);
3612  if (compare != 0)
3613  return compare;
3614  }
3615  }
3616 
3617  return 0;
3618 }
3619 
3620 static void
3621 copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup)
3622 {
3623  HeapTuple tuple = (HeapTuple) tup;
3624  Datum original;
3625  MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
3626 
3627  /* copy the tuple into sort storage */
3628  tuple = heap_copytuple(tuple);
3629  stup->tuple = (void *) tuple;
3630  USEMEM(state, GetMemoryChunkSpace(tuple));
3631 
3632  MemoryContextSwitchTo(oldcontext);
3633 
3634  /*
3635  * set up first-column key value, and potentially abbreviate, if it's a
3636  * simple column
3637  */
3638  if (state->indexInfo->ii_KeyAttrNumbers[0] == 0)
3639  return;
3640 
3641  original = heap_getattr(tuple,
3642  state->indexInfo->ii_KeyAttrNumbers[0],
3643  state->tupDesc,
3644  &stup->isnull1);
3645 
3646  if (!state->sortKeys->abbrev_converter || stup->isnull1)
3647  {
3648  /*
3649  * Store ordinary Datum representation, or NULL value. If there is a
3650  * converter it won't expect NULL values, and cost model is not
3651  * required to account for NULL, so in that case we avoid calling
3652  * converter and just set datum1 to zeroed representation (to be
3653  * consistent, and to support cheap inequality tests for NULL
3654  * abbreviated keys).
3655  */
3656  stup->datum1 = original;
3657  }
3658  else if (!consider_abort_common(state))
3659  {
3660  /* Store abbreviated key representation */
3661  stup->datum1 = state->sortKeys->abbrev_converter(original,
3662  state->sortKeys);
3663  }
3664  else
3665  {
3666  /* Abort abbreviation */
3667  int i;
3668 
3669  stup->datum1 = original;
3670 
3671  /*
3672  * Set state to be consistent with never trying abbreviation.
3673  *
3674  * Alter datum1 representation in already-copied tuples, so as to
3675  * ensure a consistent representation (current tuple was just
3676  * handled). It does not matter if some dumped tuples are already
3677  * sorted on tape, since serialized tuples lack abbreviated keys
3678  * (TSS_BUILDRUNS state prevents control reaching here in any case).
3679  */
3680  for (i = 0; i < state->memtupcount; i++)
3681  {
3682  SortTuple *mtup = &state->memtuples[i];
3683 
3684  tuple = (HeapTuple) mtup->tuple;
3685  mtup->datum1 = heap_getattr(tuple,
3686  state->indexInfo->ii_KeyAttrNumbers[0],
3687  state->tupDesc,
3688  &mtup->isnull1);
3689  }
3690  }
3691 }
3692 
3693 static void
3694 writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup)
3695 {
3696  HeapTuple tuple = (HeapTuple) stup->tuple;
3697  unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int);
3698 
3699  /* We need to store t_self, but not other fields of HeapTupleData */
3700  LogicalTapeWrite(state->tapeset, tapenum,
3701  &tuplen, sizeof(tuplen));
3702  LogicalTapeWrite(state->tapeset, tapenum,
3703  &tuple->t_self, sizeof(ItemPointerData));
3704  LogicalTapeWrite(state->tapeset, tapenum,
3705  tuple->t_data, tuple->t_len);
3706  if (state->randomAccess) /* need trailing length word? */
3707  LogicalTapeWrite(state->tapeset, tapenum,
3708  &tuplen, sizeof(tuplen));
3709 
3710  if (!state->slabAllocatorUsed)
3711  {
3712  FREEMEM(state, GetMemoryChunkSpace(tuple));
3713  heap_freetuple(tuple);
3714  }
3715 }
3716 
3717 static void
3718 readtup_cluster(Tuplesortstate *state, SortTuple *stup,
3719  int tapenum, unsigned int tuplen)
3720 {
3721  unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int);
3722  HeapTuple tuple = (HeapTuple) readtup_alloc(state,
3723  t_len + HEAPTUPLESIZE);
3724 
3725  /* Reconstruct the HeapTupleData header */
3726  tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
3727  tuple->t_len = t_len;
3728  LogicalTapeReadExact(state->tapeset, tapenum,
3729  &tuple->t_self, sizeof(ItemPointerData));
3730  /* We don't currently bother to reconstruct t_tableOid */
3731  tuple->t_tableOid = InvalidOid;
3732  /* Read in the tuple body */
3733  LogicalTapeReadExact(state->tapeset, tapenum,
3734  tuple->t_data, tuple->t_len);
3735  if (state->randomAccess) /* need trailing length word? */
3736  LogicalTapeReadExact(state->tapeset, tapenum,
3737  &tuplen, sizeof(tuplen));
3738  stup->tuple = (void *) tuple;
3739  /* set up first-column key value, if it's a simple column */
3740  if (state->indexInfo->ii_KeyAttrNumbers[0] != 0)
3741  stup->datum1 = heap_getattr(tuple,
3742  state->indexInfo->ii_KeyAttrNumbers[0],
3743  state->tupDesc,
3744  &stup->isnull1);
3745 }
3746 
3747 /*
3748  * Routines specialized for IndexTuple case
3749  *
3750  * The btree and hash cases require separate comparison functions, but the
3751  * IndexTuple representation is the same so the copy/write/read support
3752  * functions can be shared.
3753  */
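
A caller-side sketch of the btree index-build path that drives these routines (cf. nbtsort.c); per-row values/isnull arrays and the heap tuple's TID are assumed to come from the caller's table scan, and add_to_index() is a hypothetical consumer (assumed includes: postgres.h, miscadmin.h, utils/tuplesort.h).

extern void add_to_index(IndexTuple itup);   /* hypothetical */

static void
btree_build_sort_sketch(Relation heapRel, Relation indexRel,
                        ItemPointer tid, Datum *values, bool *isnull)
{
    Tuplesortstate *ts = tuplesort_begin_index_btree(heapRel, indexRel,
                                                     true,  /* enforceUnique */
                                                     maintenance_work_mem,
                                                     false);
    IndexTuple  itup;

    /* once per heap row: */
    tuplesort_putindextuplevalues(ts, indexRel, tid, values, isnull);

    tuplesort_performsort(ts);

    while ((itup = tuplesort_getindextuple(ts, true)) != NULL)
        add_to_index(itup);

    tuplesort_end(ts);
}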
3754 
3755 static int
3756 comparetup_index_btree(const SortTuple *a, const SortTuple *b,
3757  Tuplesortstate *state)
3758 {
3759  /*
3760  * This is similar to comparetup_heap(), but expects index tuples. There
3761  * is also special handling for enforcing uniqueness, and special
3762  * treatment for equal keys at the end.
3763  */
3764  SortSupport sortKey = state->sortKeys;
3765  IndexTuple tuple1;
3766  IndexTuple tuple2;
3767  int keysz;
3768  TupleDesc tupDes;
3769  bool equal_hasnull = false;
3770  int nkey;
3771  int32 compare;
3772  Datum datum1,
3773  datum2;
3774  bool isnull1,
3775  isnull2;
3776 
3777 
3778  /* Compare the leading sort key */
3779  compare = ApplySortComparator(a->datum1, a->isnull1,
3780  b->datum1, b->isnull1,
3781  sortKey);
3782  if (compare != 0)
3783  return compare;
3784 
3785  /* Compare additional sort keys */
3786  tuple1 = (IndexTuple) a->tuple;
3787  tuple2 = (IndexTuple) b->tuple;
3788  keysz = state->nKeys;
3789  tupDes = RelationGetDescr(state->indexRel);
3790 
3791  if (sortKey->abbrev_converter)
3792  {
3793  datum1 = index_getattr(tuple1, 1, tupDes, &isnull1);
3794  datum2 = index_getattr(tuple2, 1, tupDes, &isnull2);
3795 
3796  compare = ApplySortAbbrevFullComparator(datum1, isnull1,
3797  datum2, isnull2,
3798  sortKey);
3799  if (compare != 0)
3800  return compare;
3801  }
3802 
3803  /* they are equal, so we only need to examine one null flag */
3804  if (a->isnull1)
3805  equal_hasnull = true;
3806 
3807  sortKey++;
3808  for (nkey = 2; nkey <= keysz; nkey++, sortKey++)
3809  {
3810  datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1);
3811  datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2);
3812 
3813  compare = ApplySortComparator(datum1, isnull1,
3814  datum2, isnull2,
3815  sortKey);
3816  if (compare != 0)
3817  return compare; /* done when we find unequal attributes */
3818 
3819  /* they are equal, so we only need to examine one null flag */
3820  if (isnull1)
3821  equal_hasnull = true;
3822  }
3823 
3824  /*
3825  * If btree has asked us to enforce uniqueness, complain if two equal
3826  * tuples are detected (unless there was at least one NULL field).
3827  *
3828  * It is sufficient to make the test here, because if two tuples are equal
3829  * they *must* get compared at some stage of the sort --- otherwise the
3830  * sort algorithm wouldn't have checked whether one must appear before the
3831  * other.
3832  */
3833  if (state->enforceUnique && !equal_hasnull)
3834  {
3835  Datum values[INDEX_MAX_KEYS];
3836  bool isnull[INDEX_MAX_KEYS];
3837  char *key_desc;
3838 
3839  /*
3840  * Some rather brain-dead implementations of qsort (such as the one in
3841  * QNX 4) will sometimes call the comparison routine to compare a
3842  * value to itself, but we always use our own implementation, which
3843  * does not.
3844  */
3845  Assert(tuple1 != tuple2);
3846 
3847  index_deform_tuple(tuple1, tupDes, values, isnull);
3848 
3849  key_desc = BuildIndexValueDescription(state->indexRel, values, isnull);
3850 
3851  ereport(ERROR,
3852  (errcode(ERRCODE_UNIQUE_VIOLATION),
3853  errmsg("could not create unique index \"%s\"",
3854  RelationGetRelationName(state->indexRel)),
3855  key_desc ? errdetail("Key %s is duplicated.", key_desc) :
3856  errdetail("Duplicate keys exist."),
3857  errtableconstraint(state->heapRel,
3858  RelationGetRelationName(state->indexRel))));
3859  }
3860 
3861  /*
3862  * If key values are equal, we sort on ItemPointer. This does not affect
3863  * validity of the finished index, but it may be useful to have index
3864  * scans in physical order.
3865  */
3866  {
3867  BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
3868  BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
3869 
3870  if (blk1 != blk2)
3871  return (blk1 < blk2) ? -1 : 1;
3872  }
3873  {
3874  OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
3875  OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);
3876 
3877  if (pos1 != pos2)
3878  return (pos1 < pos2) ? -1 : 1;
3879  }
3880 
3881  return 0;
3882 }
3883 
3884 static int
3885 comparetup_index_hash(const SortTuple *a, const SortTuple *b,
3886  Tuplesortstate *state)
3887 {
3888  Bucket bucket1;
3889  Bucket bucket2;
3890  IndexTuple tuple1;
3891  IndexTuple tuple2;
3892 
3893  /*
3894  * Fetch hash keys and mask off bits we don't want to sort by. We know
3895  * that the first column of the index tuple is the hash key.
3896  */
3897  Assert(!a->isnull1);
3898  bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1),
3899  state->max_buckets, state->high_mask,
3900  state->low_mask);
3901  Assert(!b->isnull1);
3902  bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1),
3903  state->max_buckets, state->high_mask,
3904  state->low_mask);
3905  if (bucket1 > bucket2)
3906  return 1;
3907  else if (bucket1 < bucket2)
3908  return -1;
3909 
3910  /*
3911  * If hash values are equal, we sort on ItemPointer. This does not affect
3912  * validity of the finished index, but it may be useful to have index
3913  * scans in physical order.
3914  */
3915  tuple1 = (IndexTuple) a->tuple;
3916  tuple2 = (IndexTuple) b->tuple;
3917 
3918  {
3919  BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
3920  BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
3921 
3922  if (blk1 != blk2)
3923  return (blk1 < blk2) ? -1 : 1;
3924  }
3925  {
3926  OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
3927  OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);
3928 
3929  if (pos1 != pos2)
3930  return (pos1 < pos2) ? -1 : 1;
3931  }
3932 
3933  return 0;
3934 }
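
For orientation, the bucket masking performed above by _hash_hashkey2bucket() is roughly as sketched below; see hashutil.c for the authoritative definition.

/*
 * Rough sketch: the hash key is first masked with high_mask; if that lands
 * past max_buckets, low_mask folds it back into an existing bucket.
 *
 *   bucket = hashkey & highmask;
 *   if (bucket > maxbucket)
 *       bucket = bucket & lowmask;
 *
 * Sorting by this value groups index tuples by their destination bucket,
 * using the max_buckets/high_mask/low_mask captured at
 * tuplesort_begin_index_hash() time.
 */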
3935 
3936 static void
3937 copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup)
3938 {
3939  IndexTuple tuple = (IndexTuple) tup;
3940  unsigned int tuplen = IndexTupleSize(tuple);
3941  IndexTuple newtuple;
3942  Datum original;
3943 
3944  /* copy the tuple into sort storage */
3945  newtuple = (IndexTuple) MemoryContextAlloc(state->tuplecontext, tuplen);
3946  memcpy(newtuple, tuple, tuplen);
3947  USEMEM(state, GetMemoryChunkSpace(newtuple));
3948  stup->tuple = (void *) newtuple;
3949  /* set up first-column key value */
3950  original = index_getattr(newtuple,
3951  1,
3952  RelationGetDescr(state->indexRel),
3953  &stup->isnull1);
3954 
3955  if (!state->sortKeys->abbrev_converter || stup->isnull1)
3956  {
3957  /*
3958  * Store ordinary Datum representation, or NULL value. If there is a
3959  * converter it won't expect NULL values, and cost model is not
3960  * required to account for NULL, so in that case we avoid calling
3961  * converter and just set datum1 to zeroed representation (to be
3962  * consistent, and to support cheap inequality tests for NULL
3963  * abbreviated keys).
3964  */
3965  stup->datum1 = original;
3966  }
3967  else if (!consider_abort_common(state))
3968  {
3969  /* Store abbreviated key representation */
3970  stup->datum1 = state->sortKeys->abbrev_converter(original,
3971  state->sortKeys);
3972  }
3973  else
3974  {
3975  /* Abort abbreviation */
3976  int i;
3977 
3978  stup->datum1 = original;
3979 
3980  /*
3981  * Set state to be consistent with never trying abbreviation.
3982  *
3983  * Alter datum1 representation in already-copied tuples, so as to
3984  * ensure a consistent representation (current tuple was just
3985  * handled). It does not matter if some dumped tuples are already
3986  * sorted on tape, since serialized tuples lack abbreviated keys
3987  * (TSS_BUILDRUNS state prevents control reaching here in any case).
3988  */
3989  for (i = 0; i < state->memtupcount; i++)
3990  {
3991  SortTuple *mtup = &state->memtuples[i];
3992 
3993  tuple = (IndexTuple) mtup->tuple;
3994  mtup->datum1 = index_getattr(tuple,
3995  1,
3996  RelationGetDescr(state->indexRel),
3997  &mtup->isnull1);
3998  }
3999  }
4000 }
4001 
4002 static void
4003 writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup)
4004 {
4005  IndexTuple tuple = (IndexTuple) stup->tuple;
4006  unsigned int tuplen;
4007 
4008  tuplen = IndexTupleSize(tuple) + sizeof(tuplen);
4009  LogicalTapeWrite(state->tapeset, tapenum,
4010  (void *) &tuplen, sizeof(tuplen));
4011  LogicalTapeWrite(state->tapeset, tapenum,
4012  (void *) tuple, IndexTupleSize(tuple));
4013  if (state->randomAccess) /* need trailing length word? */
4014  LogicalTapeWrite(state->tapeset, tapenum,
4015  (void *) &tuplen, sizeof(tuplen));
4016 
4017  if (!state->slabAllocatorUsed)
4018  {
4019  FREEMEM(state, GetMemoryChunkSpace(tuple));
4020  pfree(tuple);
4021  }
4022 }
4023 
4024 static void
4025 readtup_index(Tuplesortstate *state, SortTuple *stup,
4026  int tapenum, unsigned int len)
4027 {
4028  unsigned int tuplen = len - sizeof(unsigned int);
4029  IndexTuple tuple = (IndexTuple) readtup_alloc(state, tuplen);
4030 
4031  LogicalTapeReadExact(state->tapeset, tapenum,
4032  tuple, tuplen);
4033  if (state->randomAccess) /* need trailing length word? */
4034  LogicalTapeReadExact(state->tapeset, tapenum,
4035  &tuplen, sizeof(tuplen));
4036  stup->tuple = (void *) tuple;
4037  /* set up first-column key value */
4038  stup->datum1 = index_getattr(tuple,
4039  1,
4040  RelationGetDescr(state->indexRel),
4041  &stup->isnull1);
4042 }
4043 
4044 /*
4045  * Routines specialized for DatumTuple case
4046  */
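
A caller-side sketch of the Datum path (e.g. single-column sorts such as ordered-set aggregates); the "<" operator OID is assumed to have been looked up by the caller, and the int8 input is illustrative only (assumed includes: postgres.h, miscadmin.h, utils/tuplesort.h).

static void
datum_sort_sketch(Oid datumType, Oid lt_opr, Oid collation)
{
    Tuplesortstate *ts = tuplesort_begin_datum(datumType, lt_opr, collation,
                                               false,   /* nulls last */
                                               work_mem, false);
    Datum       val;
    bool        isnull;

    tuplesort_putdatum(ts, Int64GetDatum(42), false);   /* assumes a by-value int8 */
    tuplesort_putdatum(ts, (Datum) 0, true);            /* a NULL */

    tuplesort_performsort(ts);

    while (tuplesort_getdatum(ts, true, &val, &isnull, NULL))
    {
        /* consume val / isnull */
    }

    tuplesort_end(ts);
}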
4047 
4048 static int
4049 comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
4050 {
4051  int compare;
4052 
4053  compare = ApplySortComparator(a->datum1, a->isnull1,
4054  b->datum1, b->isnull1,
4055  state->sortKeys);
4056  if (compare != 0)
4057  return compare;
4058 
4059  /* if we have abbreviations, then "tuple" has the original value */
4060 
4061  if (state->sortKeys->abbrev_converter)
4062  compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1,
4063  PointerGetDatum(b->tuple), b->isnull1,
4064  state->sortKeys);
4065 
4066  return compare;
4067 }
4068 
4069 static void
4070 copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup)
4071 {
4072  /* Not currently needed */
4073  elog(ERROR, "copytup_datum() should not be called");
4074 }
4075 
4076 static void
4077 writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup)
4078 {
4079  void *waddr;
4080  unsigned int tuplen;
4081  unsigned int writtenlen;
4082 
4083  if (stup->isnull1)
4084  {
4085  waddr = NULL;
4086  tuplen = 0;
4087  }
4088  else if (!state->tuples)
4089  {
4090  waddr = &stup->datum1;
4091  tuplen = sizeof(Datum);
4092  }
4093  else
4094  {
4095  waddr = stup->tuple;
4096  tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, state->datumTypeLen);
4097  Assert(tuplen != 0);
4098  }
4099 
4100  writtenlen = tuplen + sizeof(unsigned int);
4101 
4102  LogicalTapeWrite(state->tapeset, tapenum,
4103  (void *) &writtenlen, sizeof(writtenlen));
4104  LogicalTapeWrite(state->tapeset, tapenum,
4105  waddr, tuplen);
4106  if (state->randomAccess) /* need trailing length word? */
4107  LogicalTapeWrite(state->tapeset, tapenum,
4108  (void *) &writtenlen, sizeof(writtenlen));
4109 
4110  if (!state->slabAllocatorUsed && stup->tuple)
4111  {
4112  FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
4113  pfree(stup->tuple);
4114  }
4115 }
4116 
4117 static void
4118 readtup_datum(Tuplesortstate *state, SortTuple *stup,
4119  int tapenum, unsigned int len)
4120 {
4121  unsigned int tuplen = len - sizeof(unsigned int);
4122 
4123  if (tuplen == 0)
4124  {
4125  /* it's NULL */
4126  stup->datum1 = (Datum) 0;
4127  stup->isnull1 = true;
4128  stup->tuple = NULL;
4129  }
4130  else if (!state->tuples)
4131  {
4132  Assert(tuplen == sizeof(Datum));
4133  LogicalTapeReadExact(state->tapeset, tapenum,
4134  &stup->datum1, tuplen);
4135  stup->isnull1 = false;
4136  stup->tuple = NULL;
4137  }
4138  else
4139  {
4140  void *raddr = readtup_alloc(state, tuplen);
4141 
4142  LogicalTapeReadExact(state->tapeset, tapenum,
4143  raddr, tuplen);
4144  stup->datum1 = PointerGetDatum(raddr);
4145  stup->isnull1 = false;
4146  stup->tuple = raddr;
4147  }
4148 
4149  if (state->randomAccess) /* need trailing length word? */
4150  LogicalTapeReadExact(state->tapeset, tapenum,
4151  &tuplen, sizeof(tuplen));
4152 }
4153 
4154 /*
4155  * Convenience routine to free a tuple previously loaded into sort memory
4156  */
4157 static void
4158 free_sort_tuple(Tuplesortstate *state, SortTuple *stup)
4159 {
4160  FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
4161  pfree(stup->tuple);
4162 }