PostgreSQL Source Code  git master
tuplesort.h
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * tuplesort.h
4  * Generalized tuple sorting routines.
5  *
6  * This module handles sorting of heap tuples, index tuples, or single
7  * Datums (and could easily support other kinds of sortable objects,
8  * if necessary). It works efficiently for both small and large amounts
9  * of data. Small amounts are sorted in-memory using qsort(). Large
10  * amounts are sorted using temporary files and a standard external sort
11  * algorithm. Parallel sorts use a variant of this external sort
12  * algorithm, and are typically only used for large amounts of data.
13  *
14  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
15  * Portions Copyright (c) 1994, Regents of the University of California
16  *
17  * src/include/utils/tuplesort.h
18  *
19  *-------------------------------------------------------------------------
20  */
21 #ifndef TUPLESORT_H
22 #define TUPLESORT_H
23 
24 #include "access/itup.h"
25 #include "executor/tuptable.h"
26 #include "storage/dsm.h"
27 #include "utils/logtape.h"
28 #include "utils/relcache.h"
29 #include "utils/sortsupport.h"
30 
31 
32 /*
33  * Tuplesortstate and Sharedsort are opaque types whose details are not
34  * known outside tuplesort.c.
35  */
36 typedef struct Tuplesortstate Tuplesortstate;
37 typedef struct Sharedsort Sharedsort;
38 
39 /*
40  * Tuplesort parallel coordination state, allocated by each participant in
41  * local memory. Participant caller initializes everything. See usage notes
42  * below.
43  */
44 typedef struct SortCoordinateData
45 {
46  /* Worker process? If not, must be leader. */
47  bool isWorker;
48 
49  /*
50  * Leader-process-passed number of participants known launched (workers
51  * set this to -1). Includes state within leader needed for it to
52  * participate as a worker, if any.
53  */
55 
56  /* Private opaque state (points to shared memory) */
59 
61 
62 /*
63  * Data structures for reporting sort statistics. Note that
64  * TuplesortInstrumentation can't contain any pointers because we
65  * sometimes put it in shared memory.
66  *
67  * The parallel-sort infrastructure relies on having a zero TuplesortMethod
68  * to indicate that a worker never did anything, so we assign zero to
69  * SORT_TYPE_STILL_IN_PROGRESS. The other values of this enum can be
70  * OR'ed together to represent a situation where different workers used
71  * different methods, so we need a separate bit for each one. Keep the
72  * NUM_TUPLESORTMETHODS constant in sync with the number of bits!
73  */
74 typedef enum
75 {
82 
83 #define NUM_TUPLESORTMETHODS 4
84 
85 typedef enum
86 {
90 
91 /* Bitwise option flags for tuple sorts */
/*
 * Option bits for tuple sorts.  Each option occupies its own bit so that
 * several can be combined with bitwise OR.
 */

/* no options requested */
#define TUPLESORT_NONE					0x0000

/* bit 0: caller needs non-sequential access to the sort result */
#define TUPLESORT_RANDOMACCESS			0x0001

/* bit 1: the tuplesort is able to support bounded sorts */
#define TUPLESORT_ALLOWBOUNDED			0x0002
99 
101 {
102  TuplesortMethod sortMethod; /* sort algorithm used */
103  TuplesortSpaceType spaceType; /* type of space spaceUsed represents */
104  int64 spaceUsed; /* space consumption, in kB */
106 
107 /*
108  * The objects we actually sort are SortTuple structs. These contain
109  * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple),
110  * which is a separate palloc chunk --- we assume it is just one chunk and
111  * can be freed by a simple pfree() (except during merge, when we use a
112  * simple slab allocator). SortTuples also contain the tuple's first key
113  * column in Datum/nullflag format, and a source/input tape number that
114  * tracks which tape each heap element/slot belongs to during merging.
115  *
116  * Storing the first key column lets us save heap_getattr or index_getattr
117  * calls during tuple comparisons. We could extract and save all the key
118  * columns not just the first, but this would increase code complexity and
119  * overhead, and wouldn't actually save any comparison cycles in the common
120  * case where the first key determines the comparison result. Note that
121  * for a pass-by-reference datatype, datum1 points into the "tuple" storage.
122  *
123  * There is one special case: when the sort support infrastructure provides an
124  * "abbreviated key" representation, where the key is (typically) a pass by
125  * value proxy for a pass by reference type. In this case, the abbreviated key
126  * is stored in datum1 in place of the actual first key column.
127  *
128  * When sorting single Datums, the data value is represented directly by
129  * datum1/isnull1 for pass by value types (or null values). If the datatype is
130  * pass-by-reference and isnull1 is false, then "tuple" points to a separately
131  * palloc'd data value, otherwise "tuple" is NULL. The value of datum1 is then
132  * either the same pointer as "tuple", or is an abbreviated key value as
133  * described above. Accordingly, "tuple" is always used in preference to
134  * datum1 as the authoritative value for pass-by-reference cases.
135  */
136 typedef struct
137 {
138  void *tuple; /* the tuple itself */
139  Datum datum1; /* value of first key column */
140  bool isnull1; /* is first key column NULL? */
141  int srctape; /* source tape number */
142 } SortTuple;
143 
144 typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b,
146 
147 /*
148  * The public part of a Tuple sort operation state. This data structure
149  * contains the definition of sort-variant-specific interface methods and
150  * the part of Tuple sort operation state required by their implementations.
151  */
152 typedef struct
153 {
154  /*
155  * These function pointers decouple the routines that must know what kind
156  * of tuple we are sorting from the routines that don't need to know it.
157  * They are set up by the tuplesort_begin_xxx routines.
158  *
159  * Function to compare two tuples; result is per qsort() convention, ie:
160  * <0, 0, >0 according as a<b, a=b, a>b. The API must match
161  * qsort_arg_comparator.
162  */
164 
165  /*
166  * Alter datum1 representation in the SortTuple's array back from the
167  * abbreviated key to the first column value.
168  */
169  void (*removeabbrev) (Tuplesortstate *state, SortTuple *stups,
170  int count);
171 
172  /*
173  * Function to write a stored tuple onto tape. The representation of the
174  * tuple on tape need not be the same as it is in memory.
175  */
176  void (*writetup) (Tuplesortstate *state, LogicalTape *tape,
177  SortTuple *stup);
178 
179  /*
180  * Function to read a stored tuple from tape back into memory. 'len' is
181  * the already-read length of the stored tuple. The tuple is allocated
182  * from the slab memory arena, or is palloc'd, see
183  * tuplesort_readtup_alloc().
184  */
185  void (*readtup) (Tuplesortstate *state, SortTuple *stup,
186  LogicalTape *tape, unsigned int len);
187 
188  /*
189  * Function to do some specific release of resources for the sort variant.
190  * In particular, this function should free everything stored in the "arg"
191  * field, which wouldn't be cleared on reset of the Tuple sort memory
192  * contexts. This can be NULL if nothing specific needs to be done.
193  */
195 
196  /*
197  * The subsequent fields are used in the implementations of the functions
198  * above.
199  */
200  MemoryContext maincontext; /* memory context for tuple sort metadata that
201  * persists across multiple batches */
202  MemoryContext sortcontext; /* memory context holding most sort data */
203  MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */
204 
205  /*
206  * Whether SortTuple's datum1 and isnull1 members are maintained by the
207  * above routines. If not, some sort specializations are disabled.
208  */
210 
211  /*
212  * The sortKeys variable is used by every case other than the hash index
213  * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the
214  * MinimalTuple and CLUSTER routines, though.
215  */
216  int nKeys; /* number of columns in sort key */
217  SortSupport sortKeys; /* array of length nKeys */
218 
219  /*
220  * This variable is shared by the single-key MinimalTuple case and the
221  * Datum case (which both use qsort_ssup()). Otherwise, it's NULL. The
222  * presence of a value in this field is also checked by various sort
223  * specialization functions as an optimization when comparing the leading
224  * key in a tiebreak situation to determine if there are any subsequent
225  * keys to sort on.
226  */
228 
229  int sortopt; /* Bitmask of flags used to setup sort */
230 
231  bool tuples; /* Can SortTuple.tuple ever be set? */
232 
233  void *arg; /* Specific information for the sort variant */
235 
/*
 * Sort parallel code from state for sort__start probes.
 *
 * Evaluates to:
 *	0 - coordinate is NULL or has no sharedsort pointer
 *	1 - coordinate has a sharedsort pointer and isWorker is set
 *	2 - coordinate has a sharedsort pointer and isWorker is not set
 *
 * Every use of the argument is parenthesized so that an arbitrary pointer
 * expression can be passed.  Note that the argument is still evaluated more
 * than once, so it must not have side effects.
 */
#define PARALLEL_SORT(coordinate) \
	(((coordinate) == NULL || (coordinate)->sharedsort == NULL) ? 0 : \
	 ((coordinate)->isWorker ? 1 : 2))
240 
/*
 * Reinterpret a sort state pointer as a pointer to TuplesortPublic.
 *
 * The argument is parenthesized so that the cast applies to the whole
 * expression passed in, not just to its first operand.
 */
#define TuplesortstateGetPublic(state) ((TuplesortPublic *) (state))
242 
/*
 * Read 'len' bytes from 'tape' into 'ptr' with LogicalTapeRead(), and report
 * "unexpected end of data" through elog(ERROR) if a different number of
 * bytes comes back.
 *
 * Caution: 'len' appears twice in the expansion, so it is evaluated twice.
 */
#define LogicalTapeReadExact(tape, ptr, len) \
	do { \
		if ((size_t) (len) != LogicalTapeRead(tape, ptr, len)) \
		{ \
			elog(ERROR, "unexpected end of data"); \
		} \
	} while (0)
249 
250 /*
251  * We provide multiple interfaces to what is essentially the same code,
252  * since different callers have different data to be sorted and want to
253  * specify the sort key information differently. There are two APIs for
254  * sorting HeapTuples and two more for sorting IndexTuples. Yet another
255  * API supports sorting bare Datums.
256  *
257  * Serial sort callers should pass NULL for their coordinate argument.
258  *
259  * The "heap" API actually stores/sorts MinimalTuples, which means it doesn't
260  * preserve the system columns (tuple identity and transaction visibility
261  * info). The sort keys are specified by column numbers within the tuples
262  * and sort operator OIDs. We save some cycles by passing and returning the
263  * tuples in TupleTableSlots, rather than forming actual HeapTuples (which'd
264  * have to be converted to MinimalTuples). This API works well for sorts
265  * executed as parts of plan trees.
266  *
267  * The "cluster" API stores/sorts full HeapTuples including all visibility
268  * info. The sort keys are specified by reference to a btree index that is
269  * defined on the relation to be sorted. Note that putheaptuple/getheaptuple
270  * go with this API, not the "begin_heap" one!
271  *
272  * The "index_btree" API stores/sorts IndexTuples (preserving all their
273  * header fields). The sort keys are specified by a btree index definition.
274  *
275  * The "index_hash" API is similar to index_btree, but the tuples are
276  * actually sorted by their hash codes not the raw data.
277  *
278  * Parallel sort callers are required to coordinate multiple tuplesort states
279  * in a leader process and one or more worker processes. The leader process
280  * must launch workers, and have each perform an independent "partial"
281  * tuplesort, typically fed by the parallel heap interface. The leader later
282  * produces the final output (internally, it merges runs output by workers).
283  *
284  * Callers must do the following to perform a sort in parallel using multiple
285  * worker processes:
286  *
287  * 1. Request tuplesort-private shared memory for n workers. Use
288  * tuplesort_estimate_shared() to get the required size.
289  * 2. Have leader process initialize allocated shared memory using
290  * tuplesort_initialize_shared(). Launch workers.
291  * 3. Initialize a coordinate argument within both the leader process, and
292  * for each worker process. This has a pointer to the shared
293  * tuplesort-private structure, as well as some caller-initialized fields.
294  * Leader's coordinate argument reliably indicates number of workers
295  * launched (this is unused by workers).
296  * 4. Begin a tuplesort using some appropriate tuplesort_begin* routine,
297  * (passing the coordinate argument) within each worker. The workMem
298  * arguments need not be identical. All other arguments should match
299  * exactly, though.
300  * 5. tuplesort_attach_shared() should be called by all workers. Feed tuples
301  * to each worker, and call tuplesort_performsort() within each when input
302  * is exhausted.
303  * 6. Call tuplesort_end() in each worker process. Worker processes can shut
304  * down once tuplesort_end() returns.
305  * 7. Begin a tuplesort in the leader using the same tuplesort_begin*
306  * routine, passing a leader-appropriate coordinate argument (this can
307  * happen as early as during step 3, actually, since we only need to know
308  * the number of workers successfully launched). The leader must now wait
309  * for workers to finish. Caller must use own mechanism for ensuring that
310  * next step isn't reached until all workers have called and returned from
311  * tuplesort_performsort(). (Note that it's okay if workers have already
312  * also called tuplesort_end() by then.)
313  * 8. Call tuplesort_performsort() in leader. Consume output using the
314  * appropriate tuplesort_get* routine. Leader can skip this step if
315  * tuplesort turns out to be unnecessary.
316  * 9. Call tuplesort_end() in leader.
317  *
318  * This division of labor assumes nothing about how input tuples are produced,
319  * but does require that caller combine the state of multiple tuplesorts for
320  * any purpose other than producing the final output. For example, callers
321  * must consider that tuplesort_get_stats() reports on only one worker's role
322  * in a sort (or the leader's role), and not statistics for the sort as a
323  * whole.
324  *
325  * Note that callers may use the leader process to sort runs as if it was an
326  * independent worker process (prior to the process performing a leader sort
327  * to produce the final sorted output). Doing so only requires a second
328  * "partial" tuplesort within the leader process, initialized like that of a
329  * worker process. The steps above don't touch on this directly. The only
330  * difference is that the tuplesort_attach_shared() call is never needed within
331  * leader process, because the backend as a whole holds the shared fileset
332  * reference. A worker Tuplesortstate in leader is expected to do exactly the
333  * same amount of total initial processing work as a worker process
334  * Tuplesortstate, since the leader process has nothing else to do before
335  * workers finish.
336  *
337  * Note that only a very small amount of memory will be allocated prior to
338  * the leader state first consuming input, and that workers will free the
339  * vast majority of their memory upon returning from tuplesort_performsort().
340  * Callers can rely on this to arrange for memory to be used in a way that
341  * respects a workMem-style budget across an entire parallel sort operation.
342  *
343  * Callers are responsible for parallel safety in general. However, they
344  * can at least rely on there being no parallel safety hazards within
345  * tuplesort, because tuplesort thinks of the sort as several independent
346  * sorts whose results are combined. Since, in general, the behavior of
347  * sort operators is immutable, caller need only worry about the parallel
348  * safety of whatever the process is through which input tuples are
349  * generated (typically, caller uses a parallel heap scan).
350  */
351 
352 
353 extern Tuplesortstate *tuplesort_begin_common(int workMem,
354  SortCoordinate coordinate,
355  int sortopt);
356 extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound);
359  SortTuple *tuple, bool useAbbrev);
361 extern bool tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
362  SortTuple *stup);
363 extern bool tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples,
364  bool forward);
365 extern void tuplesort_end(Tuplesortstate *state);
366 extern void tuplesort_reset(Tuplesortstate *state);
367 
369  TuplesortInstrumentation *stats);
370 extern const char *tuplesort_method_name(TuplesortMethod m);
371 extern const char *tuplesort_space_type_name(TuplesortSpaceType t);
372 
373 extern int tuplesort_merge_order(int64 allowedMem);
374 
375 extern Size tuplesort_estimate_shared(int nWorkers);
376 extern void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers,
377  dsm_segment *seg);
378 extern void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg);
379 
380 /*
381  * These routines may only be called if TUPLESORT_RANDOMACCESS was specified
382  * during tuplesort_begin_*. Additionally, backward scans in gettuple/getdatum
383  * also require TUPLESORT_RANDOMACCESS. Note that parallel sorts do not
384  * support random access.
385  */
389 
390 extern void *tuplesort_readtup_alloc(Tuplesortstate *state, Size tuplen);
391 
392 
393 /* tuplesortvariants.c */
394 
396  int nkeys, AttrNumber *attNums,
397  Oid *sortOperators, Oid *sortCollations,
398  bool *nullsFirstFlags,
399  int workMem, SortCoordinate coordinate,
400  int sortopt);
402  Relation indexRel, int workMem,
403  SortCoordinate coordinate,
404  int sortopt);
406  Relation indexRel,
407  bool enforceUnique,
408  bool uniqueNullsNotDistinct,
409  int workMem, SortCoordinate coordinate,
410  int sortopt);
412  Relation indexRel,
413  uint32 high_mask,
414  uint32 low_mask,
415  uint32 max_buckets,
416  int workMem, SortCoordinate coordinate,
417  int sortopt);
419  Relation indexRel,
420  int workMem, SortCoordinate coordinate,
421  int sortopt);
422 extern Tuplesortstate *tuplesort_begin_datum(Oid datumType,
423  Oid sortOperator, Oid sortCollation,
424  bool nullsFirstFlag,
425  int workMem, SortCoordinate coordinate,
426  int sortopt);
427 
429  TupleTableSlot *slot);
432  Relation rel, ItemPointer self,
433  Datum *values, bool *isnull);
435  bool isNull);
436 
437 extern bool tuplesort_gettupleslot(Tuplesortstate *state, bool forward,
438  bool copy, TupleTableSlot *slot, Datum *abbrev);
441 extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward, bool copy,
442  Datum *val, bool *isNull, Datum *abbrev);
443 
444 
445 #endif /* TUPLESORT_H */
int16 AttrNumber
Definition: attnum.h:21
static Datum values[MAXATTR]
Definition: bootstrap.c:156
unsigned int uint32
Definition: c.h:442
size_t Size
Definition: c.h:541
long val
Definition: informix.c:664
int b
Definition: isn.c:70
int a
Definition: isn.c:69
const void size_t len
uintptr_t Datum
Definition: postgres.h:412
unsigned int Oid
Definition: postgres_ext.h:31
static void freestate(struct nfa *nfa, struct state *s)
Definition: regc_nfa.c:246
Sharedsort * sharedsort
Definition: tuplesort.h:57
bool isnull1
Definition: tuplesort.h:140
void * tuple
Definition: tuplesort.h:138
int srctape
Definition: tuplesort.h:141
Datum datum1
Definition: tuplesort.h:139
TuplesortMethod sortMethod
Definition: tuplesort.h:102
TuplesortSpaceType spaceType
Definition: tuplesort.h:103
SortSupport onlyKey
Definition: tuplesort.h:227
MemoryContext maincontext
Definition: tuplesort.h:200
MemoryContext tuplecontext
Definition: tuplesort.h:203
MemoryContext sortcontext
Definition: tuplesort.h:202
SortTupleComparator comparetup
Definition: tuplesort.h:163
SortSupport sortKeys
Definition: tuplesort.h:217
Definition: regguts.h:318
IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward)
void tuplesort_rescan(Tuplesortstate *state)
Definition: tuplesort.c:2440
void tuplesort_performsort(Tuplesortstate *state)
Definition: tuplesort.c:1385
struct SortCoordinateData SortCoordinateData
int tuplesort_merge_order(int64 allowedMem)
Definition: tuplesort.c:1804
void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg)
Definition: tuplesort.c:2976
Tuplesortstate * tuplesort_begin_common(int workMem, SortCoordinate coordinate, int sortopt)
Definition: tuplesort.c:646
HeapTuple tuplesort_getheaptuple(Tuplesortstate *state, bool forward)
void tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull)
Tuplesortstate * tuplesort_begin_index_hash(Relation heapRel, Relation indexRel, uint32 high_mask, uint32 low_mask, uint32 max_buckets, int workMem, SortCoordinate coordinate, int sortopt)
void tuplesort_reset(Tuplesortstate *state)
Definition: tuplesort.c:1040
bool tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward)
Definition: tuplesort.c:1736
void tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot)
Tuplesortstate * tuplesort_begin_index_gist(Relation heapRel, Relation indexRel, int workMem, SortCoordinate coordinate, int sortopt)
bool tuplesort_used_bound(Tuplesortstate *state)
Definition: tuplesort.c:892
Tuplesortstate * tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, int workMem, SortCoordinate coordinate, int sortopt)
const char * tuplesort_space_type_name(TuplesortSpaceType t)
Definition: tuplesort.c:2604
Tuplesortstate * tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, bool nullsFirstFlag, int workMem, SortCoordinate coordinate, int sortopt)
Size tuplesort_estimate_shared(int nWorkers)
Definition: tuplesort.c:2955
struct SortCoordinateData * SortCoordinate
Definition: tuplesort.h:60
void tuplesort_get_stats(Tuplesortstate *state, TuplesortInstrumentation *stats)
Definition: tuplesort.c:2537
bool tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy, TupleTableSlot *slot, Datum *abbrev)
void tuplesort_end(Tuplesortstate *state)
Definition: tuplesort.c:972
void tuplesort_markpos(Tuplesortstate *state)
Definition: tuplesort.c:2473
void tuplesort_puttuple_common(Tuplesortstate *state, SortTuple *tuple, bool useAbbrev)
Definition: tuplesort.c:1190
bool tuplesort_gettuple_common(Tuplesortstate *state, bool forward, SortTuple *stup)
Definition: tuplesort.c:1496
Tuplesortstate * tuplesort_begin_cluster(TupleDesc tupDesc, Relation indexRel, int workMem, SortCoordinate coordinate, int sortopt)
void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg)
Definition: tuplesort.c:2999
struct TuplesortInstrumentation TuplesortInstrumentation
void tuplesort_restorepos(Tuplesortstate *state)
Definition: tuplesort.c:2504
TuplesortSpaceType
Definition: tuplesort.h:86
@ SORT_SPACE_TYPE_DISK
Definition: tuplesort.h:87
@ SORT_SPACE_TYPE_MEMORY
Definition: tuplesort.h:88
int(* SortTupleComparator)(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
Definition: tuplesort.h:144
TuplesortMethod
Definition: tuplesort.h:75
@ SORT_TYPE_EXTERNAL_SORT
Definition: tuplesort.h:79
@ SORT_TYPE_TOP_N_HEAPSORT
Definition: tuplesort.h:77
@ SORT_TYPE_QUICKSORT
Definition: tuplesort.h:78
@ SORT_TYPE_STILL_IN_PROGRESS
Definition: tuplesort.h:76
@ SORT_TYPE_EXTERNAL_MERGE
Definition: tuplesort.h:80
void * tuplesort_readtup_alloc(Tuplesortstate *state, Size tuplen)
Definition: tuplesort.c:2921
void tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
bool tuplesort_getdatum(Tuplesortstate *state, bool forward, bool copy, Datum *val, bool *isNull, Datum *abbrev)
void tuplesort_set_bound(Tuplesortstate *state, int64 bound)
Definition: tuplesort.c:844
void tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, ItemPointer self, Datum *values, bool *isnull)
const char * tuplesort_method_name(TuplesortMethod m)
Definition: tuplesort.c:2581
Tuplesortstate * tuplesort_begin_heap(TupleDesc tupDesc, int nkeys, AttrNumber *attNums, Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags, int workMem, SortCoordinate coordinate, int sortopt)