PostgreSQL Source Code git master
Loading...
Searching...
No Matches
tuplesort.h
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * tuplesort.h
4 * Generalized tuple sorting routines.
5 *
6 * This module handles sorting of heap tuples, index tuples, or single
7 * Datums (and could easily support other kinds of sortable objects,
8 * if necessary). It works efficiently for both small and large amounts
9 * of data. Small amounts are sorted in-memory using qsort(). Large
10 * amounts are sorted using temporary files and a standard external sort
11 * algorithm. Parallel sorts use a variant of this external sort
12 * algorithm, and are typically only used for large amounts of data.
13 *
14 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
15 * Portions Copyright (c) 1994, Regents of the University of California
16 *
17 * src/include/utils/tuplesort.h
18 *
19 *-------------------------------------------------------------------------
20 */
21#ifndef TUPLESORT_H
22#define TUPLESORT_H
23
24#include "access/itup.h"
26#include "executor/tuptable.h"
27#include "storage/dsm.h"
28#include "utils/logtape.h"
29#include "utils/relcache.h"
30#include "utils/sortsupport.h"
31
32/* We don't want this file to depend on AM-specific header files */
33typedef struct BrinTuple BrinTuple;
34typedef struct GinTuple GinTuple;
35
36/*
37 * Tuplesortstate and Sharedsort are opaque types whose details are not
38 * known outside tuplesort.c.
39 */
41typedef struct Sharedsort Sharedsort;
42
43/*
44 * Tuplesort parallel coordination state, allocated by each participant in
45 * local memory. Participant caller initializes everything. See usage notes
46 * below.
47 */
48typedef struct SortCoordinateData
49{
50 /* Worker process? If not, must be leader. */
52
53 /*
54 * Leader-process-passed number of participants known launched (workers
55 * set this to -1). Includes state within leader needed for it to
56 * participate as a worker, if any.
57 */
59
60 /* Private opaque state (points to shared memory) */
63
65
66/* Bitwise option flags for tuple sorts */
67#define TUPLESORT_NONE 0
68
69/* specifies whether non-sequential access to the sort result is required */
70#define TUPLESORT_RANDOMACCESS (1 << 0)
71
72/* specifies if the tuplesort is able to support bounded sorts */
73#define TUPLESORT_ALLOWBOUNDED (1 << 1)
74
75/*
76 * For bounded sort, tuples get pfree'd when they fall outside of the bound.
77 * When bounded sorts are not required, we can use a bump context for tuple
78 * allocation as there's no risk that pfree will ever be called for a tuple.
79 * Define a macro to make it easier for code to figure out if we're using a
80 * bump allocator.
81 */
82#define TupleSortUseBumpTupleCxt(opt) (((opt) & TUPLESORT_ALLOWBOUNDED) == 0)
83
84/*
85 * The objects we actually sort are SortTuple structs. These contain
86 * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple),
87 * which is a separate palloc chunk --- we assume it is just one chunk and
88 * can be freed by a simple pfree() (except during merge, where we use a
89 * simple slab allocator, and during a non-bounded sort where we use a bump
90 * allocator). SortTuples also contain the tuple's first key column in
91 * Datum/nullflag format, and a source/input tape number that tracks which
92 * tape each heap element/slot belongs to during merging.
93 *
94 * Storing the first key column lets us save heap_getattr or index_getattr
95 * calls during tuple comparisons. We could extract and save all the key
96 * columns not just the first, but this would increase code complexity and
97 * overhead, and wouldn't actually save any comparison cycles in the common
98 * case where the first key determines the comparison result. Note that
99 * for a pass-by-reference datatype, datum1 points into the "tuple" storage.
100 *
101 * There is one special case: when the sort support infrastructure provides an
102 * "abbreviated key" representation, where the key is (typically) a pass by
103 * value proxy for a pass by reference type. In this case, the abbreviated key
104 * is stored in datum1 in place of the actual first key column.
105 *
106 * When sorting single Datums, the data value is represented directly by
107 * datum1/isnull1 for pass by value types (or null values). If the datatype is
108 * pass-by-reference and isnull1 is false, then "tuple" points to a separately
109 * palloc'd data value, otherwise "tuple" is NULL. The value of datum1 is then
110 * either the same pointer as "tuple", or is an abbreviated key value as
111 * described above. Accordingly, "tuple" is always used in preference to
112 * datum1 as the authoritative value for pass-by-reference cases.
113 */
114typedef struct
115{
116 void *tuple; /* the tuple itself (NULL for by-value or null single Datums) */
117 Datum datum1; /* value of first key column, or its abbreviated key */
118 bool isnull1; /* is first key column NULL? */
119 uint8 curbyte; /* chunk of datum1 for current radix sort pass */
120 int srctape; /* source tape number, tracked during merging */
121} SortTuple;
122
123typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b,
125
126/*
127 * The public part of a Tuple sort operation state. This data structure
128 * contains the definition of sort-variant-specific interface methods and
129 * the part of Tuple sort operation state required by their implementations.
130 */
131typedef struct
132{
133 /*
134 * These function pointers decouple the routines that must know what kind
135 * of tuple we are sorting from the routines that don't need to know it.
136 * They are set up by the tuplesort_begin_xxx routines.
137 *
138 * Function to compare two tuples; result is per qsort() convention, ie:
139 * <0, 0, >0 according as a<b, a=b, a>b. The API must match
140 * qsort_arg_comparator.
141 */
143
144 /*
145 * Fall back to the full tuple for comparison, but only compare the first
146 * sortkey if it was abbreviated. Otherwise, only compare second and later
147 * sortkeys.
148 */
150
151 /*
152 * Alter datum1 representation in the SortTuple's array back from the
153 * abbreviated key to the first column value.
154 */
156 int count);
157
158 /*
159 * Function to write a stored tuple onto tape. The representation of the
160 * tuple on tape need not be the same as it is in memory.
161 */
163 SortTuple *stup);
164
165 /*
166 * Function to read a stored tuple from tape back into memory. 'len' is
167 * the already-read length of the stored tuple. The tuple is allocated
168 * from the slab memory arena, or is palloc'd, see
169 * tuplesort_readtup_alloc().
170 */
172 LogicalTape *tape, unsigned int len);
173
174 /*
175 * Function to do some specific release of resources for the sort variant.
176 * In particular, this function should free everything stored in the "arg"
177 * field, which wouldn't be cleared on reset of the Tuple sort memory
178 * contexts. This can be NULL if nothing specific needs to be done.
179 */
181
182 /*
183 * The subsequent fields are used in the implementations of the functions
184 * above.
185 */
186 MemoryContext maincontext; /* memory context for tuple sort metadata that
187 * persists across multiple batches */
188 MemoryContext sortcontext; /* memory context holding most sort data */
189 MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */
190
191 /*
192 * Whether SortTuple's datum1 and isnull1 members are maintained by the
193 * above routines. If not, some sort specializations are disabled.
194 */
196
197 /*
198 * The sortKeys variable is used by every case other than the hash index
199 * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the
200 * MinimalTuple and CLUSTER routines, though.
201 */
202 int nKeys; /* number of columns in sort key */
203 SortSupport sortKeys; /* array of length nKeys */
204
205 /*
206 * This variable is shared by the single-key MinimalTuple case and the
207 * Datum case (which both use qsort_ssup()). Otherwise, it's NULL. The
208 * presence of a value in this field is also checked by various sort
209 * specialization functions as an optimization when comparing the leading
210 * key in a tiebreak situation to determine if there are any subsequent
211 * keys to sort on.
212 */
214
215 int sortopt; /* Bitmask of flags used to setup sort */
216
217 bool tuples; /* Can SortTuple.tuple ever be set? */
218
219 void *arg; /* Specific information for the sort variant */
221
/*
 * Encode the parallel role of a sort as an integer for the sort__start
 * probes:
 *		0 - serial sort (coordinate is NULL, or it has no shared state)
 *		1 - parallel worker (coordinate->isWorker is set)
 *		2 - parallel leader
 *
 * Every use of the argument is parenthesized so that any pointer-valued
 * expression may be passed.  Beware that the argument is evaluated more
 * than once.
 */
#define PARALLEL_SORT(coordinate) ((coordinate) == NULL || \
								   (coordinate)->sharedsort == NULL ? 0 : \
								   (coordinate)->isWorker ? 1 : 2)
226
/*
 * Reinterpret a Tuplesortstate pointer as a pointer to TuplesortPublic.
 * The argument is parenthesized so that the cast applies to the whole
 * argument expression rather than only to its first operand.
 */
#define TuplesortstateGetPublic(state) ((TuplesortPublic *) (state))
228
/*
 * Read 'len' bytes from 'tape' into 'ptr', reporting ERROR "unexpected end
 * of data" if LogicalTapeRead() does not return exactly 'len'.
 *
 * 'len' is captured in a local variable, so the argument expression is
 * evaluated exactly once even if it has side effects.
 */
#define LogicalTapeReadExact(tape, ptr, len) \
	do { \
		size_t		ltre_len_ = (size_t) (len); \
		if (LogicalTapeRead(tape, ptr, ltre_len_) != ltre_len_) \
			elog(ERROR, "unexpected end of data"); \
	} while(0)
235
236/*
237 * We provide multiple interfaces to what is essentially the same code,
238 * since different callers have different data to be sorted and want to
239 * specify the sort key information differently. There are two APIs for
240 * sorting HeapTuples and two more for sorting IndexTuples. Yet another
241 * API supports sorting bare Datums.
242 *
243 * Serial sort callers should pass NULL for their coordinate argument.
244 *
245 * The "heap" API actually stores/sorts MinimalTuples, which means it doesn't
246 * preserve the system columns (tuple identity and transaction visibility
247 * info). The sort keys are specified by column numbers within the tuples
248 * and sort operator OIDs. We save some cycles by passing and returning the
249 * tuples in TupleTableSlots, rather than forming actual HeapTuples (which'd
250 * have to be converted to MinimalTuples). This API works well for sorts
251 * executed as parts of plan trees.
252 *
253 * The "cluster" API stores/sorts full HeapTuples including all visibility
254 * info. The sort keys are specified by reference to a btree index that is
255 * defined on the relation to be sorted. Note that putheaptuple/getheaptuple
256 * go with this API, not the "begin_heap" one!
257 *
258 * The "index_btree" API stores/sorts IndexTuples (preserving all their
259 * header fields). The sort keys are specified by a btree index definition.
260 *
261 * The "index_hash" API is similar to index_btree, but the tuples are
262 * actually sorted by their hash codes not the raw data.
263 *
264 * The "index_brin" API is similar to index_btree, but the tuples are
265 * BrinTuple and are sorted by their block number not the raw data.
266 *
267 * Parallel sort callers are required to coordinate multiple tuplesort states
268 * in a leader process and one or more worker processes. The leader process
269 * must launch workers, and have each perform an independent "partial"
270 * tuplesort, typically fed by the parallel heap interface. The leader later
271 * produces the final output (internally, it merges runs output by workers).
272 *
273 * Callers must do the following to perform a sort in parallel using multiple
274 * worker processes:
275 *
276 * 1. Request tuplesort-private shared memory for n workers. Use
277 * tuplesort_estimate_shared() to get the required size.
278 * 2. Have leader process initialize allocated shared memory using
279 * tuplesort_initialize_shared(). Launch workers.
280 * 3. Initialize a coordinate argument within both the leader process, and
281 * for each worker process. This has a pointer to the shared
282 * tuplesort-private structure, as well as some caller-initialized fields.
283 * Leader's coordinate argument reliably indicates number of workers
284 * launched (this is unused by workers).
285 * 4. Begin a tuplesort using some appropriate tuplesort_begin* routine,
286 * (passing the coordinate argument) within each worker. The workMem
287 * arguments need not be identical. All other arguments should match
288 * exactly, though.
289 * 5. tuplesort_attach_shared() should be called by all workers. Feed tuples
290 * to each worker, and call tuplesort_performsort() within each when input
291 * is exhausted.
292 * 6. Call tuplesort_end() in each worker process. Worker processes can shut
293 * down once tuplesort_end() returns.
294 * 7. Begin a tuplesort in the leader using the same tuplesort_begin*
295 * routine, passing a leader-appropriate coordinate argument (this can
296 * happen as early as during step 3, actually, since we only need to know
297 * the number of workers successfully launched). The leader must now wait
298 * for workers to finish. Caller must use own mechanism for ensuring that
299 * next step isn't reached until all workers have called and returned from
300 * tuplesort_performsort(). (Note that it's okay if workers have already
301 * also called tuplesort_end() by then.)
302 * 8. Call tuplesort_performsort() in leader. Consume output using the
303 * appropriate tuplesort_get* routine. Leader can skip this step if
304 * tuplesort turns out to be unnecessary.
305 * 9. Call tuplesort_end() in leader.
306 *
307 * This division of labor assumes nothing about how input tuples are produced,
308 * but does require that caller combine the state of multiple tuplesorts for
309 * any purpose other than producing the final output. For example, callers
310 * must consider that tuplesort_get_stats() reports on only one worker's role
311 * in a sort (or the leader's role), and not statistics for the sort as a
312 * whole.
313 *
314 * Note that callers may use the leader process to sort runs as if it was an
315 * independent worker process (prior to the process performing a leader sort
316 * to produce the final sorted output). Doing so only requires a second
317 * "partial" tuplesort within the leader process, initialized like that of a
318 * worker process. The steps above don't touch on this directly. The only
319 * difference is that the tuplesort_attach_shared() call is never needed within
320 * leader process, because the backend as a whole holds the shared fileset
321 * reference. A worker Tuplesortstate in leader is expected to do exactly the
322 * same amount of total initial processing work as a worker process
323 * Tuplesortstate, since the leader process has nothing else to do before
324 * workers finish.
325 *
326 * Note that only a very small amount of memory will be allocated prior to
327 * the leader state first consuming input, and that workers will free the
328 * vast majority of their memory upon returning from tuplesort_performsort().
329 * Callers can rely on this to arrange for memory to be used in a way that
330 * respects a workMem-style budget across an entire parallel sort operation.
331 *
332 * Callers are responsible for parallel safety in general. However, they
333 * can at least rely on there being no parallel safety hazards within
334 * tuplesort, because tuplesort thinks of the sort as several independent
335 * sorts whose results are combined. Since, in general, the behavior of
336 * sort operators is immutable, caller need only worry about the parallel
337 * safety of whatever the process is through which input tuples are
338 * generated (typically, caller uses a parallel heap scan).
339 */
340
341
344 int sortopt);
345extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound);
348 SortTuple *tuple, bool useAbbrev,
349 Size tuplen);
352 SortTuple *stup);
353extern bool tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples,
354 bool forward);
357
360extern const char *tuplesort_method_name(TuplesortMethod m);
362
363extern int tuplesort_merge_order(int64 allowedMem);
364
366extern void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers,
367 dsm_segment *seg);
368extern void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg);
369
370/*
371 * These routines may only be called if TUPLESORT_RANDOMACCESS was specified
372 * during tuplesort_begin_*. Additionally, backward scans in gettuple/getdatum
373 * also require TUPLESORT_RANDOMACCESS. Note that parallel sorts do not
374 * support random access.
375 */
379
380extern void *tuplesort_readtup_alloc(Tuplesortstate *state, Size tuplen);
381
382
383/* tuplesortvariants.c */
384
386 int nkeys, AttrNumber *attNums,
387 Oid *sortOperators, Oid *sortCollations,
388 bool *nullsFirstFlags,
390 int sortopt);
392 Relation indexRel, int workMem,
394 int sortopt);
396 Relation indexRel,
397 bool enforceUnique,
398 bool uniqueNullsNotDistinct,
400 int sortopt);
402 Relation indexRel,
403 uint32 high_mask,
404 uint32 low_mask,
405 uint32 max_buckets,
407 int sortopt);
409 Relation indexRel,
411 int sortopt);
413 int sortopt);
415 Relation indexRel,
417 int sortopt);
419 Oid sortOperator, Oid sortCollation,
420 bool nullsFirstFlag,
422 int sortopt);
423
425 TupleTableSlot *slot);
428 Relation rel, const ItemPointerData *self,
429 const Datum *values, const bool *isnull);
430extern void tuplesort_putbrintuple(Tuplesortstate *state, BrinTuple *tuple, Size size);
431extern void tuplesort_putgintuple(Tuplesortstate *state, GinTuple *tuple, Size size);
433 bool isNull);
434
436 bool copy, TupleTableSlot *slot, Datum *abbrev);
440 bool forward);
442 bool forward);
443extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward, bool copy,
444 Datum *val, bool *isNull, Datum *abbrev);
445
446
447#endif /* TUPLESORT_H */
int16 AttrNumber
Definition attnum.h:21
static Datum values[MAXATTR]
Definition bootstrap.c:147
uint8_t uint8
Definition c.h:577
int64_t int64
Definition c.h:576
uint32_t uint32
Definition c.h:579
size_t Size
Definition c.h:652
long val
Definition informix.c:689
TuplesortSpaceType
TuplesortMethod
int b
Definition isn.c:74
int a
Definition isn.c:73
const void size_t len
uint64_t Datum
Definition postgres.h:70
unsigned int Oid
static int fb(int x)
static void freestate(struct nfa *nfa, struct state *s)
Definition regc_nfa.c:242
Sharedsort * sharedsort
Definition tuplesort.h:61
bool isnull1
Definition tuplesort.h:118
uint8 curbyte
Definition tuplesort.h:119
void * tuple
Definition tuplesort.h:116
int srctape
Definition tuplesort.h:120
Datum datum1
Definition tuplesort.h:117
SortSupport onlyKey
Definition tuplesort.h:213
MemoryContext maincontext
Definition tuplesort.h:186
MemoryContext tuplecontext
Definition tuplesort.h:189
MemoryContext sortcontext
Definition tuplesort.h:188
SortTupleComparator comparetup
Definition tuplesort.h:142
SortSupport sortKeys
Definition tuplesort.h:203
SortTupleComparator comparetup_tiebreak
Definition tuplesort.h:149
IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward)
void tuplesort_rescan(Tuplesortstate *state)
Definition tuplesort.c:2298
void tuplesort_performsort(Tuplesortstate *state)
Definition tuplesort.c:1259
int tuplesort_merge_order(int64 allowedMem)
Definition tuplesort.c:1674
Tuplesortstate * tuplesort_begin_index_gin(Relation heapRel, Relation indexRel, int workMem, SortCoordinate coordinate, int sortopt)
void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers, dsm_segment *seg)
Definition tuplesort.c:3210
HeapTuple tuplesort_getheaptuple(Tuplesortstate *state, bool forward)
void tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull)
GinTuple * tuplesort_getgintuple(Tuplesortstate *state, Size *len, bool forward)
void tuplesort_reset(Tuplesortstate *state)
Definition tuplesort.c:915
bool tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward)
Definition tuplesort.c:1606
void tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot)
Tuplesortstate * tuplesort_begin_index_brin(int workMem, SortCoordinate coordinate, int sortopt)
Tuplesortstate * tuplesort_begin_heap(TupleDesc tupDesc, int nkeys, AttrNumber *attNums, Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags, int workMem, SortCoordinate coordinate, int sortopt)
bool tuplesort_used_bound(Tuplesortstate *state)
Definition tuplesort.c:782
Tuplesortstate * tuplesort_begin_cluster(TupleDesc tupDesc, Relation indexRel, int workMem, SortCoordinate coordinate, int sortopt)
BrinTuple * tuplesort_getbrintuple(Tuplesortstate *state, Size *len, bool forward)
Tuplesortstate * tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, int workMem, SortCoordinate coordinate, int sortopt)
Tuplesortstate * tuplesort_begin_index_gist(Relation heapRel, Relation indexRel, int workMem, SortCoordinate coordinate, int sortopt)
void tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, const ItemPointerData *self, const Datum *values, const bool *isnull)
Size tuplesort_estimate_shared(int nWorkers)
Definition tuplesort.c:3189
struct SortCoordinateData * SortCoordinate
Definition tuplesort.h:64
void tuplesort_get_stats(Tuplesortstate *state, TuplesortInstrumentation *stats)
Definition tuplesort.c:2395
bool tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy, TupleTableSlot *slot, Datum *abbrev)
Tuplesortstate * tuplesort_begin_common(int workMem, SortCoordinate coordinate, int sortopt)
Definition tuplesort.c:546
void tuplesort_end(Tuplesortstate *state)
Definition tuplesort.c:847
void tuplesort_putgintuple(Tuplesortstate *state, GinTuple *tuple, Size size)
void tuplesort_markpos(Tuplesortstate *state)
Definition tuplesort.c:2331
void tuplesort_puttuple_common(Tuplesortstate *state, SortTuple *tuple, bool useAbbrev, Size tuplen)
Definition tuplesort.c:1065
const char * tuplesort_space_type_name(TuplesortSpaceType t)
Definition tuplesort.c:2462
bool tuplesort_gettuple_common(Tuplesortstate *state, bool forward, SortTuple *stup)
Definition tuplesort.c:1366
void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg)
Definition tuplesort.c:3233
Tuplesortstate * tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, bool nullsFirstFlag, int workMem, SortCoordinate coordinate, int sortopt)
void tuplesort_putbrintuple(Tuplesortstate *state, BrinTuple *tuple, Size size)
const char * tuplesort_method_name(TuplesortMethod m)
Definition tuplesort.c:2439
void tuplesort_restorepos(Tuplesortstate *state)
Definition tuplesort.c:2362
Tuplesortstate * tuplesort_begin_index_hash(Relation heapRel, Relation indexRel, uint32 high_mask, uint32 low_mask, uint32 max_buckets, int workMem, SortCoordinate coordinate, int sortopt)
void * tuplesort_readtup_alloc(Tuplesortstate *state, Size tuplen)
Definition tuplesort.c:3155
int(* SortTupleComparator)(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
Definition tuplesort.h:123
void tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
bool tuplesort_getdatum(Tuplesortstate *state, bool forward, bool copy, Datum *val, bool *isNull, Datum *abbrev)
void tuplesort_set_bound(Tuplesortstate *state, int64 bound)
Definition tuplesort.c:734