PostgreSQL Source Code  git master
tuplesort.c File Reference
#include "postgres.h"
#include <limits.h>
#include "catalog/pg_am.h"
#include "commands/tablespace.h"
#include "executor/executor.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "storage/shmem.h"
#include "utils/memutils.h"
#include "utils/pg_rusage.h"
#include "utils/rel.h"
#include "utils/tuplesort.h"
#include "lib/sort_template.h"
Include dependency graph for tuplesort.c:

Go to the source code of this file.

Data Structures

union  SlabSlot
 
struct  Tuplesortstate
 
struct  Sharedsort
 

Macros

#define INITIAL_MEMTUPSIZE
 
#define SLAB_SLOT_SIZE   1024
 
#define MINORDER   6 /* minimum merge order */
 
#define MAXORDER   500 /* maximum merge order */
 
#define TAPE_BUFFER_OVERHEAD   BLCKSZ
 
#define MERGE_BUFFER_SIZE   (BLCKSZ * 32)
 
#define IS_SLAB_SLOT(state, tuple)
 
#define RELEASE_SLAB_SLOT(state, tuple)
 
#define REMOVEABBREV(state, stup, count)   ((*(state)->base.removeabbrev) (state, stup, count))
 
#define COMPARETUP(state, a, b)   ((*(state)->base.comparetup) (a, b, state))
 
#define WRITETUP(state, tape, stup)   ((*(state)->base.writetup) (state, tape, stup))
 
#define READTUP(state, stup, tape, len)   ((*(state)->base.readtup) (state, stup, tape, len))
 
#define FREESTATE(state)   ((state)->base.freestate ? (*(state)->base.freestate) (state) : (void) 0)
 
#define LACKMEM(state)   ((state)->availMem < 0 && !(state)->slabAllocatorUsed)
 
#define USEMEM(state, amt)   ((state)->availMem -= (amt))
 
#define FREEMEM(state, amt)   ((state)->availMem += (amt))
 
#define SERIAL(state)   ((state)->shared == NULL)
 
#define WORKER(state)   ((state)->shared && (state)->worker != -1)
 
#define LEADER(state)   ((state)->shared && (state)->worker == -1)
 
#define ST_SORT   qsort_tuple_unsigned
 
#define ST_ELEMENT_TYPE   SortTuple
 
#define ST_COMPARE(a, b, state)   qsort_tuple_unsigned_compare(a, b, state)
 
#define ST_COMPARE_ARG_TYPE   Tuplesortstate
 
#define ST_CHECK_FOR_INTERRUPTS
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 
#define ST_SORT   qsort_tuple_int32
 
#define ST_ELEMENT_TYPE   SortTuple
 
#define ST_COMPARE(a, b, state)   qsort_tuple_int32_compare(a, b, state)
 
#define ST_COMPARE_ARG_TYPE   Tuplesortstate
 
#define ST_CHECK_FOR_INTERRUPTS
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 
#define ST_SORT   qsort_tuple
 
#define ST_ELEMENT_TYPE   SortTuple
 
#define ST_COMPARE_RUNTIME_POINTER
 
#define ST_COMPARE_ARG_TYPE   Tuplesortstate
 
#define ST_CHECK_FOR_INTERRUPTS
 
#define ST_SCOPE   static
 
#define ST_DECLARE
 
#define ST_DEFINE
 
#define ST_SORT   qsort_ssup
 
#define ST_ELEMENT_TYPE   SortTuple
 
#define ST_COMPARE(a, b, ssup)
 
#define ST_COMPARE_ARG_TYPE   SortSupportData
 
#define ST_CHECK_FOR_INTERRUPTS
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 

Typedefs

typedef union SlabSlot SlabSlot
 

Enumerations

enum  TupSortStatus {
  TSS_INITIAL , TSS_BOUNDED , TSS_BUILDRUNS , TSS_SORTEDINMEM ,
  TSS_SORTEDONTAPE , TSS_FINALMERGE
}
 

Functions

static void tuplesort_begin_batch (Tuplesortstate *state)
 
static bool consider_abort_common (Tuplesortstate *state)
 
static void inittapes (Tuplesortstate *state, bool mergeruns)
 
static void inittapestate (Tuplesortstate *state, int maxTapes)
 
static void selectnewtape (Tuplesortstate *state)
 
static void init_slab_allocator (Tuplesortstate *state, int numSlots)
 
static void mergeruns (Tuplesortstate *state)
 
static void mergeonerun (Tuplesortstate *state)
 
static void beginmerge (Tuplesortstate *state)
 
static bool mergereadnext (Tuplesortstate *state, LogicalTape *srcTape, SortTuple *stup)
 
static void dumptuples (Tuplesortstate *state, bool alltuples)
 
static void make_bounded_heap (Tuplesortstate *state)
 
static void sort_bounded_heap (Tuplesortstate *state)
 
static void tuplesort_sort_memtuples (Tuplesortstate *state)
 
static void tuplesort_heap_insert (Tuplesortstate *state, SortTuple *tuple)
 
static void tuplesort_heap_replace_top (Tuplesortstate *state, SortTuple *tuple)
 
static void tuplesort_heap_delete_top (Tuplesortstate *state)
 
static void reversedirection (Tuplesortstate *state)
 
static unsigned int getlen (LogicalTape *tape, bool eofOK)
 
static void markrunend (LogicalTape *tape)
 
static int worker_get_identifier (Tuplesortstate *state)
 
static void worker_freeze_result_tape (Tuplesortstate *state)
 
static void worker_nomergeruns (Tuplesortstate *state)
 
static void leader_takeover_tapes (Tuplesortstate *state)
 
static void free_sort_tuple (Tuplesortstate *state, SortTuple *stup)
 
static void tuplesort_free (Tuplesortstate *state)
 
static void tuplesort_updatemax (Tuplesortstate *state)
 
static pg_attribute_always_inline int qsort_tuple_unsigned_compare (SortTuple *a, SortTuple *b, Tuplesortstate *state)
 
static pg_attribute_always_inline int qsort_tuple_int32_compare (SortTuple *a, SortTuple *b, Tuplesortstate *state)
 
Tuplesortstatetuplesort_begin_common (int workMem, SortCoordinate coordinate, int sortopt)
 
void tuplesort_set_bound (Tuplesortstate *state, int64 bound)
 
bool tuplesort_used_bound (Tuplesortstate *state)
 
void tuplesort_end (Tuplesortstate *state)
 
void tuplesort_reset (Tuplesortstate *state)
 
static bool grow_memtuples (Tuplesortstate *state)
 
void tuplesort_puttuple_common (Tuplesortstate *state, SortTuple *tuple, bool useAbbrev)
 
void tuplesort_performsort (Tuplesortstate *state)
 
bool tuplesort_gettuple_common (Tuplesortstate *state, bool forward, SortTuple *stup)
 
bool tuplesort_skiptuples (Tuplesortstate *state, int64 ntuples, bool forward)
 
int tuplesort_merge_order (int64 allowedMem)
 
static int64 merge_read_buffer_size (int64 avail_mem, int nInputTapes, int nInputRuns, int maxOutputTapes)
 
void tuplesort_rescan (Tuplesortstate *state)
 
void tuplesort_markpos (Tuplesortstate *state)
 
void tuplesort_restorepos (Tuplesortstate *state)
 
void tuplesort_get_stats (Tuplesortstate *state, TuplesortInstrumentation *stats)
 
const char * tuplesort_method_name (TuplesortMethod m)
 
const char * tuplesort_space_type_name (TuplesortSpaceType t)
 
void * tuplesort_readtup_alloc (Tuplesortstate *state, Size tuplen)
 
Size tuplesort_estimate_shared (int nWorkers)
 
void tuplesort_initialize_shared (Sharedsort *shared, int nWorkers, dsm_segment *seg)
 
void tuplesort_attach_shared (Sharedsort *shared, dsm_segment *seg)
 
int ssup_datum_unsigned_cmp (Datum x, Datum y, SortSupport ssup)
 
int ssup_datum_int32_cmp (Datum x, Datum y, SortSupport ssup)
 

Variables

bool trace_sort = false
 

Macro Definition Documentation

◆ COMPARETUP

#define COMPARETUP (   state,
  a,
  b 
)    ((*(state)->base.comparetup) (a, b, state))

Definition at line 397 of file tuplesort.c.

◆ FREEMEM

#define FREEMEM (   state,
  amt 
)    ((state)->availMem += (amt))

Definition at line 403 of file tuplesort.c.

◆ FREESTATE

#define FREESTATE (   state)    ((state)->base.freestate ? (*(state)->base.freestate) (state) : (void) 0)

Definition at line 400 of file tuplesort.c.

◆ INITIAL_MEMTUPSIZE

#define INITIAL_MEMTUPSIZE
Value:
Max(1024, \
#define Max(x, y)
Definition: c.h:931
#define ALLOCSET_SEPARATE_THRESHOLD
Definition: memutils.h:180

Definition at line 122 of file tuplesort.c.

◆ IS_SLAB_SLOT

#define IS_SLAB_SLOT (   state,
  tuple 
)
Value:
((char *) (tuple) >= (state)->slabMemoryBegin && \
(char *) (tuple) < (state)->slabMemoryEnd)
Definition: regguts.h:318

Definition at line 376 of file tuplesort.c.

◆ LACKMEM

#define LACKMEM (   state)    ((state)->availMem < 0 && !(state)->slabAllocatorUsed)

Definition at line 401 of file tuplesort.c.

◆ LEADER

#define LEADER (   state)    ((state)->shared && (state)->worker == -1)

Definition at line 406 of file tuplesort.c.

◆ MAXORDER

#define MAXORDER   500 /* maximum merge order */

Definition at line 181 of file tuplesort.c.

◆ MERGE_BUFFER_SIZE

#define MERGE_BUFFER_SIZE   (BLCKSZ * 32)

Definition at line 183 of file tuplesort.c.

◆ MINORDER

#define MINORDER   6 /* minimum merge order */

Definition at line 180 of file tuplesort.c.

◆ READTUP

#define READTUP (   state,
  stup,
  tape,
  len 
)    ((*(state)->base.readtup) (state, stup, tape, len))

Definition at line 399 of file tuplesort.c.

◆ RELEASE_SLAB_SLOT

#define RELEASE_SLAB_SLOT (   state,
  tuple 
)
Value:
do { \
SlabSlot *buf = (SlabSlot *) tuple; \
{ \
buf->nextfree = (state)->slabFreeHead; \
(state)->slabFreeHead = buf; \
} while(0)
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
void pfree(void *pointer)
Definition: mcxt.c:1306
static char * buf
Definition: pg_test_fsync.c:67
#define IS_SLAB_SLOT(state, tuple)
Definition: tuplesort.c:376

Definition at line 384 of file tuplesort.c.

◆ REMOVEABBREV

#define REMOVEABBREV (   state,
  stup,
  count 
)    ((*(state)->base.removeabbrev) (state, stup, count))

Definition at line 396 of file tuplesort.c.

◆ SERIAL

#define SERIAL (   state)    ((state)->shared == NULL)

Definition at line 404 of file tuplesort.c.

◆ SLAB_SLOT_SIZE

#define SLAB_SLOT_SIZE   1024

Definition at line 146 of file tuplesort.c.

◆ ST_CHECK_FOR_INTERRUPTS [1/4]

#define ST_CHECK_FOR_INTERRUPTS

Definition at line 621 of file tuplesort.c.

◆ ST_CHECK_FOR_INTERRUPTS [2/4]

#define ST_CHECK_FOR_INTERRUPTS

Definition at line 621 of file tuplesort.c.

◆ ST_CHECK_FOR_INTERRUPTS [3/4]

#define ST_CHECK_FOR_INTERRUPTS

Definition at line 621 of file tuplesort.c.

◆ ST_CHECK_FOR_INTERRUPTS [4/4]

#define ST_CHECK_FOR_INTERRUPTS

Definition at line 621 of file tuplesort.c.

◆ ST_COMPARE [1/3]

#define ST_COMPARE (   a,
  b,
  ssup 
)
Value:
ApplySortComparator((a)->datum1, (a)->isnull1, \
(b)->datum1, (b)->isnull1, (ssup))
int b
Definition: isn.c:70
int a
Definition: isn.c:69
static int ApplySortComparator(Datum datum1, bool isNull1, Datum datum2, bool isNull2, SortSupport ssup)
Definition: sortsupport.h:200

Definition at line 617 of file tuplesort.c.

◆ ST_COMPARE [2/3]

#define ST_COMPARE (   a,
  b,
  state 
)    qsort_tuple_unsigned_compare(a, b, state)

Definition at line 617 of file tuplesort.c.

◆ ST_COMPARE [3/3]

#define ST_COMPARE (   a,
  b,
  state 
)    qsort_tuple_int32_compare(a, b, state)

Definition at line 617 of file tuplesort.c.

◆ ST_COMPARE_ARG_TYPE [1/4]

#define ST_COMPARE_ARG_TYPE   Tuplesortstate

Definition at line 620 of file tuplesort.c.

◆ ST_COMPARE_ARG_TYPE [2/4]

#define ST_COMPARE_ARG_TYPE   Tuplesortstate

Definition at line 620 of file tuplesort.c.

◆ ST_COMPARE_ARG_TYPE [3/4]

#define ST_COMPARE_ARG_TYPE   Tuplesortstate

Definition at line 620 of file tuplesort.c.

◆ ST_COMPARE_ARG_TYPE [4/4]

#define ST_COMPARE_ARG_TYPE   SortSupportData

Definition at line 620 of file tuplesort.c.

◆ ST_COMPARE_RUNTIME_POINTER

#define ST_COMPARE_RUNTIME_POINTER

Definition at line 607 of file tuplesort.c.

◆ ST_DECLARE

#define ST_DECLARE

Definition at line 611 of file tuplesort.c.

◆ ST_DEFINE [1/4]

#define ST_DEFINE

Definition at line 623 of file tuplesort.c.

◆ ST_DEFINE [2/4]

#define ST_DEFINE

Definition at line 623 of file tuplesort.c.

◆ ST_DEFINE [3/4]

#define ST_DEFINE

Definition at line 623 of file tuplesort.c.

◆ ST_DEFINE [4/4]

#define ST_DEFINE

Definition at line 623 of file tuplesort.c.

◆ ST_ELEMENT_TYPE [1/4]

#define ST_ELEMENT_TYPE   SortTuple

Definition at line 616 of file tuplesort.c.

◆ ST_ELEMENT_TYPE [2/4]

#define ST_ELEMENT_TYPE   SortTuple

Definition at line 616 of file tuplesort.c.

◆ ST_ELEMENT_TYPE [3/4]

#define ST_ELEMENT_TYPE   SortTuple

Definition at line 616 of file tuplesort.c.

◆ ST_ELEMENT_TYPE [4/4]

#define ST_ELEMENT_TYPE   SortTuple

Definition at line 616 of file tuplesort.c.

◆ ST_SCOPE [1/4]

#define ST_SCOPE   static

Definition at line 622 of file tuplesort.c.

◆ ST_SCOPE [2/4]

#define ST_SCOPE   static

Definition at line 622 of file tuplesort.c.

◆ ST_SCOPE [3/4]

#define ST_SCOPE   static

Definition at line 622 of file tuplesort.c.

◆ ST_SCOPE [4/4]

#define ST_SCOPE   static

Definition at line 622 of file tuplesort.c.

◆ ST_SORT [1/4]

#define ST_SORT   qsort_tuple_unsigned

Definition at line 615 of file tuplesort.c.

◆ ST_SORT [2/4]

#define ST_SORT   qsort_tuple_int32

Definition at line 615 of file tuplesort.c.

◆ ST_SORT [3/4]

#define ST_SORT   qsort_tuple

Definition at line 615 of file tuplesort.c.

◆ ST_SORT [4/4]

#define ST_SORT   qsort_ssup

Definition at line 615 of file tuplesort.c.

◆ TAPE_BUFFER_OVERHEAD

#define TAPE_BUFFER_OVERHEAD   BLCKSZ

Definition at line 182 of file tuplesort.c.

◆ USEMEM

#define USEMEM (   state,
  amt 
)    ((state)->availMem -= (amt))

Definition at line 402 of file tuplesort.c.

◆ WORKER

#define WORKER (   state)    ((state)->shared && (state)->worker != -1)

Definition at line 405 of file tuplesort.c.

◆ WRITETUP

#define WRITETUP (   state,
  tape,
  stup 
)    ((*(state)->base.writetup) (state, tape, stup))

Definition at line 398 of file tuplesort.c.

Typedef Documentation

◆ SlabSlot

typedef union SlabSlot SlabSlot

Enumeration Type Documentation

◆ TupSortStatus

Enumerator
TSS_INITIAL 
TSS_BOUNDED 
TSS_BUILDRUNS 
TSS_SORTEDINMEM 
TSS_SORTEDONTAPE 
TSS_FINALMERGE 

Definition at line 158 of file tuplesort.c.

159 {
160  TSS_INITIAL, /* Loading tuples; still within memory limit */
161  TSS_BOUNDED, /* Loading tuples into bounded-size heap */
162  TSS_BUILDRUNS, /* Loading tuples; writing to tape */
163  TSS_SORTEDINMEM, /* Sort completed entirely in memory */
164  TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */
165  TSS_FINALMERGE /* Performing final merge on-the-fly */
166 } TupSortStatus;
TupSortStatus
Definition: tuplesort.c:159
@ TSS_SORTEDONTAPE
Definition: tuplesort.c:164
@ TSS_SORTEDINMEM
Definition: tuplesort.c:163
@ TSS_INITIAL
Definition: tuplesort.c:160
@ TSS_FINALMERGE
Definition: tuplesort.c:165
@ TSS_BUILDRUNS
Definition: tuplesort.c:162
@ TSS_BOUNDED
Definition: tuplesort.c:161

Function Documentation

◆ beginmerge()

static void beginmerge ( Tuplesortstate state)
static

Definition at line 2292 of file tuplesort.c.

2293 {
2294  int activeTapes;
2295  int srcTapeIndex;
2296 
2297  /* Heap should be empty here */
2298  Assert(state->memtupcount == 0);
2299 
2300  activeTapes = Min(state->nInputTapes, state->nInputRuns);
2301 
2302  for (srcTapeIndex = 0; srcTapeIndex < activeTapes; srcTapeIndex++)
2303  {
2304  SortTuple tup;
2305 
2306  if (mergereadnext(state, state->inputTapes[srcTapeIndex], &tup))
2307  {
2308  tup.srctape = srcTapeIndex;
2310  }
2311  }
2312 }
#define Min(x, y)
Definition: c.h:937
Assert(fmt[strlen(fmt) - 1] !='\n')
int srctape
Definition: tuplesort.h:141
static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple)
Definition: tuplesort.c:2777
static bool mergereadnext(Tuplesortstate *state, LogicalTape *srcTape, SortTuple *stup)
Definition: tuplesort.c:2320

References Assert(), mergereadnext(), Min, SortTuple::srctape, and tuplesort_heap_insert().

Referenced by mergeonerun(), and mergeruns().

◆ consider_abort_common()

static bool consider_abort_common ( Tuplesortstate state)
static

Definition at line 1341 of file tuplesort.c.

1342 {
1343  Assert(state->base.sortKeys[0].abbrev_converter != NULL);
1344  Assert(state->base.sortKeys[0].abbrev_abort != NULL);
1345  Assert(state->base.sortKeys[0].abbrev_full_comparator != NULL);
1346 
1347  /*
1348  * Check effectiveness of abbreviation optimization. Consider aborting
1349  * when still within memory limit.
1350  */
1351  if (state->status == TSS_INITIAL &&
1352  state->memtupcount >= state->abbrevNext)
1353  {
1354  state->abbrevNext *= 2;
1355 
1356  /*
1357  * Check opclass-supplied abbreviation abort routine. It may indicate
1358  * that abbreviation should not proceed.
1359  */
1360  if (!state->base.sortKeys->abbrev_abort(state->memtupcount,
1361  state->base.sortKeys))
1362  return false;
1363 
1364  /*
1365  * Finally, restore authoritative comparator, and indicate that
1366  * abbreviation is not in play by setting abbrev_converter to NULL
1367  */
1368  state->base.sortKeys[0].comparator = state->base.sortKeys[0].abbrev_full_comparator;
1369  state->base.sortKeys[0].abbrev_converter = NULL;
1370  /* Not strictly necessary, but be tidy */
1371  state->base.sortKeys[0].abbrev_abort = NULL;
1372  state->base.sortKeys[0].abbrev_full_comparator = NULL;
1373 
1374  /* Give up - expect original pass-by-value representation */
1375  return true;
1376  }
1377 
1378  return false;
1379 }

References Assert(), and TSS_INITIAL.

Referenced by tuplesort_puttuple_common().

◆ dumptuples()

static void dumptuples ( Tuplesortstate state,
bool  alltuples 
)
static

Definition at line 2339 of file tuplesort.c.

2340 {
2341  int memtupwrite;
2342  int i;
2343 
2344  /*
2345  * Nothing to do if we still fit in available memory and have array slots,
2346  * unless this is the final call during initial run generation.
2347  */
2348  if (state->memtupcount < state->memtupsize && !LACKMEM(state) &&
2349  !alltuples)
2350  return;
2351 
2352  /*
2353  * Final call might require no sorting, in rare cases where we just so
2354  * happen to have previously LACKMEM()'d at the point where exactly all
2355  * remaining tuples are loaded into memory, just before input was
2356  * exhausted. In general, short final runs are quite possible, but avoid
2357  * creating a completely empty run. In a worker, though, we must produce
2358  * at least one tape, even if it's empty.
2359  */
2360  if (state->memtupcount == 0 && state->currentRun > 0)
2361  return;
2362 
2363  Assert(state->status == TSS_BUILDRUNS);
2364 
2365  /*
2366  * It seems unlikely that this limit will ever be exceeded, but take no
2367  * chances
2368  */
2369  if (state->currentRun == INT_MAX)
2370  ereport(ERROR,
2371  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2372  errmsg("cannot have more than %d runs for an external sort",
2373  INT_MAX)));
2374 
2375  if (state->currentRun > 0)
2377 
2378  state->currentRun++;
2379 
2380 #ifdef TRACE_SORT
2381  if (trace_sort)
2382  elog(LOG, "worker %d starting quicksort of run %d: %s",
2383  state->worker, state->currentRun,
2384  pg_rusage_show(&state->ru_start));
2385 #endif
2386 
2387  /*
2388  * Sort all tuples accumulated within the allowed amount of memory for
2389  * this run using quicksort
2390  */
2392 
2393 #ifdef TRACE_SORT
2394  if (trace_sort)
2395  elog(LOG, "worker %d finished quicksort of run %d: %s",
2396  state->worker, state->currentRun,
2397  pg_rusage_show(&state->ru_start));
2398 #endif
2399 
2400  memtupwrite = state->memtupcount;
2401  for (i = 0; i < memtupwrite; i++)
2402  {
2403  SortTuple *stup = &state->memtuples[i];
2404 
2405  WRITETUP(state, state->destTape, stup);
2406 
2407  /*
2408  * Account for freeing the tuple, but no need to do the actual pfree
2409  * since the tuplecontext is being reset after the loop.
2410  */
2411  if (stup->tuple != NULL)
2413  }
2414 
2415  state->memtupcount = 0;
2416 
2417  /*
2418  * Reset tuple memory. We've freed all of the tuples that we previously
2419  * allocated. It's important to avoid fragmentation when there is a stark
2420  * change in the sizes of incoming tuples. Fragmentation due to
2421  * AllocSetFree's bucketing by size class might be particularly bad if
2422  * this step wasn't taken.
2423  */
2424  MemoryContextReset(state->base.tuplecontext);
2425 
2426  markrunend(state->destTape);
2427 
2428 #ifdef TRACE_SORT
2429  if (trace_sort)
2430  elog(LOG, "worker %d finished writing run %d to tape %d: %s",
2431  state->worker, state->currentRun, (state->currentRun - 1) % state->nOutputTapes + 1,
2432  pg_rusage_show(&state->ru_start));
2433 #endif
2434 }
int errcode(int sqlerrcode)
Definition: elog.c:695
int errmsg(const char *fmt,...)
Definition: elog.c:906
#define LOG
Definition: elog.h:27
#define ERROR
Definition: elog.h:35
#define ereport(elevel,...)
Definition: elog.h:145
int i
Definition: isn.c:73
void MemoryContextReset(MemoryContext context)
Definition: mcxt.c:303
Size GetMemoryChunkSpace(void *pointer)
Definition: mcxt.c:603
const char * pg_rusage_show(const PGRUsage *ru0)
Definition: pg_rusage.c:40
void * tuple
Definition: tuplesort.h:138
static void selectnewtape(Tuplesortstate *state)
Definition: tuplesort.c:1976
static void markrunend(LogicalTape *tape)
Definition: tuplesort.c:2907
#define LACKMEM(state)
Definition: tuplesort.c:401
#define WRITETUP(state, tape, stup)
Definition: tuplesort.c:398
#define FREEMEM(state, amt)
Definition: tuplesort.c:403
static void tuplesort_sort_memtuples(Tuplesortstate *state)
Definition: tuplesort.c:2714
bool trace_sort
Definition: tuplesort.c:127

References Assert(), elog(), ereport, errcode(), errmsg(), ERROR, FREEMEM, GetMemoryChunkSpace(), i, LACKMEM, LOG, markrunend(), MemoryContextReset(), pg_rusage_show(), selectnewtape(), trace_sort, TSS_BUILDRUNS, SortTuple::tuple, tuplesort_sort_memtuples(), and WRITETUP.

Referenced by tuplesort_performsort(), and tuplesort_puttuple_common().

◆ free_sort_tuple()

static void free_sort_tuple ( Tuplesortstate state,
SortTuple stup 
)
static

Definition at line 3166 of file tuplesort.c.

3167 {
3168  if (stup->tuple)
3169  {
3171  pfree(stup->tuple);
3172  stup->tuple = NULL;
3173  }
3174 }

References FREEMEM, GetMemoryChunkSpace(), pfree(), and SortTuple::tuple.

Referenced by make_bounded_heap(), and tuplesort_puttuple_common().

◆ getlen()

static unsigned int getlen ( LogicalTape tape,
bool  eofOK 
)
static

Definition at line 2894 of file tuplesort.c.

2895 {
2896  unsigned int len;
2897 
2898  if (LogicalTapeRead(tape,
2899  &len, sizeof(len)) != sizeof(len))
2900  elog(ERROR, "unexpected end of tape");
2901  if (len == 0 && !eofOK)
2902  elog(ERROR, "unexpected end of data");
2903  return len;
2904 }
size_t LogicalTapeRead(LogicalTape *lt, void *ptr, size_t size)
Definition: logtape.c:929
const void size_t len

References elog(), ERROR, len, and LogicalTapeRead().

Referenced by mergereadnext(), and tuplesort_gettuple_common().

◆ grow_memtuples()

static bool grow_memtuples ( Tuplesortstate state)
static

Definition at line 1073 of file tuplesort.c.

1074 {
1075  int newmemtupsize;
1076  int memtupsize = state->memtupsize;
1077  int64 memNowUsed = state->allowedMem - state->availMem;
1078 
1079  /* Forget it if we've already maxed out memtuples, per comment above */
1080  if (!state->growmemtuples)
1081  return false;
1082 
1083  /* Select new value of memtupsize */
1084  if (memNowUsed <= state->availMem)
1085  {
1086  /*
1087  * We've used no more than half of allowedMem; double our usage,
1088  * clamping at INT_MAX tuples.
1089  */
1090  if (memtupsize < INT_MAX / 2)
1091  newmemtupsize = memtupsize * 2;
1092  else
1093  {
1094  newmemtupsize = INT_MAX;
1095  state->growmemtuples = false;
1096  }
1097  }
1098  else
1099  {
1100  /*
1101  * This will be the last increment of memtupsize. Abandon doubling
1102  * strategy and instead increase as much as we safely can.
1103  *
1104  * To stay within allowedMem, we can't increase memtupsize by more
1105  * than availMem / sizeof(SortTuple) elements. In practice, we want
1106  * to increase it by considerably less, because we need to leave some
1107  * space for the tuples to which the new array slots will refer. We
1108  * assume the new tuples will be about the same size as the tuples
1109  * we've already seen, and thus we can extrapolate from the space
1110  * consumption so far to estimate an appropriate new size for the
1111  * memtuples array. The optimal value might be higher or lower than
1112  * this estimate, but it's hard to know that in advance. We again
1113  * clamp at INT_MAX tuples.
1114  *
1115  * This calculation is safe against enlarging the array so much that
1116  * LACKMEM becomes true, because the memory currently used includes
1117  * the present array; thus, there would be enough allowedMem for the
1118  * new array elements even if no other memory were currently used.
1119  *
1120  * We do the arithmetic in float8, because otherwise the product of
1121  * memtupsize and allowedMem could overflow. Any inaccuracy in the
1122  * result should be insignificant; but even if we computed a
1123  * completely insane result, the checks below will prevent anything
1124  * really bad from happening.
1125  */
1126  double grow_ratio;
1127 
1128  grow_ratio = (double) state->allowedMem / (double) memNowUsed;
1129  if (memtupsize * grow_ratio < INT_MAX)
1130  newmemtupsize = (int) (memtupsize * grow_ratio);
1131  else
1132  newmemtupsize = INT_MAX;
1133 
1134  /* We won't make any further enlargement attempts */
1135  state->growmemtuples = false;
1136  }
1137 
1138  /* Must enlarge array by at least one element, else report failure */
1139  if (newmemtupsize <= memtupsize)
1140  goto noalloc;
1141 
1142  /*
1143  * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize. Clamp
1144  * to ensure our request won't be rejected. Note that we can easily
1145  * exhaust address space before facing this outcome. (This is presently
1146  * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but
1147  * don't rely on that at this distance.)
1148  */
1149  if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple))
1150  {
1151  newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple));
1152  state->growmemtuples = false; /* can't grow any more */
1153  }
1154 
1155  /*
1156  * We need to be sure that we do not cause LACKMEM to become true, else
1157  * the space management algorithm will go nuts. The code above should
1158  * never generate a dangerous request, but to be safe, check explicitly
1159  * that the array growth fits within availMem. (We could still cause
1160  * LACKMEM if the memory chunk overhead associated with the memtuples
1161  * array were to increase. That shouldn't happen because we chose the
1162  * initial array size large enough to ensure that palloc will be treating
1163  * both old and new arrays as separate chunks. But we'll check LACKMEM
1164  * explicitly below just in case.)
1165  */
1166  if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple)))
1167  goto noalloc;
1168 
1169  /* OK, do it */
1170  FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
1171  state->memtupsize = newmemtupsize;
1172  state->memtuples = (SortTuple *)
1173  repalloc_huge(state->memtuples,
1174  state->memtupsize * sizeof(SortTuple));
1175  USEMEM(state, GetMemoryChunkSpace(state->memtuples));
1176  if (LACKMEM(state))
1177  elog(ERROR, "unexpected out-of-memory situation in tuplesort");
1178  return true;
1179 
1180 noalloc:
1181  /* If for any reason we didn't realloc, shut off future attempts */
1182  state->growmemtuples = false;
1183  return false;
1184 }
size_t Size
Definition: c.h:541
void * repalloc_huge(void *pointer, Size size)
Definition: mcxt.c:1459
#define MaxAllocHugeSize
Definition: memutils.h:45
#define USEMEM(state, amt)
Definition: tuplesort.c:402

References elog(), ERROR, FREEMEM, GetMemoryChunkSpace(), LACKMEM, MaxAllocHugeSize, repalloc_huge(), and USEMEM.

Referenced by tuplesort_puttuple_common().

◆ init_slab_allocator()

static void init_slab_allocator ( Tuplesortstate state,
int  numSlots 
)
static

Definition at line 2009 of file tuplesort.c.

2010 {
2011  if (numSlots > 0)
2012  {
2013  char *p;
2014  int i;
2015 
2016  state->slabMemoryBegin = palloc(numSlots * SLAB_SLOT_SIZE);
2017  state->slabMemoryEnd = state->slabMemoryBegin +
2018  numSlots * SLAB_SLOT_SIZE;
2019  state->slabFreeHead = (SlabSlot *) state->slabMemoryBegin;
2020  USEMEM(state, numSlots * SLAB_SLOT_SIZE);
2021 
2022  p = state->slabMemoryBegin;
2023  for (i = 0; i < numSlots - 1; i++)
2024  {
2025  ((SlabSlot *) p)->nextfree = (SlabSlot *) (p + SLAB_SLOT_SIZE);
2026  p += SLAB_SLOT_SIZE;
2027  }
2028  ((SlabSlot *) p)->nextfree = NULL;
2029  }
2030  else
2031  {
2032  state->slabMemoryBegin = state->slabMemoryEnd = NULL;
2033  state->slabFreeHead = NULL;
2034  }
2035  state->slabAllocatorUsed = true;
2036 }
void * palloc(Size size)
Definition: mcxt.c:1199
#define SLAB_SLOT_SIZE
Definition: tuplesort.c:146

References i, palloc(), SLAB_SLOT_SIZE, and USEMEM.

Referenced by mergeruns().

◆ inittapes()

static void inittapes ( Tuplesortstate state,
bool  mergeruns 
)
static

Definition at line 1891 of file tuplesort.c.

1892 {
1893  Assert(!LEADER(state));
1894 
1895  if (mergeruns)
1896  {
1897  /* Compute number of input tapes to use when merging */
1898  state->maxTapes = tuplesort_merge_order(state->allowedMem);
1899  }
1900  else
1901  {
1902  /* Workers can sometimes produce single run, output without merge */
1903  Assert(WORKER(state));
1904  state->maxTapes = MINORDER;
1905  }
1906 
1907 #ifdef TRACE_SORT
1908  if (trace_sort)
1909  elog(LOG, "worker %d switching to external sort with %d tapes: %s",
1910  state->worker, state->maxTapes, pg_rusage_show(&state->ru_start));
1911 #endif
1912 
1913  /* Create the tape set */
1914  inittapestate(state, state->maxTapes);
1915  state->tapeset =
1916  LogicalTapeSetCreate(false,
1917  state->shared ? &state->shared->fileset : NULL,
1918  state->worker);
1919 
1920  state->currentRun = 0;
1921 
1922  /*
1923  * Initialize logical tape arrays.
1924  */
1925  state->inputTapes = NULL;
1926  state->nInputTapes = 0;
1927  state->nInputRuns = 0;
1928 
1929  state->outputTapes = palloc0(state->maxTapes * sizeof(LogicalTape *));
1930  state->nOutputTapes = 0;
1931  state->nOutputRuns = 0;
1932 
1933  state->status = TSS_BUILDRUNS;
1934 
1936 }
LogicalTapeSet * LogicalTapeSetCreate(bool preallocate, SharedFileSet *fileset, int worker)
Definition: logtape.c:557
void * palloc0(Size size)
Definition: mcxt.c:1230
int tuplesort_merge_order(int64 allowedMem)
Definition: tuplesort.c:1804
static void inittapestate(Tuplesortstate *state, int maxTapes)
Definition: tuplesort.c:1942
#define LEADER(state)
Definition: tuplesort.c:406
#define WORKER(state)
Definition: tuplesort.c:405
static void mergeruns(Tuplesortstate *state)
Definition: tuplesort.c:2045
#define MINORDER
Definition: tuplesort.c:180

References Assert(), elog(), inittapestate(), LEADER, LOG, LogicalTapeSetCreate(), mergeruns(), MINORDER, palloc0(), pg_rusage_show(), selectnewtape(), trace_sort, TSS_BUILDRUNS, tuplesort_merge_order(), and WORKER.

Referenced by tuplesort_performsort(), and tuplesort_puttuple_common().

◆ inittapestate()

static void inittapestate ( Tuplesortstate state,
int  maxTapes 
)
static

Definition at line 1942 of file tuplesort.c.

1943 {
1944  int64 tapeSpace;
1945 
1946  /*
1947  * Decrease availMem to reflect the space needed for tape buffers; but
1948  * don't decrease it to the point that we have no room for tuples. (That
1949  * case is only likely to occur if sorting pass-by-value Datums; in all
1950  * other scenarios the memtuples[] array is unlikely to occupy more than
1951  * half of allowedMem. In the pass-by-value case it's not important to
1952  * account for tuple space, so we don't care if LACKMEM becomes
1953  * inaccurate.)
1954  */
1955  tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD;
1956 
1957  if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem)
1958  USEMEM(state, tapeSpace);
1959 
1960  /*
1961  * Make sure that the temp file(s) underlying the tape set are created in
1962  * suitable temp tablespaces. For parallel sorts, this should have been
1963  * called already, but it doesn't matter if it is called a second time.
1964  */
1966 }
void PrepareTempTablespaces(void)
Definition: tablespace.c:1337
#define TAPE_BUFFER_OVERHEAD
Definition: tuplesort.c:182

References GetMemoryChunkSpace(), PrepareTempTablespaces(), TAPE_BUFFER_OVERHEAD, and USEMEM.

Referenced by inittapes(), and leader_takeover_tapes().

◆ leader_takeover_tapes()

static void leader_takeover_tapes ( Tuplesortstate state)
static

Definition at line 3107 of file tuplesort.c.

3108 {
3109  Sharedsort *shared = state->shared;
3110  int nParticipants = state->nParticipants;
3111  int workersFinished;
3112  int j;
3113 
3114  Assert(LEADER(state));
3115  Assert(nParticipants >= 1);
3116 
3117  SpinLockAcquire(&shared->mutex);
3118  workersFinished = shared->workersFinished;
3119  SpinLockRelease(&shared->mutex);
3120 
3121  if (nParticipants != workersFinished)
3122  elog(ERROR, "cannot take over tapes before all workers finish");
3123 
3124  /*
3125  * Create the tapeset from worker tapes, including a leader-owned tape at
3126  * the end. Parallel workers are far more expensive than logical tapes,
3127  * so the number of tapes allocated here should never be excessive.
3128  */
3129  inittapestate(state, nParticipants);
3130  state->tapeset = LogicalTapeSetCreate(false, &shared->fileset, -1);
3131 
3132  /*
3133  * Set currentRun to reflect the number of runs we will merge (it's not
3134  * used for anything, this is just pro forma)
3135  */
3136  state->currentRun = nParticipants;
3137 
3138  /*
3139  * Initialize the state to look the same as after building the initial
3140  * runs.
3141  *
3142  * There will always be exactly 1 run per worker, and exactly one input
3143  * tape per run, because workers always output exactly 1 run, even when
3144  * there were no input tuples for workers to sort.
3145  */
3146  state->inputTapes = NULL;
3147  state->nInputTapes = 0;
3148  state->nInputRuns = 0;
3149 
3150  state->outputTapes = palloc0(nParticipants * sizeof(LogicalTape *));
3151  state->nOutputTapes = nParticipants;
3152  state->nOutputRuns = nParticipants;
3153 
3154  for (j = 0; j < nParticipants; j++)
3155  {
3156  state->outputTapes[j] = LogicalTapeImport(state->tapeset, j, &shared->tapes[j]);
3157  }
3158 
3159  state->status = TSS_BUILDRUNS;
3160 }
int j
Definition: isn.c:74
LogicalTape * LogicalTapeImport(LogicalTapeSet *lts, int worker, TapeShare *shared)
Definition: logtape.c:610
#define SpinLockRelease(lock)
Definition: spin.h:64
#define SpinLockAcquire(lock)
Definition: spin.h:62
SharedFileSet fileset
Definition: tuplesort.c:361
TapeShare tapes[FLEXIBLE_ARRAY_MEMBER]
Definition: tuplesort.c:370
int workersFinished
Definition: tuplesort.c:358
slock_t mutex
Definition: tuplesort.c:347

References Assert(), elog(), ERROR, Sharedsort::fileset, inittapestate(), j, LEADER, LogicalTapeImport(), LogicalTapeSetCreate(), Sharedsort::mutex, palloc0(), SpinLockAcquire, SpinLockRelease, Sharedsort::tapes, TSS_BUILDRUNS, and Sharedsort::workersFinished.

Referenced by tuplesort_performsort().

◆ make_bounded_heap()

static void make_bounded_heap ( Tuplesortstate state)
static

Definition at line 2625 of file tuplesort.c.

2626 {
2627  int tupcount = state->memtupcount;
2628  int i;
2629 
2630  Assert(state->status == TSS_INITIAL);
2631  Assert(state->bounded);
2632  Assert(tupcount >= state->bound);
2633  Assert(SERIAL(state));
2634 
2635  /* Reverse sort direction so largest entry will be at root */
2637 
2638  state->memtupcount = 0; /* make the heap empty */
2639  for (i = 0; i < tupcount; i++)
2640  {
2641  if (state->memtupcount < state->bound)
2642  {
2643  /* Insert next tuple into heap */
2644  /* Must copy source tuple to avoid possible overwrite */
2645  SortTuple stup = state->memtuples[i];
2646 
2647  tuplesort_heap_insert(state, &stup);
2648  }
2649  else
2650  {
2651  /*
2652  * The heap is full. Replace the largest entry with the new
2653  * tuple, or just discard it, if it's larger than anything already
2654  * in the heap.
2655  */
2656  if (COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0)
2657  {
2658  free_sort_tuple(state, &state->memtuples[i]);
2660  }
2661  else
2662  tuplesort_heap_replace_top(state, &state->memtuples[i]);
2663  }
2664  }
2665 
2666  Assert(state->memtupcount == state->bound);
2667  state->status = TSS_BOUNDED;
2668 }
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:121
#define COMPARETUP(state, a, b)
Definition: tuplesort.c:397
#define SERIAL(state)
Definition: tuplesort.c:404
static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup)
Definition: tuplesort.c:3166
static void reversedirection(Tuplesortstate *state)
Definition: tuplesort.c:2876
static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple)
Definition: tuplesort.c:2836

References Assert(), CHECK_FOR_INTERRUPTS, COMPARETUP, free_sort_tuple(), i, reversedirection(), SERIAL, TSS_BOUNDED, TSS_INITIAL, tuplesort_heap_insert(), and tuplesort_heap_replace_top().

Referenced by tuplesort_puttuple_common().

◆ markrunend()

static void markrunend ( LogicalTape tape)
static

Definition at line 2907 of file tuplesort.c.

2908 {
2909  unsigned int len = 0;
2910 
2911  LogicalTapeWrite(tape, (void *) &len, sizeof(len));
2912 }
void LogicalTapeWrite(LogicalTape *lt, void *ptr, size_t size)
Definition: logtape.c:762

References len, and LogicalTapeWrite().

Referenced by dumptuples(), and mergeonerun().

◆ merge_read_buffer_size()

static int64 merge_read_buffer_size ( int64  avail_mem,
int  nInputTapes,
int  nInputRuns,
int  maxOutputTapes 
)
static

Definition at line 1859 of file tuplesort.c.

1861 {
1862  int nOutputRuns;
1863  int nOutputTapes;
1864 
1865  /*
1866  * How many output tapes will we produce in this pass?
1867  *
1868  * This is nInputRuns / nInputTapes, rounded up.
1869  */
1870  nOutputRuns = (nInputRuns + nInputTapes - 1) / nInputTapes;
1871 
1872  nOutputTapes = Min(nOutputRuns, maxOutputTapes);
1873 
1874  /*
1875  * Each output tape consumes TAPE_BUFFER_OVERHEAD bytes of memory. All
1876  * remaining memory is divided evenly between the input tapes.
1877  *
1878  * This also follows from the formula in tuplesort_merge_order, but here
1879  * we derive the input buffer size from the amount of memory available,
1880  * and M and N.
1881  */
1882  return Max((avail_mem - TAPE_BUFFER_OVERHEAD * nOutputTapes) / nInputTapes, 0);
1883 }

References Max, Min, and TAPE_BUFFER_OVERHEAD.

Referenced by mergeruns().

◆ mergeonerun()

static void mergeonerun ( Tuplesortstate state)
static

Definition at line 2232 of file tuplesort.c.

2233 {
2234  int srcTapeIndex;
2235  LogicalTape *srcTape;
2236 
2237  /*
2238  * Start the merge by loading one tuple from each active source tape into
2239  * the heap.
2240  */
2241  beginmerge(state);
2242 
2243  Assert(state->slabAllocatorUsed);
2244 
2245  /*
2246  * Execute merge by repeatedly extracting lowest tuple in heap, writing it
2247  * out, and replacing it with next tuple from same tape (if there is
2248  * another one).
2249  */
2250  while (state->memtupcount > 0)
2251  {
2252  SortTuple stup;
2253 
2254  /* write the tuple to destTape */
2255  srcTapeIndex = state->memtuples[0].srctape;
2256  srcTape = state->inputTapes[srcTapeIndex];
2257  WRITETUP(state, state->destTape, &state->memtuples[0]);
2258 
2259  /* recycle the slot of the tuple we just wrote out, for the next read */
2260  if (state->memtuples[0].tuple)
2261  RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple);
2262 
2263  /*
2264  * pull next tuple from the tape, and replace the written-out tuple in
2265  * the heap with it.
2266  */
2267  if (mergereadnext(state, srcTape, &stup))
2268  {
2269  stup.srctape = srcTapeIndex;
2271  }
2272  else
2273  {
2275  state->nInputRuns--;
2276  }
2277  }
2278 
2279  /*
2280  * When the heap empties, we're done. Write an end-of-run marker on the
2281  * output tape.
2282  */
2283  markrunend(state->destTape);
2284 }
static void tuplesort_heap_delete_top(Tuplesortstate *state)
Definition: tuplesort.c:2812
static void beginmerge(Tuplesortstate *state)
Definition: tuplesort.c:2292
#define RELEASE_SLAB_SLOT(state, tuple)
Definition: tuplesort.c:384

References Assert(), beginmerge(), markrunend(), mergereadnext(), RELEASE_SLAB_SLOT, SortTuple::srctape, tuplesort_heap_delete_top(), tuplesort_heap_replace_top(), and WRITETUP.

Referenced by mergeruns().

◆ mergereadnext()

static bool mergereadnext ( Tuplesortstate state,
LogicalTape srcTape,
SortTuple stup 
)
static

Definition at line 2320 of file tuplesort.c.

2321 {
2322  unsigned int tuplen;
2323 
2324  /* read next tuple, if any */
2325  if ((tuplen = getlen(srcTape, true)) == 0)
2326  return false;
2327  READTUP(state, stup, srcTape, tuplen);
2328 
2329  return true;
2330 }
static unsigned int getlen(LogicalTape *tape, bool eofOK)
Definition: tuplesort.c:2894
#define READTUP(state, stup, tape, len)
Definition: tuplesort.c:399

References getlen(), and READTUP.

Referenced by beginmerge(), mergeonerun(), and tuplesort_gettuple_common().

◆ mergeruns()

static void mergeruns ( Tuplesortstate state)
static

Definition at line 2045 of file tuplesort.c.

2046 {
2047  int tapenum;
2048 
2049  Assert(state->status == TSS_BUILDRUNS);
2050  Assert(state->memtupcount == 0);
2051 
2052  if (state->base.sortKeys != NULL && state->base.sortKeys->abbrev_converter != NULL)
2053  {
2054  /*
2055  * If there are multiple runs to be merged, when we go to read back
2056  * tuples from disk, abbreviated keys will not have been stored, and
2057  * we don't care to regenerate them. Disable abbreviation from this
2058  * point on.
2059  */
2060  state->base.sortKeys->abbrev_converter = NULL;
2061  state->base.sortKeys->comparator = state->base.sortKeys->abbrev_full_comparator;
2062 
2063  /* Not strictly necessary, but be tidy */
2064  state->base.sortKeys->abbrev_abort = NULL;
2065  state->base.sortKeys->abbrev_full_comparator = NULL;
2066  }
2067 
2068  /*
2069  * Reset tuple memory. We've freed all the tuples that we previously
2070  * allocated. We will use the slab allocator from now on.
2071  */
2072  MemoryContextResetOnly(state->base.tuplecontext);
2073 
2074  /*
2075  * We no longer need a large memtuples array. (We will allocate a smaller
2076  * one for the heap later.)
2077  */
2078  FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
2079  pfree(state->memtuples);
2080  state->memtuples = NULL;
2081 
2082  /*
2083  * Initialize the slab allocator. We need one slab slot per input tape,
2084  * for the tuples in the heap, plus one to hold the tuple last returned
2085  * from tuplesort_gettuple. (If we're sorting pass-by-val Datums,
2086  * however, we don't need to do allocate anything.)
2087  *
2088  * In a multi-pass merge, we could shrink this allocation for the last
2089  * merge pass, if it has fewer tapes than previous passes, but we don't
2090  * bother.
2091  *
2092  * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism
2093  * to track memory usage of individual tuples.
2094  */
2095  if (state->base.tuples)
2096  init_slab_allocator(state, state->nOutputTapes + 1);
2097  else
2099 
2100  /*
2101  * Allocate a new 'memtuples' array, for the heap. It will hold one tuple
2102  * from each input tape.
2103  *
2104  * We could shrink this, too, between passes in a multi-pass merge, but we
2105  * don't bother. (The initial input tapes are still in outputTapes. The
2106  * number of input tapes will not increase between passes.)
2107  */
2108  state->memtupsize = state->nOutputTapes;
2109  state->memtuples = (SortTuple *) MemoryContextAlloc(state->base.maincontext,
2110  state->nOutputTapes * sizeof(SortTuple));
2111  USEMEM(state, GetMemoryChunkSpace(state->memtuples));
2112 
2113  /*
2114  * Use all the remaining memory we have available for tape buffers among
2115  * all the input tapes. At the beginning of each merge pass, we will
2116  * divide this memory between the input and output tapes in the pass.
2117  */
2118  state->tape_buffer_mem = state->availMem;
2119  USEMEM(state, state->tape_buffer_mem);
2120 #ifdef TRACE_SORT
2121  if (trace_sort)
2122  elog(LOG, "worker %d using %zu KB of memory for tape buffers",
2123  state->worker, state->tape_buffer_mem / 1024);
2124 #endif
2125 
2126  for (;;)
2127  {
2128  /*
2129  * On the first iteration, or if we have read all the runs from the
2130  * input tapes in a multi-pass merge, it's time to start a new pass.
2131  * Rewind all the output tapes, and make them inputs for the next
2132  * pass.
2133  */
2134  if (state->nInputRuns == 0)
2135  {
2136  int64 input_buffer_size;
2137 
2138  /* Close the old, emptied, input tapes */
2139  if (state->nInputTapes > 0)
2140  {
2141  for (tapenum = 0; tapenum < state->nInputTapes; tapenum++)
2142  LogicalTapeClose(state->inputTapes[tapenum]);
2143  pfree(state->inputTapes);
2144  }
2145 
2146  /* Previous pass's outputs become next pass's inputs. */
2147  state->inputTapes = state->outputTapes;
2148  state->nInputTapes = state->nOutputTapes;
2149  state->nInputRuns = state->nOutputRuns;
2150 
2151  /*
2152  * Reset output tape variables. The actual LogicalTapes will be
2153  * created as needed, here we only allocate the array to hold
2154  * them.
2155  */
2156  state->outputTapes = palloc0(state->nInputTapes * sizeof(LogicalTape *));
2157  state->nOutputTapes = 0;
2158  state->nOutputRuns = 0;
2159 
2160  /*
2161  * Redistribute the memory allocated for tape buffers, among the
2162  * new input and output tapes.
2163  */
2164  input_buffer_size = merge_read_buffer_size(state->tape_buffer_mem,
2165  state->nInputTapes,
2166  state->nInputRuns,
2167  state->maxTapes);
2168 
2169 #ifdef TRACE_SORT
2170  if (trace_sort)
2171  elog(LOG, "starting merge pass of %d input runs on %d tapes, " INT64_FORMAT " KB of memory for each input tape: %s",
2172  state->nInputRuns, state->nInputTapes, input_buffer_size / 1024,
2173  pg_rusage_show(&state->ru_start));
2174 #endif
2175 
2176  /* Prepare the new input tapes for merge pass. */
2177  for (tapenum = 0; tapenum < state->nInputTapes; tapenum++)
2178  LogicalTapeRewindForRead(state->inputTapes[tapenum], input_buffer_size);
2179 
2180  /*
2181  * If there's just one run left on each input tape, then only one
2182  * merge pass remains. If we don't have to produce a materialized
2183  * sorted tape, we can stop at this point and do the final merge
2184  * on-the-fly.
2185  */
2186  if ((state->base.sortopt & TUPLESORT_RANDOMACCESS) == 0
2187  && state->nInputRuns <= state->nInputTapes
2188  && !WORKER(state))
2189  {
2190  /* Tell logtape.c we won't be writing anymore */
2192  /* Initialize for the final merge pass */
2193  beginmerge(state);
2194  state->status = TSS_FINALMERGE;
2195  return;
2196  }
2197  }
2198 
2199  /* Select an output tape */
2201 
2202  /* Merge one run from each input tape. */
2203  mergeonerun(state);
2204 
2205  /*
2206  * If the input tapes are empty, and we output only one output run,
2207  * we're done. The current output tape contains the final result.
2208  */
2209  if (state->nInputRuns == 0 && state->nOutputRuns <= 1)
2210  break;
2211  }
2212 
2213  /*
2214  * Done. The result is on a single run on a single tape.
2215  */
2216  state->result_tape = state->outputTapes[0];
2217  if (!WORKER(state))
2218  LogicalTapeFreeze(state->result_tape, NULL);
2219  else
2221  state->status = TSS_SORTEDONTAPE;
2222 
2223  /* Close all the now-empty input tapes, to release their read buffers. */
2224  for (tapenum = 0; tapenum < state->nInputTapes; tapenum++)
2225  LogicalTapeClose(state->inputTapes[tapenum]);
2226 }
#define INT64_FORMAT
Definition: c.h:484
void LogicalTapeRewindForRead(LogicalTape *lt, size_t buffer_size)
Definition: logtape.c:847
void LogicalTapeSetForgetFreeSpace(LogicalTapeSet *lts)
Definition: logtape.c:751
void LogicalTapeClose(LogicalTape *lt)
Definition: logtape.c:734
void LogicalTapeFreeze(LogicalTape *lt, TapeShare *share)
Definition: logtape.c:982
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:994
void MemoryContextResetOnly(MemoryContext context)
Definition: mcxt.c:322
static void mergeonerun(Tuplesortstate *state)
Definition: tuplesort.c:2232
static int64 merge_read_buffer_size(int64 avail_mem, int nInputTapes, int nInputRuns, int maxOutputTapes)
Definition: tuplesort.c:1859
static void worker_freeze_result_tape(Tuplesortstate *state)
Definition: tuplesort.c:3047
static void init_slab_allocator(Tuplesortstate *state, int numSlots)
Definition: tuplesort.c:2009
#define TUPLESORT_RANDOMACCESS
Definition: tuplesort.h:95

References Assert(), beginmerge(), elog(), FREEMEM, GetMemoryChunkSpace(), init_slab_allocator(), INT64_FORMAT, LOG, LogicalTapeClose(), LogicalTapeFreeze(), LogicalTapeRewindForRead(), LogicalTapeSetForgetFreeSpace(), MemoryContextAlloc(), MemoryContextResetOnly(), merge_read_buffer_size(), mergeonerun(), palloc0(), pfree(), pg_rusage_show(), selectnewtape(), trace_sort, TSS_BUILDRUNS, TSS_FINALMERGE, TSS_SORTEDONTAPE, TUPLESORT_RANDOMACCESS, USEMEM, WORKER, and worker_freeze_result_tape().

Referenced by inittapes(), and tuplesort_performsort().

◆ qsort_tuple_int32_compare()

static pg_attribute_always_inline int qsort_tuple_int32_compare ( SortTuple a,
SortTuple b,
Tuplesortstate state 
)
static

Definition at line 546 of file tuplesort.c.

547 {
548  int compare;
549 
550  compare = ApplyInt32SortComparator(a->datum1, a->isnull1,
551  b->datum1, b->isnull1,
552  &state->base.sortKeys[0]);
553 
554  if (compare != 0)
555  return compare;
556 
557  /*
558  * No need to waste effort calling the tiebreak function when there are no
559  * other keys to sort on.
560  */
561  if (state->base.onlyKey != NULL)
562  return 0;
563 
564  return state->base.comparetup(a, b, state);
565 }
static int compare(const void *arg1, const void *arg2)
Definition: geqo_pool.c:145
static int ApplyInt32SortComparator(Datum datum1, bool isNull1, Datum datum2, bool isNull2, SortSupport ssup)
Definition: sortsupport.h:302

References a, ApplyInt32SortComparator(), b, and compare().

◆ qsort_tuple_unsigned_compare()

static pg_attribute_always_inline int qsort_tuple_unsigned_compare ( SortTuple a,
SortTuple b,
Tuplesortstate state 
)
static

Definition at line 499 of file tuplesort.c.

500 {
501  int compare;
502 
503  compare = ApplyUnsignedSortComparator(a->datum1, a->isnull1,
504  b->datum1, b->isnull1,
505  &state->base.sortKeys[0]);
506  if (compare != 0)
507  return compare;
508 
509  /*
510  * No need to waste effort calling the tiebreak function when there are no
511  * other keys to sort on.
512  */
513  if (state->base.onlyKey != NULL)
514  return 0;
515 
516  return state->base.comparetup(a, b, state);
517 }
static int ApplyUnsignedSortComparator(Datum datum1, bool isNull1, Datum datum2, bool isNull2, SortSupport ssup)
Definition: sortsupport.h:233

References a, ApplyUnsignedSortComparator(), b, and compare().

◆ reversedirection()

static void reversedirection ( Tuplesortstate state)
static

Definition at line 2876 of file tuplesort.c.

2877 {
2878  SortSupport sortKey = state->base.sortKeys;
2879  int nkey;
2880 
2881  for (nkey = 0; nkey < state->base.nKeys; nkey++, sortKey++)
2882  {
2883  sortKey->ssup_reverse = !sortKey->ssup_reverse;
2884  sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first;
2885  }
2886 }
bool ssup_nulls_first
Definition: sortsupport.h:75

References SortSupportData::ssup_nulls_first, and SortSupportData::ssup_reverse.

Referenced by make_bounded_heap(), and sort_bounded_heap().

◆ selectnewtape()

static void selectnewtape ( Tuplesortstate state)
static

Definition at line 1976 of file tuplesort.c.

1977 {
1978  /*
1979  * At the beginning of each merge pass, nOutputTapes and nOutputRuns are
1980  * both zero. On each call, we create a new output tape to hold the next
1981  * run, until maxTapes is reached. After that, we assign new runs to the
1982  * existing tapes in a round robin fashion.
1983  */
1984  if (state->nOutputTapes < state->maxTapes)
1985  {
1986  /* Create a new tape to hold the next run */
1987  Assert(state->outputTapes[state->nOutputRuns] == NULL);
1988  Assert(state->nOutputRuns == state->nOutputTapes);
1989  state->destTape = LogicalTapeCreate(state->tapeset);
1990  state->outputTapes[state->nOutputTapes] = state->destTape;
1991  state->nOutputTapes++;
1992  state->nOutputRuns++;
1993  }
1994  else
1995  {
1996  /*
1997  * We have reached the max number of tapes. Append to an existing
1998  * tape.
1999  */
2000  state->destTape = state->outputTapes[state->nOutputRuns % state->nOutputTapes];
2001  state->nOutputRuns++;
2002  }
2003 }
LogicalTape * LogicalTapeCreate(LogicalTapeSet *lts)
Definition: logtape.c:681

References Assert(), and LogicalTapeCreate().

Referenced by dumptuples(), inittapes(), and mergeruns().

◆ sort_bounded_heap()

static void sort_bounded_heap ( Tuplesortstate state)
static

Definition at line 2674 of file tuplesort.c.

2675 {
2676  int tupcount = state->memtupcount;
2677 
2678  Assert(state->status == TSS_BOUNDED);
2679  Assert(state->bounded);
2680  Assert(tupcount == state->bound);
2681  Assert(SERIAL(state));
2682 
2683  /*
2684  * We can unheapify in place because each delete-top call will remove the
2685  * largest entry, which we can promptly store in the newly freed slot at
2686  * the end. Once we're down to a single-entry heap, we're done.
2687  */
2688  while (state->memtupcount > 1)
2689  {
2690  SortTuple stup = state->memtuples[0];
2691 
2692  /* this sifts-up the next-largest entry and decreases memtupcount */
2694  state->memtuples[state->memtupcount] = stup;
2695  }
2696  state->memtupcount = tupcount;
2697 
2698  /*
2699  * Reverse sort direction back to the original state. This is not
2700  * actually necessary but seems like a good idea for tidiness.
2701  */
2703 
2704  state->status = TSS_SORTEDINMEM;
2705  state->boundUsed = true;
2706 }

References Assert(), reversedirection(), SERIAL, TSS_BOUNDED, TSS_SORTEDINMEM, and tuplesort_heap_delete_top().

Referenced by tuplesort_performsort().

◆ ssup_datum_int32_cmp()

int ssup_datum_int32_cmp ( Datum  x,
Datum  y,
SortSupport  ssup 
)

Definition at line 3204 of file tuplesort.c.

3205 {
3206  int32 xx = DatumGetInt32(x);
3207  int32 yy = DatumGetInt32(y);
3208 
3209  if (xx < yy)
3210  return -1;
3211  else if (xx > yy)
3212  return 1;
3213  else
3214  return 0;
3215 }
signed int int32
Definition: c.h:430
int y
Definition: isn.c:72
int x
Definition: isn.c:71
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:550

References DatumGetInt32(), x, and y.

Referenced by btint4sortsupport(), date_sortsupport(), and tuplesort_sort_memtuples().

◆ ssup_datum_unsigned_cmp()

int ssup_datum_unsigned_cmp ( Datum  x,
Datum  y,
SortSupport  ssup 
)

Definition at line 3177 of file tuplesort.c.

3178 {
3179  if (x < y)
3180  return -1;
3181  else if (x > y)
3182  return 1;
3183  else
3184  return 0;
3185 }

References x, and y.

Referenced by gist_point_sortsupport(), macaddr_sortsupport(), network_sortsupport(), tuplesort_sort_memtuples(), uuid_sortsupport(), and varstr_sortsupport().

◆ tuplesort_attach_shared()

void tuplesort_attach_shared ( Sharedsort shared,
dsm_segment seg 
)

Definition at line 2999 of file tuplesort.c.

3000 {
3001  /* Attach to SharedFileSet */
3002  SharedFileSetAttach(&shared->fileset, seg);
3003 }
void SharedFileSetAttach(SharedFileSet *fileset, dsm_segment *seg)
Definition: sharedfileset.c:62

References Sharedsort::fileset, and SharedFileSetAttach().

Referenced by _bt_parallel_build_main().

◆ tuplesort_begin_batch()

static void tuplesort_begin_batch ( Tuplesortstate state)
static

Definition at line 758 of file tuplesort.c.

759 {
760  MemoryContext oldcontext;
761 
762  oldcontext = MemoryContextSwitchTo(state->base.maincontext);
763 
764  /*
765  * Caller tuple (e.g. IndexTuple) memory context.
766  *
767  * A dedicated child context used exclusively for caller passed tuples
768  * eases memory management. Resetting at key points reduces
769  * fragmentation. Note that the memtuples array of SortTuples is allocated
770  * in the parent context, not this context, because there is no need to
771  * free memtuples early. For bounded sorts, tuples may be pfreed in any
772  * order, so we use a regular aset.c context so that it can make use of
773  * free'd memory. When the sort is not bounded, we make use of a
774  * generation.c context as this keeps allocations more compact with less
775  * wastage. Allocations are also slightly more CPU efficient.
776  */
777  if (state->base.sortopt & TUPLESORT_ALLOWBOUNDED)
778  state->base.tuplecontext = AllocSetContextCreate(state->base.sortcontext,
779  "Caller tuples",
781  else
782  state->base.tuplecontext = GenerationContextCreate(state->base.sortcontext,
783  "Caller tuples",
785 
786 
787  state->status = TSS_INITIAL;
788  state->bounded = false;
789  state->boundUsed = false;
790 
791  state->availMem = state->allowedMem;
792 
793  state->tapeset = NULL;
794 
795  state->memtupcount = 0;
796 
797  /*
798  * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
799  * see comments in grow_memtuples().
800  */
801  state->growmemtuples = true;
802  state->slabAllocatorUsed = false;
803  if (state->memtuples != NULL && state->memtupsize != INITIAL_MEMTUPSIZE)
804  {
805  pfree(state->memtuples);
806  state->memtuples = NULL;
807  state->memtupsize = INITIAL_MEMTUPSIZE;
808  }
809  if (state->memtuples == NULL)
810  {
811  state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple));
812  USEMEM(state, GetMemoryChunkSpace(state->memtuples));
813  }
814 
815  /* workMem must be large enough for the minimal memtuples array */
816  if (LACKMEM(state))
817  elog(ERROR, "insufficient memory allowed for sort");
818 
819  state->currentRun = 0;
820 
821  /*
822  * Tape variables (inputTapes, outputTapes, etc.) will be initialized by
823  * inittapes(), if needed.
824  */
825 
826  state->result_tape = NULL; /* flag that result tape has not been formed */
827 
828  MemoryContextSwitchTo(oldcontext);
829 }
MemoryContext GenerationContextCreate(MemoryContext parent, const char *name, Size minContextSize, Size initBlockSize, Size maxBlockSize)
Definition: generation.c:158
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:153
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:135
#define INITIAL_MEMTUPSIZE
Definition: tuplesort.c:122
#define TUPLESORT_ALLOWBOUNDED
Definition: tuplesort.h:98

References ALLOCSET_DEFAULT_SIZES, AllocSetContextCreate, elog(), ERROR, GenerationContextCreate(), GetMemoryChunkSpace(), INITIAL_MEMTUPSIZE, LACKMEM, MemoryContextSwitchTo(), palloc(), pfree(), TSS_INITIAL, TUPLESORT_ALLOWBOUNDED, and USEMEM.

Referenced by tuplesort_begin_common(), and tuplesort_reset().

◆ tuplesort_begin_common()

Tuplesortstate* tuplesort_begin_common ( int  workMem,
SortCoordinate  coordinate,
int  sortopt 
)

Definition at line 646 of file tuplesort.c.

647 {
649  MemoryContext maincontext;
650  MemoryContext sortcontext;
651  MemoryContext oldcontext;
652 
653  /* See leader_takeover_tapes() remarks on random access support */
654  if (coordinate && (sortopt & TUPLESORT_RANDOMACCESS))
655  elog(ERROR, "random access disallowed under parallel sort");
656 
657  /*
658  * Memory context surviving tuplesort_reset. This memory context holds
659  * data which is useful to keep while sorting multiple similar batches.
660  */
662  "TupleSort main",
664 
665  /*
666  * Create a working memory context for one sort operation. The content of
667  * this context is deleted by tuplesort_reset.
668  */
669  sortcontext = AllocSetContextCreate(maincontext,
670  "TupleSort sort",
672 
673  /*
674  * Additionally a working memory context for tuples is setup in
675  * tuplesort_begin_batch.
676  */
677 
678  /*
679  * Make the Tuplesortstate within the per-sortstate context. This way, we
680  * don't need a separate pfree() operation for it at shutdown.
681  */
682  oldcontext = MemoryContextSwitchTo(maincontext);
683 
685 
686 #ifdef TRACE_SORT
687  if (trace_sort)
688  pg_rusage_init(&state->ru_start);
689 #endif
690 
691  state->base.sortopt = sortopt;
692  state->base.tuples = true;
693  state->abbrevNext = 10;
694 
695  /*
696  * workMem is forced to be at least 64KB, the current minimum valid value
697  * for the work_mem GUC. This is a defense against parallel sort callers
698  * that divide out memory among many workers in a way that leaves each
699  * with very little memory.
700  */
701  state->allowedMem = Max(workMem, 64) * (int64) 1024;
702  state->base.sortcontext = sortcontext;
703  state->base.maincontext = maincontext;
704 
705  /*
706  * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
707  * see comments in grow_memtuples().
708  */
709  state->memtupsize = INITIAL_MEMTUPSIZE;
710  state->memtuples = NULL;
711 
712  /*
713  * After all of the other non-parallel-related state, we setup all of the
714  * state needed for each batch.
715  */
717 
718  /*
719  * Initialize parallel-related state based on coordination information
720  * from caller
721  */
722  if (!coordinate)
723  {
724  /* Serial sort */
725  state->shared = NULL;
726  state->worker = -1;
727  state->nParticipants = -1;
728  }
729  else if (coordinate->isWorker)
730  {
731  /* Parallel worker produces exactly one final run from all input */
732  state->shared = coordinate->sharedsort;
733  state->worker = worker_get_identifier(state);
734  state->nParticipants = -1;
735  }
736  else
737  {
738  /* Parallel leader state only used for final merge */
739  state->shared = coordinate->sharedsort;
740  state->worker = -1;
741  state->nParticipants = coordinate->nParticipants;
742  Assert(state->nParticipants >= 1);
743  }
744 
745  MemoryContextSwitchTo(oldcontext);
746 
747  return state;
748 }
MemoryContext CurrentMemoryContext
Definition: mcxt.c:124
void pg_rusage_init(PGRUsage *ru0)
Definition: pg_rusage.c:27
Sharedsort * sharedsort
Definition: tuplesort.h:57
static int worker_get_identifier(Tuplesortstate *state)
Definition: tuplesort.c:3019
static void tuplesort_begin_batch(Tuplesortstate *state)
Definition: tuplesort.c:758

References ALLOCSET_DEFAULT_SIZES, AllocSetContextCreate, Assert(), CurrentMemoryContext, elog(), ERROR, INITIAL_MEMTUPSIZE, SortCoordinateData::isWorker, Max, MemoryContextSwitchTo(), SortCoordinateData::nParticipants, palloc0(), pg_rusage_init(), SortCoordinateData::sharedsort, trace_sort, tuplesort_begin_batch(), TUPLESORT_RANDOMACCESS, and worker_get_identifier().

Referenced by tuplesort_begin_cluster(), tuplesort_begin_datum(), tuplesort_begin_heap(), tuplesort_begin_index_btree(), tuplesort_begin_index_gist(), and tuplesort_begin_index_hash().

◆ tuplesort_end()

void tuplesort_end ( Tuplesortstate state)

Definition at line 972 of file tuplesort.c.

973 {
975 
976  /*
977  * Free the main memory context, including the Tuplesortstate struct
978  * itself.
979  */
980  MemoryContextDelete(state->base.maincontext);
981 }
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:376
static void tuplesort_free(Tuplesortstate *state)
Definition: tuplesort.c:903

References MemoryContextDelete(), and tuplesort_free().

Referenced by _bt_parallel_scan_and_sort(), _bt_spooldestroy(), _h_spooldestroy(), ExecEndAgg(), ExecEndIncrementalSort(), ExecEndSort(), ExecReScanAgg(), ExecReScanSort(), gistbuild(), heapam_relation_copy_for_cluster(), initialize_aggregate(), initialize_phase(), ordered_set_shutdown(), process_ordered_aggregate_multi(), process_ordered_aggregate_single(), and validate_index().

◆ tuplesort_estimate_shared()

Size tuplesort_estimate_shared ( int  nWorkers)

Definition at line 2955 of file tuplesort.c.

2956 {
2957  Size tapesSize;
2958 
2959  Assert(nWorkers > 0);
2960 
2961  /* Make sure that BufFile shared state is MAXALIGN'd */
2962  tapesSize = mul_size(sizeof(TapeShare), nWorkers);
2963  tapesSize = MAXALIGN(add_size(tapesSize, offsetof(Sharedsort, tapes)));
2964 
2965  return tapesSize;
2966 }
#define MAXALIGN(LEN)
Definition: c.h:747
Size add_size(Size s1, Size s2)
Definition: shmem.c:502
Size mul_size(Size s1, Size s2)
Definition: shmem.c:519

References add_size(), Assert(), MAXALIGN, and mul_size().

Referenced by _bt_begin_parallel().

◆ tuplesort_free()

static void tuplesort_free ( Tuplesortstate state)
static

Definition at line 903 of file tuplesort.c.

904 {
905  /* context swap probably not needed, but let's be safe */
906  MemoryContext oldcontext = MemoryContextSwitchTo(state->base.sortcontext);
907 
908 #ifdef TRACE_SORT
909  long spaceUsed;
910 
911  if (state->tapeset)
912  spaceUsed = LogicalTapeSetBlocks(state->tapeset);
913  else
914  spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024;
915 #endif
916 
917  /*
918  * Delete temporary "tape" files, if any.
919  *
920  * Note: want to include this in reported total cost of sort, hence need
921  * for two #ifdef TRACE_SORT sections.
922  *
923  * We don't bother to destroy the individual tapes here. They will go away
924  * with the sortcontext. (In TSS_FINALMERGE state, we have closed
925  * finished tapes already.)
926  */
927  if (state->tapeset)
928  LogicalTapeSetClose(state->tapeset);
929 
930 #ifdef TRACE_SORT
931  if (trace_sort)
932  {
933  if (state->tapeset)
934  elog(LOG, "%s of worker %d ended, %ld disk blocks used: %s",
935  SERIAL(state) ? "external sort" : "parallel external sort",
936  state->worker, spaceUsed, pg_rusage_show(&state->ru_start));
937  else
938  elog(LOG, "%s of worker %d ended, %ld KB used: %s",
939  SERIAL(state) ? "internal sort" : "unperformed parallel sort",
940  state->worker, spaceUsed, pg_rusage_show(&state->ru_start));
941  }
942 
943  TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed);
944 #else
945 
946  /*
947  * If you disabled TRACE_SORT, you can still probe sort__done, but you
948  * ain't getting space-used stats.
949  */
950  TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L);
951 #endif
952 
953  FREESTATE(state);
954  MemoryContextSwitchTo(oldcontext);
955 
956  /*
957  * Free the per-sort memory context, thereby releasing all working memory.
958  */
959  MemoryContextReset(state->base.sortcontext);
960 }
void LogicalTapeSetClose(LogicalTapeSet *lts)
Definition: logtape.c:668
long LogicalTapeSetBlocks(LogicalTapeSet *lts)
Definition: logtape.c:1184
#define FREESTATE(state)
Definition: tuplesort.c:400

References elog(), FREESTATE, LOG, LogicalTapeSetBlocks(), LogicalTapeSetClose(), MemoryContextReset(), MemoryContextSwitchTo(), pg_rusage_show(), SERIAL, and trace_sort.

Referenced by tuplesort_end(), and tuplesort_reset().

◆ tuplesort_get_stats()

void tuplesort_get_stats ( Tuplesortstate state,
TuplesortInstrumentation stats 
)

Definition at line 2537 of file tuplesort.c.

2539 {
2540  /*
2541  * Note: it might seem we should provide both memory and disk usage for a
2542  * disk-based sort. However, the current code doesn't track memory space
2543  * accurately once we have begun to return tuples to the caller (since we
2544  * don't account for pfree's the caller is expected to do), so we cannot
2545  * rely on availMem in a disk sort. This does not seem worth the overhead
2546  * to fix. Is it worth creating an API for the memory context code to
2547  * tell us how much is actually used in sortcontext?
2548  */
2550 
2551  if (state->isMaxSpaceDisk)
2553  else
2555  stats->spaceUsed = (state->maxSpace + 1023) / 1024;
2556 
2557  switch (state->maxSpaceStatus)
2558  {
2559  case TSS_SORTEDINMEM:
2560  if (state->boundUsed)
2562  else
2564  break;
2565  case TSS_SORTEDONTAPE:
2567  break;
2568  case TSS_FINALMERGE:
2570  break;
2571  default:
2573  break;
2574  }
2575 }
TuplesortMethod sortMethod
Definition: tuplesort.h:102
TuplesortSpaceType spaceType
Definition: tuplesort.h:103
static void tuplesort_updatemax(Tuplesortstate *state)
Definition: tuplesort.c:989
@ SORT_SPACE_TYPE_DISK
Definition: tuplesort.h:87
@ SORT_SPACE_TYPE_MEMORY
Definition: tuplesort.h:88
@ SORT_TYPE_EXTERNAL_SORT
Definition: tuplesort.h:79
@ SORT_TYPE_TOP_N_HEAPSORT
Definition: tuplesort.h:77
@ SORT_TYPE_QUICKSORT
Definition: tuplesort.h:78
@ SORT_TYPE_STILL_IN_PROGRESS
Definition: tuplesort.h:76
@ SORT_TYPE_EXTERNAL_MERGE
Definition: tuplesort.h:80

References SORT_SPACE_TYPE_DISK, SORT_SPACE_TYPE_MEMORY, SORT_TYPE_EXTERNAL_MERGE, SORT_TYPE_EXTERNAL_SORT, SORT_TYPE_QUICKSORT, SORT_TYPE_STILL_IN_PROGRESS, SORT_TYPE_TOP_N_HEAPSORT, TuplesortInstrumentation::sortMethod, TuplesortInstrumentation::spaceType, TuplesortInstrumentation::spaceUsed, TSS_FINALMERGE, TSS_SORTEDINMEM, TSS_SORTEDONTAPE, and tuplesort_updatemax().

Referenced by ExecSort(), instrumentSortedGroup(), and show_sort_info().

◆ tuplesort_gettuple_common()

bool tuplesort_gettuple_common ( Tuplesortstate state,
bool  forward,
SortTuple stup 
)

Definition at line 1496 of file tuplesort.c.

1498 {
1499  unsigned int tuplen;
1500  size_t nmoved;
1501 
1502  Assert(!WORKER(state));
1503 
1504  switch (state->status)
1505  {
1506  case TSS_SORTEDINMEM:
1507  Assert(forward || state->base.sortopt & TUPLESORT_RANDOMACCESS);
1508  Assert(!state->slabAllocatorUsed);
1509  if (forward)
1510  {
1511  if (state->current < state->memtupcount)
1512  {
1513  *stup = state->memtuples[state->current++];
1514  return true;
1515  }
1516  state->eof_reached = true;
1517 
1518  /*
1519  * Complain if caller tries to retrieve more tuples than
1520  * originally asked for in a bounded sort. This is because
1521  * returning EOF here might be the wrong thing.
1522  */
1523  if (state->bounded && state->current >= state->bound)
1524  elog(ERROR, "retrieved too many tuples in a bounded sort");
1525 
1526  return false;
1527  }
1528  else
1529  {
1530  if (state->current <= 0)
1531  return false;
1532 
1533  /*
1534  * if all tuples are fetched already then we return last
1535  * tuple, else - tuple before last returned.
1536  */
1537  if (state->eof_reached)
1538  state->eof_reached = false;
1539  else
1540  {
1541  state->current--; /* last returned tuple */
1542  if (state->current <= 0)
1543  return false;
1544  }
1545  *stup = state->memtuples[state->current - 1];
1546  return true;
1547  }
1548  break;
1549 
1550  case TSS_SORTEDONTAPE:
1551  Assert(forward || state->base.sortopt & TUPLESORT_RANDOMACCESS);
1552  Assert(state->slabAllocatorUsed);
1553 
1554  /*
1555  * The slot that held the tuple that we returned in previous
1556  * gettuple call can now be reused.
1557  */
1558  if (state->lastReturnedTuple)
1559  {
1560  RELEASE_SLAB_SLOT(state, state->lastReturnedTuple);
1561  state->lastReturnedTuple = NULL;
1562  }
1563 
1564  if (forward)
1565  {
1566  if (state->eof_reached)
1567  return false;
1568 
1569  if ((tuplen = getlen(state->result_tape, true)) != 0)
1570  {
1571  READTUP(state, stup, state->result_tape, tuplen);
1572 
1573  /*
1574  * Remember the tuple we return, so that we can recycle
1575  * its memory on next call. (This can be NULL, in the
1576  * !state->tuples case).
1577  */
1578  state->lastReturnedTuple = stup->tuple;
1579 
1580  return true;
1581  }
1582  else
1583  {
1584  state->eof_reached = true;
1585  return false;
1586  }
1587  }
1588 
1589  /*
1590  * Backward.
1591  *
1592  * if all tuples are fetched already then we return last tuple,
1593  * else - tuple before last returned.
1594  */
1595  if (state->eof_reached)
1596  {
1597  /*
1598  * Seek position is pointing just past the zero tuplen at the
1599  * end of file; back up to fetch last tuple's ending length
1600  * word. If seek fails we must have a completely empty file.
1601  */
1602  nmoved = LogicalTapeBackspace(state->result_tape,
1603  2 * sizeof(unsigned int));
1604  if (nmoved == 0)
1605  return false;
1606  else if (nmoved != 2 * sizeof(unsigned int))
1607  elog(ERROR, "unexpected tape position");
1608  state->eof_reached = false;
1609  }
1610  else
1611  {
1612  /*
1613  * Back up and fetch previously-returned tuple's ending length
1614  * word. If seek fails, assume we are at start of file.
1615  */
1616  nmoved = LogicalTapeBackspace(state->result_tape,
1617  sizeof(unsigned int));
1618  if (nmoved == 0)
1619  return false;
1620  else if (nmoved != sizeof(unsigned int))
1621  elog(ERROR, "unexpected tape position");
1622  tuplen = getlen(state->result_tape, false);
1623 
1624  /*
1625  * Back up to get ending length word of tuple before it.
1626  */
1627  nmoved = LogicalTapeBackspace(state->result_tape,
1628  tuplen + 2 * sizeof(unsigned int));
1629  if (nmoved == tuplen + sizeof(unsigned int))
1630  {
1631  /*
1632  * We backed up over the previous tuple, but there was no
1633  * ending length word before it. That means that the prev
1634  * tuple is the first tuple in the file. It is now the
1635  * next to read in forward direction (not obviously right,
1636  * but that is what in-memory case does).
1637  */
1638  return false;
1639  }
1640  else if (nmoved != tuplen + 2 * sizeof(unsigned int))
1641  elog(ERROR, "bogus tuple length in backward scan");
1642  }
1643 
1644  tuplen = getlen(state->result_tape, false);
1645 
1646  /*
1647  * Now we have the length of the prior tuple, back up and read it.
1648  * Note: READTUP expects we are positioned after the initial
1649  * length word of the tuple, so back up to that point.
1650  */
1651  nmoved = LogicalTapeBackspace(state->result_tape,
1652  tuplen);
1653  if (nmoved != tuplen)
1654  elog(ERROR, "bogus tuple length in backward scan");
1655  READTUP(state, stup, state->result_tape, tuplen);
1656 
1657  /*
1658  * Remember the tuple we return, so that we can recycle its memory
1659  * on next call. (This can be NULL, in the Datum case).
1660  */
1661  state->lastReturnedTuple = stup->tuple;
1662 
1663  return true;
1664 
1665  case TSS_FINALMERGE:
1666  Assert(forward);
1667  /* We are managing memory ourselves, with the slab allocator. */
1668  Assert(state->slabAllocatorUsed);
1669 
1670  /*
1671  * The slab slot holding the tuple that we returned in previous
1672  * gettuple call can now be reused.
1673  */
1674  if (state->lastReturnedTuple)
1675  {
1676  RELEASE_SLAB_SLOT(state, state->lastReturnedTuple);
1677  state->lastReturnedTuple = NULL;
1678  }
1679 
1680  /*
1681  * This code should match the inner loop of mergeonerun().
1682  */
1683  if (state->memtupcount > 0)
1684  {
1685  int srcTapeIndex = state->memtuples[0].srctape;
1686  LogicalTape *srcTape = state->inputTapes[srcTapeIndex];
1687  SortTuple newtup;
1688 
1689  *stup = state->memtuples[0];
1690 
1691  /*
1692  * Remember the tuple we return, so that we can recycle its
1693  * memory on next call. (This can be NULL, in the Datum case).
1694  */
1695  state->lastReturnedTuple = stup->tuple;
1696 
1697  /*
1698  * Pull next tuple from tape, and replace the returned tuple
1699  * at top of the heap with it.
1700  */
1701  if (!mergereadnext(state, srcTape, &newtup))
1702  {
1703  /*
1704  * If no more data, we've reached end of run on this tape.
1705  * Remove the top node from the heap.
1706  */
1708  state->nInputRuns--;
1709 
1710  /*
1711  * Close the tape. It'd go away at the end of the sort
1712  * anyway, but better to release the memory early.
1713  */
1714  LogicalTapeClose(srcTape);
1715  return true;
1716  }
1717  newtup.srctape = srcTapeIndex;
1719  return true;
1720  }
1721  return false;
1722 
1723  default:
1724  elog(ERROR, "invalid tuplesort state");
1725  return false; /* keep compiler quiet */
1726  }
1727 }
size_t LogicalTapeBackspace(LogicalTape *lt, size_t size)
Definition: logtape.c:1063

References Assert(), elog(), ERROR, getlen(), LogicalTapeBackspace(), LogicalTapeClose(), mergereadnext(), READTUP, RELEASE_SLAB_SLOT, SortTuple::srctape, TSS_FINALMERGE, TSS_SORTEDINMEM, TSS_SORTEDONTAPE, SortTuple::tuple, tuplesort_heap_delete_top(), tuplesort_heap_replace_top(), TUPLESORT_RANDOMACCESS, and WORKER.

Referenced by tuplesort_getdatum(), tuplesort_getheaptuple(), tuplesort_getindextuple(), tuplesort_gettupleslot(), and tuplesort_skiptuples().

◆ tuplesort_heap_delete_top()

static void tuplesort_heap_delete_top ( Tuplesortstate state)
static

Definition at line 2812 of file tuplesort.c.

2813 {
2814  SortTuple *memtuples = state->memtuples;
2815  SortTuple *tuple;
2816 
2817  if (--state->memtupcount <= 0)
2818  return;
2819 
2820  /*
2821  * Remove the last tuple in the heap, and re-insert it, by replacing the
2822  * current top node with it.
2823  */
2824  tuple = &memtuples[state->memtupcount];
2826 }

References tuplesort_heap_replace_top().

Referenced by mergeonerun(), sort_bounded_heap(), and tuplesort_gettuple_common().

◆ tuplesort_heap_insert()

static void tuplesort_heap_insert ( Tuplesortstate state,
SortTuple tuple 
)
static

Definition at line 2777 of file tuplesort.c.

2778 {
2779  SortTuple *memtuples;
2780  int j;
2781 
2782  memtuples = state->memtuples;
2783  Assert(state->memtupcount < state->memtupsize);
2784 
2786 
2787  /*
2788  * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is
2789  * using 1-based array indexes, not 0-based.
2790  */
2791  j = state->memtupcount++;
2792  while (j > 0)
2793  {
2794  int i = (j - 1) >> 1;
2795 
2796  if (COMPARETUP(state, tuple, &memtuples[i]) >= 0)
2797  break;
2798  memtuples[j] = memtuples[i];
2799  j = i;
2800  }
2801  memtuples[j] = *tuple;
2802 }

References Assert(), CHECK_FOR_INTERRUPTS, COMPARETUP, i, and j.

Referenced by beginmerge(), and make_bounded_heap().

◆ tuplesort_heap_replace_top()

static void tuplesort_heap_replace_top ( Tuplesortstate state,
SortTuple tuple 
)
static

Definition at line 2836 of file tuplesort.c.

2837 {
2838  SortTuple *memtuples = state->memtuples;
2839  unsigned int i,
2840  n;
2841 
2842  Assert(state->memtupcount >= 1);
2843 
2845 
2846  /*
2847  * state->memtupcount is "int", but we use "unsigned int" for i, j, n.
2848  * This prevents overflow in the "2 * i + 1" calculation, since at the top
2849  * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2.
2850  */
2851  n = state->memtupcount;
2852  i = 0; /* i is where the "hole" is */
2853  for (;;)
2854  {
2855  unsigned int j = 2 * i + 1;
2856 
2857  if (j >= n)
2858  break;
2859  if (j + 1 < n &&
2860  COMPARETUP(state, &memtuples[j], &memtuples[j + 1]) > 0)
2861  j++;
2862  if (COMPARETUP(state, tuple, &memtuples[j]) <= 0)
2863  break;
2864  memtuples[i] = memtuples[j];
2865  i = j;
2866  }
2867  memtuples[i] = *tuple;
2868 }

References Assert(), CHECK_FOR_INTERRUPTS, COMPARETUP, i, and j.

Referenced by make_bounded_heap(), mergeonerun(), tuplesort_gettuple_common(), tuplesort_heap_delete_top(), and tuplesort_puttuple_common().

◆ tuplesort_initialize_shared()

void tuplesort_initialize_shared ( Sharedsort shared,
int  nWorkers,
dsm_segment seg 
)

Definition at line 2976 of file tuplesort.c.

2977 {
2978  int i;
2979 
2980  Assert(nWorkers > 0);
2981 
2982  SpinLockInit(&shared->mutex);
2983  shared->currentWorker = 0;
2984  shared->workersFinished = 0;
2985  SharedFileSetInit(&shared->fileset, seg);
2986  shared->nTapes = nWorkers;
2987  for (i = 0; i < nWorkers; i++)
2988  {
2989  shared->tapes[i].firstblocknumber = 0L;
2990  }
2991 }
void SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg)
Definition: sharedfileset.c:44
#define SpinLockInit(lock)
Definition: spin.h:60
int nTapes
Definition: tuplesort.c:364
int currentWorker
Definition: tuplesort.c:357
long firstblocknumber
Definition: logtape.h:54

References Assert(), Sharedsort::currentWorker, Sharedsort::fileset, TapeShare::firstblocknumber, i, Sharedsort::mutex, Sharedsort::nTapes, SharedFileSetInit(), SpinLockInit, Sharedsort::tapes, and Sharedsort::workersFinished.

Referenced by _bt_begin_parallel().

◆ tuplesort_markpos()

void tuplesort_markpos ( Tuplesortstate state)

Definition at line 2473 of file tuplesort.c.

2474 {
2475  MemoryContext oldcontext = MemoryContextSwitchTo(state->base.sortcontext);
2476 
2477  Assert(state->base.sortopt & TUPLESORT_RANDOMACCESS);
2478 
2479  switch (state->status)
2480  {
2481  case TSS_SORTEDINMEM:
2482  state->markpos_offset = state->current;
2483  state->markpos_eof = state->eof_reached;
2484  break;
2485  case TSS_SORTEDONTAPE:
2486  LogicalTapeTell(state->result_tape,
2487  &state->markpos_block,
2488  &state->markpos_offset);
2489  state->markpos_eof = state->eof_reached;
2490  break;
2491  default:
2492  elog(ERROR, "invalid tuplesort state");
2493  break;
2494  }
2495 
2496  MemoryContextSwitchTo(oldcontext);
2497 }
void LogicalTapeTell(LogicalTape *lt, long *blocknum, int *offset)
Definition: logtape.c:1163

References Assert(), elog(), ERROR, LogicalTapeTell(), MemoryContextSwitchTo(), TSS_SORTEDINMEM, TSS_SORTEDONTAPE, and TUPLESORT_RANDOMACCESS.

Referenced by ExecSortMarkPos().

◆ tuplesort_merge_order()

int tuplesort_merge_order ( int64  allowedMem)

Definition at line 1804 of file tuplesort.c.

1805 {
1806  int mOrder;
1807 
1808  /*----------
1809  * In the merge phase, we need buffer space for each input and output tape.
1810  * Each pass in the balanced merge algorithm reads from M input tapes, and
1811  * writes to N output tapes. Each tape consumes TAPE_BUFFER_OVERHEAD bytes
1812  * of memory. In addition to that, we want MERGE_BUFFER_SIZE workspace per
1813  * input tape.
1814  *
1815  * totalMem = M * (TAPE_BUFFER_OVERHEAD + MERGE_BUFFER_SIZE) +
1816  * N * TAPE_BUFFER_OVERHEAD
1817  *
1818  * Except for the last and next-to-last merge passes, where there can be
1819  * fewer tapes left to process, M = N. We choose M so that we have the
1820  * desired amount of memory available for the input buffers
1821  * (TAPE_BUFFER_OVERHEAD + MERGE_BUFFER_SIZE), given the total memory
1822  * available for the tape buffers (allowedMem).
1823  *
1824  * Note: you might be thinking we need to account for the memtuples[]
1825  * array in this calculation, but we effectively treat that as part of the
1826  * MERGE_BUFFER_SIZE workspace.
1827  *----------
1828  */
1829  mOrder = allowedMem /
1831 
1832  /*
1833  * Even in minimum memory, use at least a MINORDER merge. On the other
1834  * hand, even when we have lots of memory, do not use more than a MAXORDER
1835  * merge. Tapes are pretty cheap, but they're not entirely free. Each
1836  * additional tape reduces the amount of memory available to build runs,
1837  * which in turn can cause the same sort to need more runs, which makes
1838  * merging slower even if it can still be done in a single pass. Also,
1839  * high order merges are quite slow due to CPU cache effects; it can be
1840  * faster to pay the I/O cost of a multi-pass merge than to perform a
1841  * single merge pass across many hundreds of tapes.
1842  */
1843  mOrder = Max(mOrder, MINORDER);
1844  mOrder = Min(mOrder, MAXORDER);
1845 
1846  return mOrder;
1847 }
#define MAXORDER
Definition: tuplesort.c:181
#define MERGE_BUFFER_SIZE
Definition: tuplesort.c:183

References Max, MAXORDER, MERGE_BUFFER_SIZE, Min, MINORDER, and TAPE_BUFFER_OVERHEAD.

Referenced by cost_tuplesort(), and inittapes().

◆ tuplesort_method_name()

const char* tuplesort_method_name ( TuplesortMethod  m)

Definition at line 2581 of file tuplesort.c.

2582 {
2583  switch (m)
2584  {
2586  return "still in progress";
2588  return "top-N heapsort";
2589  case SORT_TYPE_QUICKSORT:
2590  return "quicksort";
2592  return "external sort";
2594  return "external merge";
2595  }
2596 
2597  return "unknown";
2598 }

References SORT_TYPE_EXTERNAL_MERGE, SORT_TYPE_EXTERNAL_SORT, SORT_TYPE_QUICKSORT, SORT_TYPE_STILL_IN_PROGRESS, and SORT_TYPE_TOP_N_HEAPSORT.

Referenced by show_incremental_sort_group_info(), and show_sort_info().

◆ tuplesort_performsort()

void tuplesort_performsort ( Tuplesortstate state)

Definition at line 1385 of file tuplesort.c.

1386 {
1387  MemoryContext oldcontext = MemoryContextSwitchTo(state->base.sortcontext);
1388 
1389 #ifdef TRACE_SORT
1390  if (trace_sort)
1391  elog(LOG, "performsort of worker %d starting: %s",
1392  state->worker, pg_rusage_show(&state->ru_start));
1393 #endif
1394 
1395  switch (state->status)
1396  {
1397  case TSS_INITIAL:
1398 
1399  /*
1400  * We were able to accumulate all the tuples within the allowed
1401  * amount of memory, or leader to take over worker tapes
1402  */
1403  if (SERIAL(state))
1404  {
1405  /* Just qsort 'em and we're done */
1407  state->status = TSS_SORTEDINMEM;
1408  }
1409  else if (WORKER(state))
1410  {
1411  /*
1412  * Parallel workers must still dump out tuples to tape. No
1413  * merge is required to produce single output run, though.
1414  */
1415  inittapes(state, false);
1416  dumptuples(state, true);
1418  state->status = TSS_SORTEDONTAPE;
1419  }
1420  else
1421  {
1422  /*
1423  * Leader will take over worker tapes and merge worker runs.
1424  * Note that mergeruns sets the correct state->status.
1425  */
1427  mergeruns(state);
1428  }
1429  state->current = 0;
1430  state->eof_reached = false;
1431  state->markpos_block = 0L;
1432  state->markpos_offset = 0;
1433  state->markpos_eof = false;
1434  break;
1435 
1436  case TSS_BOUNDED:
1437 
1438  /*
1439  * We were able to accumulate all the tuples required for output
1440  * in memory, using a heap to eliminate excess tuples. Now we
1441  * have to transform the heap to a properly-sorted array.
1442  */
1444  state->current = 0;
1445  state->eof_reached = false;
1446  state->markpos_offset = 0;
1447  state->markpos_eof = false;
1448  state->status = TSS_SORTEDINMEM;
1449  break;
1450 
1451  case TSS_BUILDRUNS:
1452 
1453  /*
1454  * Finish tape-based sort. First, flush all tuples remaining in
1455  * memory out to tape; then merge until we have a single remaining
1456  * run (or, if !randomAccess and !WORKER(), one run per tape).
1457  * Note that mergeruns sets the correct state->status.
1458  */
1459  dumptuples(state, true);
1460  mergeruns(state);
1461  state->eof_reached = false;
1462  state->markpos_block = 0L;
1463  state->markpos_offset = 0;
1464  state->markpos_eof = false;
1465  break;
1466 
1467  default:
1468  elog(ERROR, "invalid tuplesort state");
1469  break;
1470  }
1471 
1472 #ifdef TRACE_SORT
1473  if (trace_sort)
1474  {
1475  if (state->status == TSS_FINALMERGE)
1476  elog(LOG, "performsort of worker %d done (except %d-way final merge): %s",
1477  state->worker, state->nInputTapes,
1478  pg_rusage_show(&state->ru_start));
1479  else
1480  elog(LOG, "performsort of worker %d done: %s",
1481  state->worker, pg_rusage_show(&state->ru_start));
1482  }
1483 #endif
1484 
1485  MemoryContextSwitchTo(oldcontext);
1486 }
static void sort_bounded_heap(Tuplesortstate *state)
Definition: tuplesort.c:2674
static void leader_takeover_tapes(Tuplesortstate *state)
Definition: tuplesort.c:3107
static void inittapes(Tuplesortstate *state, bool mergeruns)
Definition: tuplesort.c:1891
static void worker_nomergeruns(Tuplesortstate *state)
Definition: tuplesort.c:3085
static void dumptuples(Tuplesortstate *state, bool alltuples)
Definition: tuplesort.c:2339

References dumptuples(), elog(), ERROR, inittapes(), leader_takeover_tapes(), LOG, MemoryContextSwitchTo(), mergeruns(), pg_rusage_show(), SERIAL, sort_bounded_heap(), trace_sort, TSS_BOUNDED, TSS_BUILDRUNS, TSS_FINALMERGE, TSS_INITIAL, TSS_SORTEDINMEM, TSS_SORTEDONTAPE, tuplesort_sort_memtuples(), WORKER, and worker_nomergeruns().

Referenced by _bt_leafbuild(), _bt_parallel_scan_and_sort(), _h_indexbuild(), ExecIncrementalSort(), ExecSort(), gistbuild(), heapam_relation_copy_for_cluster(), hypothetical_dense_rank_final(), hypothetical_rank_common(), initialize_phase(), mode_final(), percentile_cont_final_common(), percentile_cont_multi_final_common(), percentile_disc_final(), percentile_disc_multi_final(), process_ordered_aggregate_multi(), process_ordered_aggregate_single(), switchToPresortedPrefixMode(), and validate_index().

◆ tuplesort_puttuple_common()

void tuplesort_puttuple_common ( Tuplesortstate state,
SortTuple tuple,
bool  useAbbrev 
)

Definition at line 1190 of file tuplesort.c.

1191 {
1192  MemoryContext oldcontext = MemoryContextSwitchTo(state->base.sortcontext);
1193 
1194  Assert(!LEADER(state));
1195 
1196  /* Count the size of the out-of-line data */
1197  if (tuple->tuple != NULL)
1199 
1200  if (!useAbbrev)
1201  {
1202  /*
1203  * Leave ordinary Datum representation, or NULL value. If there is a
1204  * converter it won't expect NULL values, and cost model is not
1205  * required to account for NULL, so in that case we avoid calling
1206  * converter and just set datum1 to zeroed representation (to be
1207  * consistent, and to support cheap inequality tests for NULL
1208  * abbreviated keys).
1209  */
1210  }
1211  else if (!consider_abort_common(state))
1212  {
1213  /* Store abbreviated key representation */
1214  tuple->datum1 = state->base.sortKeys->abbrev_converter(tuple->datum1,
1215  state->base.sortKeys);
1216  }
1217  else
1218  {
1219  /*
1220  * Set state to be consistent with never trying abbreviation.
1221  *
1222  * Alter datum1 representation in already-copied tuples, so as to
1223  * ensure a consistent representation (current tuple was just
1224  * handled). It does not matter if some dumped tuples are already
1225  * sorted on tape, since serialized tuples lack abbreviated keys
1226  * (TSS_BUILDRUNS state prevents control reaching here in any case).
1227  */
1228  REMOVEABBREV(state, state->memtuples, state->memtupcount);
1229  }
1230 
1231  switch (state->status)
1232  {
1233  case TSS_INITIAL:
1234 
1235  /*
1236  * Save the tuple into the unsorted array. First, grow the array
1237  * as needed. Note that we try to grow the array when there is
1238  * still one free slot remaining --- if we fail, there'll still be
1239  * room to store the incoming tuple, and then we'll switch to
1240  * tape-based operation.
1241  */
1242  if (state->memtupcount >= state->memtupsize - 1)
1243  {
1244  (void) grow_memtuples(state);
1245  Assert(state->memtupcount < state->memtupsize);
1246  }
1247  state->memtuples[state->memtupcount++] = *tuple;
1248 
1249  /*
1250  * Check if it's time to switch over to a bounded heapsort. We do
1251  * so if the input tuple count exceeds twice the desired tuple
1252  * count (this is a heuristic for where heapsort becomes cheaper
1253  * than a quicksort), or if we've just filled workMem and have
1254  * enough tuples to meet the bound.
1255  *
1256  * Note that once we enter TSS_BOUNDED state we will always try to
1257  * complete the sort that way. In the worst case, if later input
1258  * tuples are larger than earlier ones, this might cause us to
1259  * exceed workMem significantly.
1260  */
1261  if (state->bounded &&
1262  (state->memtupcount > state->bound * 2 ||
1263  (state->memtupcount > state->bound && LACKMEM(state))))
1264  {
1265 #ifdef TRACE_SORT
1266  if (trace_sort)
1267  elog(LOG, "switching to bounded heapsort at %d tuples: %s",
1268  state->memtupcount,
1269  pg_rusage_show(&state->ru_start));
1270 #endif
1272  MemoryContextSwitchTo(oldcontext);
1273  return;
1274  }
1275 
1276  /*
1277  * Done if we still fit in available memory and have array slots.
1278  */
1279  if (state->memtupcount < state->memtupsize && !LACKMEM(state))
1280  {
1281  MemoryContextSwitchTo(oldcontext);
1282  return;
1283  }
1284 
1285  /*
1286  * Nope; time to switch to tape-based operation.
1287  */
1288  inittapes(state, true);
1289 
1290  /*
1291  * Dump all tuples.
1292  */
1293  dumptuples(state, false);
1294  break;
1295 
1296  case TSS_BOUNDED:
1297 
1298  /*
1299  * We don't want to grow the array here, so check whether the new
1300  * tuple can be discarded before putting it in. This should be a
1301  * good speed optimization, too, since when there are many more
1302  * input tuples than the bound, most input tuples can be discarded
1303  * with just this one comparison. Note that because we currently
1304  * have the sort direction reversed, we must check for <= not >=.
1305  */
1306  if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0)
1307  {
1308  /* new tuple <= top of the heap, so we can discard it */
1309  free_sort_tuple(state, tuple);
1311  }
1312  else
1313  {
1314  /* discard top of heap, replacing it with the new tuple */
1315  free_sort_tuple(state, &state->memtuples[0]);
1317  }
1318  break;
1319 
1320  case TSS_BUILDRUNS:
1321 
1322  /*
1323  * Save the tuple into the unsorted array (there must be space)
1324  */
1325  state->memtuples[state->memtupcount++] = *tuple;
1326 
1327  /*
1328  * If we are over the memory limit, dump all tuples.
1329  */
1330  dumptuples(state, false);
1331  break;
1332 
1333  default:
1334  elog(ERROR, "invalid tuplesort state");
1335  break;
1336  }
1337  MemoryContextSwitchTo(oldcontext);
1338 }
Datum datum1
Definition: tuplesort.h:139
#define REMOVEABBREV(state, stup, count)
Definition: tuplesort.c:396
static bool grow_memtuples(Tuplesortstate *state)
Definition: tuplesort.c:1073
static void make_bounded_heap(Tuplesortstate *state)
Definition: tuplesort.c:2625
static bool consider_abort_common(Tuplesortstate *state)
Definition: tuplesort.c:1341

References Assert(), CHECK_FOR_INTERRUPTS, COMPARETUP, consider_abort_common(), SortTuple::datum1, dumptuples(), elog(), ERROR, free_sort_tuple(), GetMemoryChunkSpace(), grow_memtuples(), inittapes(), LACKMEM, LEADER, LOG, make_bounded_heap(), MemoryContextSwitchTo(), pg_rusage_show(), REMOVEABBREV, trace_sort, TSS_BOUNDED, TSS_BUILDRUNS, TSS_INITIAL, SortTuple::tuple, tuplesort_heap_replace_top(), and USEMEM.

Referenced by tuplesort_putdatum(), tuplesort_putheaptuple(), tuplesort_putindextuplevalues(), and tuplesort_puttupleslot().

◆ tuplesort_readtup_alloc()

void* tuplesort_readtup_alloc ( Tuplesortstate state,
Size  tuplen 
)

Definition at line 2921 of file tuplesort.c.

2922 {
2923  SlabSlot *buf;
2924 
2925  /*
2926  * We pre-allocate enough slots in the slab arena that we should never run
2927  * out.
2928  */
2929  Assert(state->slabFreeHead);
2930 
2931  if (tuplen > SLAB_SLOT_SIZE || !state->slabFreeHead)
2932  return MemoryContextAlloc(state->base.sortcontext, tuplen);
2933  else
2934  {
2935  buf = state->slabFreeHead;
2936  /* Reuse this slot */
2937  state->slabFreeHead = buf->nextfree;
2938 
2939  return buf;
2940  }
2941 }

References Assert(), buf, MemoryContextAlloc(), and SLAB_SLOT_SIZE.

Referenced by readtup_cluster(), readtup_datum(), readtup_heap(), and readtup_index().

◆ tuplesort_rescan()

void tuplesort_rescan ( Tuplesortstate state)

Definition at line 2440 of file tuplesort.c.

2441 {
2442  MemoryContext oldcontext = MemoryContextSwitchTo(state->base.sortcontext);
2443 
2444  Assert(state->base.sortopt & TUPLESORT_RANDOMACCESS);
2445 
2446  switch (state->status)
2447  {
2448  case TSS_SORTEDINMEM:
2449  state->current = 0;
2450  state->eof_reached = false;
2451  state->markpos_offset = 0;
2452  state->markpos_eof = false;
2453  break;
2454  case TSS_SORTEDONTAPE:
2455  LogicalTapeRewindForRead(state->result_tape, 0);
2456  state->eof_reached = false;
2457  state->markpos_block = 0L;
2458  state->markpos_offset = 0;
2459  state->markpos_eof = false;
2460  break;
2461  default:
2462  elog(ERROR, "invalid tuplesort state");
2463  break;
2464  }
2465 
2466  MemoryContextSwitchTo(oldcontext);
2467 }

References Assert(), elog(), ERROR, LogicalTapeRewindForRead(), MemoryContextSwitchTo(), TSS_SORTEDINMEM, TSS_SORTEDONTAPE, and TUPLESORT_RANDOMACCESS.

Referenced by ExecReScanSort(), mode_final(), percentile_cont_final_common(), percentile_cont_multi_final_common(), percentile_disc_final(), and percentile_disc_multi_final().

◆ tuplesort_reset()

void tuplesort_reset ( Tuplesortstate state)

Definition at line 1040 of file tuplesort.c.

1041 {
1044 
1045  /*
1046  * After we've freed up per-batch memory, re-setup all of the state common
1047  * to both the first batch and any subsequent batch.
1048  */
1050 
1051  state->lastReturnedTuple = NULL;
1052  state->slabMemoryBegin = NULL;
1053  state->slabMemoryEnd = NULL;
1054  state->slabFreeHead = NULL;
1055 }

References tuplesort_begin_batch(), tuplesort_free(), and tuplesort_updatemax().

Referenced by ExecIncrementalSort(), ExecReScanIncrementalSort(), and switchToPresortedPrefixMode().

◆ tuplesort_restorepos()

void tuplesort_restorepos ( Tuplesortstate state)

Definition at line 2504 of file tuplesort.c.

2505 {
2506  MemoryContext oldcontext = MemoryContextSwitchTo(state->base.sortcontext);
2507 
2508  Assert(state->base.sortopt & TUPLESORT_RANDOMACCESS);
2509 
2510  switch (state->status)
2511  {
2512  case TSS_SORTEDINMEM:
2513  state->current = state->markpos_offset;
2514  state->eof_reached = state->markpos_eof;
2515  break;
2516  case TSS_SORTEDONTAPE:
2517  LogicalTapeSeek(state->result_tape,
2518  state->markpos_block,
2519  state->markpos_offset);
2520  state->eof_reached = state->markpos_eof;
2521  break;
2522  default:
2523  elog(ERROR, "invalid tuplesort state");
2524  break;
2525  }
2526 
2527  MemoryContextSwitchTo(oldcontext);
2528 }
void LogicalTapeSeek(LogicalTape *lt, long blocknum, int offset)
Definition: logtape.c:1134

References Assert(), elog(), ERROR, LogicalTapeSeek(), MemoryContextSwitchTo(), TSS_SORTEDINMEM, TSS_SORTEDONTAPE, and TUPLESORT_RANDOMACCESS.

Referenced by ExecSortRestrPos().

◆ tuplesort_set_bound()

void tuplesort_set_bound ( Tuplesortstate state,
int64  bound 
)

Definition at line 844 of file tuplesort.c.

845 {
846  /* Assert we're called before loading any tuples */
847  Assert(state->status == TSS_INITIAL && state->memtupcount == 0);
848  /* Assert we allow bounded sorts */
849  Assert(state->base.sortopt & TUPLESORT_ALLOWBOUNDED);
850  /* Can't set the bound twice, either */
851  Assert(!state->bounded);
852  /* Also, this shouldn't be called in a parallel worker */
853  Assert(!WORKER(state));
854 
855  /* Parallel leader allows but ignores hint */
856  if (LEADER(state))
857  return;
858 
859 #ifdef DEBUG_BOUNDED_SORT
860  /* Honor GUC setting that disables the feature (for easy testing) */
861  if (!optimize_bounded_sort)
862  return;
863 #endif
864 
865  /* We want to be able to compute bound * 2, so limit the setting */
866  if (bound > (int64) (INT_MAX / 2))
867  return;
868 
869  state->bounded = true;
870  state->bound = (int) bound;
871 
872  /*
873  * Bounded sorts are not an effective target for abbreviated key
874  * optimization. Disable by setting state to be consistent with no
875  * abbreviation support.
876  */
877  state->base.sortKeys->abbrev_converter = NULL;
878  if (state->base.sortKeys->abbrev_full_comparator)
879  state->base.sortKeys->comparator = state->base.sortKeys->abbrev_full_comparator;
880 
881  /* Not strictly necessary, but be tidy */
882  state->base.sortKeys->abbrev_abort = NULL;
883  state->base.sortKeys->abbrev_full_comparator = NULL;
884 }

References Assert(), LEADER, TSS_INITIAL, TUPLESORT_ALLOWBOUNDED, and WORKER.

Referenced by ExecIncrementalSort(), ExecSort(), and switchToPresortedPrefixMode().

◆ tuplesort_skiptuples()

bool tuplesort_skiptuples ( Tuplesortstate state,
int64  ntuples,
bool  forward 
)

Definition at line 1736 of file tuplesort.c.

1737 {
1738  MemoryContext oldcontext;
1739 
1740  /*
1741  * We don't actually support backwards skip yet, because no callers need
1742  * it. The API is designed to allow for that later, though.
1743  */
1744  Assert(forward);
1745  Assert(ntuples >= 0);
1746  Assert(!WORKER(state));
1747 
1748  switch (state->status)
1749  {
1750  case TSS_SORTEDINMEM:
1751  if (state->memtupcount - state->current >= ntuples)
1752  {
1753  state->current += ntuples;
1754  return true;
1755  }
1756  state->current = state->memtupcount;
1757  state->eof_reached = true;
1758 
1759  /*
1760  * Complain if caller tries to retrieve more tuples than
1761  * originally asked for in a bounded sort. This is because
1762  * returning EOF here might be the wrong thing.
1763  */
1764  if (state->bounded && state->current >= state->bound)
1765  elog(ERROR, "retrieved too many tuples in a bounded sort");
1766 
1767  return false;
1768 
1769  case TSS_SORTEDONTAPE:
1770  case TSS_FINALMERGE:
1771 
1772  /*
1773  * We could probably optimize these cases better, but for now it's
1774  * not worth the trouble.
1775  */
1776  oldcontext = MemoryContextSwitchTo(state->base.sortcontext);
1777  while (ntuples-- > 0)
1778  {
1779  SortTuple stup;
1780 
1781  if (!tuplesort_gettuple_common(state, forward, &stup))
1782  {
1783  MemoryContextSwitchTo(oldcontext);
1784  return false;
1785  }
1787  }
1788  MemoryContextSwitchTo(oldcontext);
1789  return true;
1790 
1791  default:
1792  elog(ERROR, "invalid tuplesort state");
1793  return false; /* keep compiler quiet */
1794  }
1795 }
bool tuplesort_gettuple_common(Tuplesortstate *state, bool forward, SortTuple *stup)
Definition: tuplesort.c:1496

References Assert(), CHECK_FOR_INTERRUPTS, elog(), ERROR, MemoryContextSwitchTo(), TSS_FINALMERGE, TSS_SORTEDINMEM, TSS_SORTEDONTAPE, tuplesort_gettuple_common(), and WORKER.

Referenced by percentile_cont_final_common(), percentile_cont_multi_final_common(), percentile_disc_final(), and percentile_disc_multi_final().

◆ tuplesort_sort_memtuples()

static void tuplesort_sort_memtuples ( Tuplesortstate state)
static

Definition at line 2714 of file tuplesort.c.

2715 {
2716  Assert(!LEADER(state));
2717 
2718  if (state->memtupcount > 1)
2719  {
2720  /*
2721  * Do we have the leading column's value or abbreviation in datum1,
2722  * and is there a specialization for its comparator?
2723  */
2724  if (state->base.haveDatum1 && state->base.sortKeys)
2725  {
2726  if (state->base.sortKeys[0].comparator == ssup_datum_unsigned_cmp)
2727  {
2728  qsort_tuple_unsigned(state->memtuples,
2729  state->memtupcount,
2730  state);
2731  return;
2732  }
2733 #if SIZEOF_DATUM >= 8
2734  else if (state->base.sortKeys[0].comparator == ssup_datum_signed_cmp)
2735  {
2736  qsort_tuple_signed(state->memtuples,
2737  state->memtupcount,
2738  state);
2739  return;
2740  }
2741 #endif
2742  else if (state->base.sortKeys[0].comparator == ssup_datum_int32_cmp)
2743  {
2744  qsort_tuple_int32(state->memtuples,
2745  state->memtupcount,
2746  state);
2747  return;
2748  }
2749  }
2750 
2751  /* Can we use the single-key sort function? */
2752  if (state->base.onlyKey != NULL)
2753  {
2754  qsort_ssup(state->memtuples, state->memtupcount,
2755  state->base.onlyKey);
2756  }
2757  else
2758  {
2759  qsort_tuple(state->memtuples,
2760  state->memtupcount,
2761  state->base.comparetup,
2762  state);
2763  }
2764  }
2765 }
int ssup_datum_unsigned_cmp(Datum x, Datum y, SortSupport ssup)
Definition: tuplesort.c:3177
int ssup_datum_int32_cmp(Datum x, Datum y, SortSupport ssup)
Definition: tuplesort.c:3204

References Assert(), LEADER, ssup_datum_int32_cmp(), and ssup_datum_unsigned_cmp().

Referenced by dumptuples(), and tuplesort_performsort().

◆ tuplesort_space_type_name()

const char* tuplesort_space_type_name ( TuplesortSpaceType  t)

Definition at line 2604 of file tuplesort.c.

2605 {
2607  return t == SORT_SPACE_TYPE_DISK ? "Disk" : "Memory";
2608 }

References Assert(), SORT_SPACE_TYPE_DISK, and SORT_SPACE_TYPE_MEMORY.

Referenced by show_incremental_sort_group_info(), and show_sort_info().

◆ tuplesort_updatemax()

static void tuplesort_updatemax ( Tuplesortstate state)
static

Definition at line 989 of file tuplesort.c.

990 {
991  int64 spaceUsed;
992  bool isSpaceDisk;
993 
994  /*
995  * Note: it might seem we should provide both memory and disk usage for a
996  * disk-based sort. However, the current code doesn't track memory space
997  * accurately once we have begun to return tuples to the caller (since we
998  * don't account for pfree's the caller is expected to do), so we cannot
999  * rely on availMem in a disk sort. This does not seem worth the overhead
1000  * to fix. Is it worth creating an API for the memory context code to
1001  * tell us how much is actually used in sortcontext?
1002  */
1003  if (state->tapeset)
1004  {
1005  isSpaceDisk = true;
1006  spaceUsed = LogicalTapeSetBlocks(state->tapeset) * BLCKSZ;
1007  }
1008  else
1009  {
1010  isSpaceDisk = false;
1011  spaceUsed = state->allowedMem - state->availMem;
1012  }
1013 
1014  /*
1015  * Sort evicts data to the disk when it wasn't able to fit that data into
1016  * main memory. This is why we assume space used on the disk to be more
1017  * important for tracking resource usage than space used in memory. Note
1018  * that the amount of space occupied by some tupleset on the disk might be
1019  * less than amount of space occupied by the same tupleset in memory due
1020  * to more compact representation.
1021  */
1022  if ((isSpaceDisk && !state->isMaxSpaceDisk) ||
1023  (isSpaceDisk == state->isMaxSpaceDisk && spaceUsed > state->maxSpace))
1024  {
1025  state->maxSpace = spaceUsed;
1026  state->isMaxSpaceDisk = isSpaceDisk;
1027  state->maxSpaceStatus = state->status;
1028  }
1029 }

References LogicalTapeSetBlocks().

Referenced by tuplesort_get_stats(), and tuplesort_reset().

◆ tuplesort_used_bound()

bool tuplesort_used_bound ( Tuplesortstate state)

Definition at line 892 of file tuplesort.c.

893 {
894  return state->boundUsed;
895 }

Referenced by ExecIncrementalSort().

◆ worker_freeze_result_tape()

static void worker_freeze_result_tape ( Tuplesortstate state)
static

Definition at line 3047 of file tuplesort.c.

3048 {
3049  Sharedsort *shared = state->shared;
3050  TapeShare output;
3051 
3052  Assert(WORKER(state));
3053  Assert(state->result_tape != NULL);
3054  Assert(state->memtupcount == 0);
3055 
3056  /*
3057  * Free most remaining memory, in case caller is sensitive to our holding
3058  * on to it. memtuples may not be a tiny merge heap at this point.
3059  */
3060  pfree(state->memtuples);
3061  /* Be tidy */
3062  state->memtuples = NULL;
3063  state->memtupsize = 0;
3064 
3065  /*
3066  * Parallel worker requires result tape metadata, which is to be stored in
3067  * shared memory for leader
3068  */
3069  LogicalTapeFreeze(state->result_tape, &output);
3070 
3071  /* Store properties of output tape, and update finished worker count */
3072  SpinLockAcquire(&shared->mutex);
3073  shared->tapes[state->worker] = output;
3074  shared->workersFinished++;
3075  SpinLockRelease(&shared->mutex);
3076 }
static void output(uint64 loop_count)

References Assert(), LogicalTapeFreeze(), Sharedsort::mutex, output(), pfree(), SpinLockAcquire, SpinLockRelease, Sharedsort::tapes, WORKER, and Sharedsort::workersFinished.

Referenced by mergeruns(), and worker_nomergeruns().

◆ worker_get_identifier()

static int worker_get_identifier ( Tuplesortstate state)
static

Definition at line 3019 of file tuplesort.c.

3020 {
3021  Sharedsort *shared = state->shared;
3022  int worker;
3023 
3024  Assert(WORKER(state));
3025 
3026  SpinLockAcquire(&shared->mutex);
3027  worker = shared->currentWorker++;
3028  SpinLockRelease(&shared->mutex);
3029 
3030  return worker;
3031 }

References Assert(), Sharedsort::currentWorker, Sharedsort::mutex, SpinLockAcquire, SpinLockRelease, and WORKER.

Referenced by tuplesort_begin_common().

◆ worker_nomergeruns()

static void worker_nomergeruns ( Tuplesortstate state)
static

Definition at line 3085 of file tuplesort.c.

3086 {
3087  Assert(WORKER(state));
3088  Assert(state->result_tape == NULL);
3089  Assert(state->nOutputRuns == 1);
3090 
3091  state->result_tape = state->destTape;
3093 }

References Assert(), WORKER, and worker_freeze_result_tape().

Referenced by tuplesort_performsort().

Variable Documentation

◆ trace_sort