/* --------------------------------------------------------------------------------------- * * batchsort.h * Generalized tuple sorting routines. * * This module handles sorting of heap tuples, index tuples, or single * Datums (and could easily support other kinds of sortable objects, * if necessary). It works efficiently for both small and large amounts * of data. Small amounts are sorted in-memory using qsort(). Large * amounts are sorted using temporary files and a standard external sort * algorithm. * * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION * src/include/utils/batchsort.h * * --------------------------------------------------------------------------------------- */ #ifndef BATCHSORT_H #define BATCHSORT_H #include "access/itup.h" #include "executor/tuptable.h" #include "fmgr.h" #ifdef PGXC #include "pgxc/execRemote.h" #include "vecexecutor/vecnodes.h" #endif #include "utils/elog.h" #include "utils/relcache.h" #include "vecexecutor/vecstore.h" #include "utils/logtape.h" #include "utils/pg_rusage.h" extern THR_LOCAL int vsort_mem; extern const int MINORDER; /* * Possible states of a BatchSort object. */ typedef enum { BS_INITIAL = 0, BS_BOUNDED, BS_BUILDRUNS, BS_SORTEDINMEM, BS_SORTEDONTAPE, BS_FINALMERGE } BatchSortStatus; /* * Private state of a batchsort operation. */ class Batchsortstate : public VecStore { public: // Memory context for main allocation MemoryContext sortcontext; /* * enumerated value as shown above */ BatchSortStatus m_status; #ifdef PGXC RemoteQueryState* combiner; VectorBatch** m_connDNBatch; int* m_datanodeFetchCursor; // merge sort VecStreamState* streamstate; #endif /* * number of columns in sort key */ int m_nKeys; /* * Scan keys */ ScanKey m_scanKeys; /* * The sortKeys variable is used by every case other than the hash index * case; it is set by tuplesort_begin_xxx. */ SortSupport sortKeys; /* array of length nKeys */ /* * Additional state for managing "abbreviated key" sortsupport routines * (which currently may be used by all cases except the hash index case). * Tracks the intervals at which the optimization's effectiveness is * tested. */ int64 abbrevNext; /* Tuple # at which to next check * applicability */ /* * did caller request random access? */ bool m_randomAccess; /* * did caller specify a maximum number of tuples to return */ bool m_bounded; /* * Top N sort */ bool m_boundUsed; /* * if bounded, the maximum number of tuples */ int m_bound; MultiColumnsData m_unsortColumns; bool* m_isSortKey; long m_lastBlock; int m_lastOffset; /* * variables which relate to external sort */ int m_maxTapes; int m_tapeRange; LogicalTapeSet* m_tapeset; /* * While building initial runs, this is the current output run number * (starting at 0). Afterwards, it is the number of initial runs we made. */ int m_curRun; /* * Unless otherwise noted, all pointer variables below are pointers to * arrays of length maxTapes, holding per-tape data. */ /* * These variables are only used during merge passes. mergeactive[i] is * true if we are reading an input run from (actual) tape number i and * have not yet exhausted that run. mergenext[i] is the memtuples index * of the next pre-read tuple (next to be loaded into the heap) for tape * i, or 0 if we are out of pre-read tuples. mergelast[i] similarly * points to the last pre-read tuple from each tape. mergeavailslots[i] * is the number of unused memtuples[] slots reserved for tape i, and * mergeavailmem[i] is the amount of unused space allocated for tape i. * mergefreelist and mergefirstfree keep track of unused locations in the * memtuples[] array. The memtuples[].tupindex fields link together * pre-read tuples for each tape as well as recycled locations in * mergefreelist. It is OK to use 0 as a null link in these lists, because * memtuples[0] is part of the merge heap and is never a pre-read tuple. */ bool* m_mergeActive; /* active input run source? */ int* m_mergeNext; /* first preread tuple for each source */ int* m_mergeLast; /* last preread tuple for each source */ int* m_mergeAvailslots; /* slots left for prereading each tape */ long* m_mergeAvailmem; /* availMem for prereading each tape */ int m_mergeFreeList; /* head of freelist of recycled slots */ int m_mergeFirstFree; /* first slot never used in this merge */ /* * Variables for Algorithm D. Note that destTape is a "logical" tape * number, ie, an index into the tp_xxx[] arrays. Be careful to keep * "logical" and "actual" tape numbers straight! */ int m_level; /* Knuth's l */ int m_destTape; /* current output tape (Knuth's j, less 1) */ int* m_tpFib; /* Target Fibonacci run counts (A[]) */ int* m_tpRuns; /* # of real runs on each tape */ int* m_tpDummy; /* # of dummy runs for each tape (D[]) */ int* m_tpNum; /* Actual tape numbers (TAPE[]) */ int m_activeTapes; /* # of active input tapes in merge pass */ /* * These variables are used after completion of sorting to keep track of * the next tuple to return. (In the tape case, the tape's current read * position is also critical state.) */ int m_resultTape; /* actual tape number of finished output */ bool m_eofReached; /* reached EOF (needed for cursors) */ /* markpos_xxx holds marked position for mark and restore */ long m_markposBlock; /* tape block# (only used if SORTEDONTAPE) */ int m_markposOffset; /* saved "current", or offset in tape block */ bool m_markposEof; /* saved "eof_reached" */ long m_lastFileBlocks; /* last file blocks used in underlying file */ int64 peakMemorySize; /* memory size before writeMultiColumn*/ // Resource snapshot for time of sort start. // #ifdef TRACE_SORT PGRUsage m_ruStart; #endif char* jitted_CompareMultiColumn; /* jitted function for CompareMultiColumn */ char* jitted_CompareMultiColumn_TOPN; /* jitted function for CompareMultiColumn used by Top N sort */ /* * Initialize variables. */ void InitCommon(int64 workMem, bool randomAccess); /* * Based on hyper log log estimation, consider wheather exit abbreviate * mode or not. */ bool ConsiderAbortCommon(); void SortInMem(); int GetSortMergeOrder(); void InitTapes(); void DumpMultiColumn(bool all); int HeapCompare(MultiColumns* a, MultiColumns* b, bool checkIdx); template int THeapCompare(MultiColumns* a, MultiColumns* b); template void BatchSortHeapInsert(MultiColumns* multiColumn, int multiColumnIdx); template void BatchSortHeapSiftup(); void MarkRunEnd(int tapenum); void SelectNewTape(); void MergeRuns(); void BeginMerge(); void MergeOneRun(); void MergePreReadDone(int srcTape); void MergePreRead(); void GetBatch(bool forward, VectorBatch* batch); void (Batchsortstate::*m_getBatchFun)(bool forward, VectorBatch* batch); void GetBatchInMemory(bool forward, VectorBatch* batch); void GetBatchDisk(bool forward, VectorBatch* batch); void GetBatchFinalMerge(bool forward, VectorBatch* batch); void BindingGetBatchFun(); inline void ReverseDirectionHeap(); void MakeBoundedHeap(); void SortBoundedHeap(); void DumpUnsortColumns(bool all); /* * These function pointers decouple the routines that must know what kind * of tuple we are sorting from the routines that don't need to know it. * They are set up by the tuplesort_begin_xxx routines. * * Function to compare two tuples; result is per qsort() convention, ie: * <0, 0, >0 according as ab. The API must match * qsort_arg_comparator. */ int (*compareMultiColumn)(const MultiColumns* a, const MultiColumns* b, Batchsortstate* state); /* * Function to copy a supplied input tuple into palloc'd space and set up * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, * state->availMem must be decreased by the amount of space used for the * tuple copy (note the SortTuple struct itself is not counted). */ void (*copyMultiColumn)(Batchsortstate* state, MultiColumns* stup, void* tup); /* * Function to write a stored tuple onto tape. The representation of the * tuple on tape need not be the same as it is in memory; requirements on * the tape representation are given below. After writing the tuple, * pfree() the out-of-line data (not the SortTuple struct!), and increase * state->availMem by the amount of memory space thereby released. */ void (*writeMultiColumn)(Batchsortstate* state, int tapenum, MultiColumns* stup); /* * Function to read a stored tuple from tape back into memory. 'len' is * the already-read length of the stored tuple. Create a palloc'd copy, * initialize tuple/datum1/isnull1 in the target SortTuple struct, and * decrease state->availMem by the amount of memory space consumed. */ void (*readMultiColumn)(Batchsortstate* state, MultiColumns& stup, int tapenum, unsigned int len); #ifdef PGXC /* * Function to read length of next stored tuple. * Used as 'len' parameter for readtup function. */ unsigned int (*getlen)(Batchsortstate* state, int tapenum, bool eofOK); #endif /* * Function to reverse the sort direction from its current state. (We * could dispense with this if we wanted to enforce that all variants * represent the sort key information alike.) */ void (*reversedirection)(Batchsortstate* state); /* * Function to accept one batch while collecting input data for sort. * Note that the input data is always copied; the caller need not save it. * * We choose batchsort_putbatch for normal case, choose * batchsort_putbatch for fast abbreviate comparison function for sort. * So we need to implement different function of batchsort_putbatch. */ void (*sort_putbatch)(Batchsortstate* state, VectorBatch* batch, int start, int end); }; extern Batchsortstate* batchsort_begin_heap(TupleDesc tupDesc, int nkeys, AttrNumber* attNums, Oid* sortOperators, Oid* sortCollations, const bool* nullsFirstFlags, int64 workMem, bool randomAccess, int64 maxMem = 0, int planId = 0, int dop = 1); #ifdef PGXC extern Batchsortstate* batchsort_begin_merge(TupleDesc tupDesc, int nkeys, AttrNumber* attNums, Oid* sortOperators, Oid* sortCollations, const bool* nullsFirstFlags, void* combiner, int64 workMem); #endif extern void batchsort_set_bound(Batchsortstate* state, int64 bound); /* * abbreSortOptimize used to mark whether allocate one more Datum for * fast compare of two data(text or numeric type) */ template void putbatch(Batchsortstate* state, VectorBatch* batch, int start, int end) { int64 memorySize = 0; for (int row = start; row < end; ++row) { MultiColumns multiColumn = state->CopyMultiColumn(batch, row); if (abbrevSortOptimize) { if (state->sortKeys->abbrev_converter && !IS_NULL(multiColumn.m_nulls[state->sortKeys->ssup_attno - 1]) && !state->ConsiderAbortCommon()) /* Store abbreviated key representation */ multiColumn.m_values[batch->m_cols] = state->sortKeys->abbrev_converter( multiColumn.m_values[state->sortKeys->ssup_attno - 1], state->sortKeys); else multiColumn.m_values[batch->m_cols] = multiColumn.m_values[state->sortKeys->ssup_attno - 1]; } switch (state->m_status) { case BS_INITIAL: if ((!state->HasFreeSlot() || state->m_availMem <= 0) && state->m_storeColumns.m_memRowNum >= MINORDER * 2) { /* if mem is used up, then adjust capacity */ if (state->m_availMem <= 0) state->m_storeColumns.m_capacity = state->m_storeColumns.m_memRowNum + 1; state->GrowMemValueSlots("VecSort", state->m_planId, state->sortcontext); } state->PutValue(multiColumn); /* * Check if it's time to switch over to a bounded heapsort. We do * so if the input tuple count exceeds twice the desired tuple * count (this is a heuristic for where heapsort becomes cheaper * than a quicksort), or if we've just filled workMem and have * enough tuples to meet the bound. * * Note that once we enter TSS_BOUNDED state we will always try to * complete the sort that way. In the worst case, if later input * tuples are larger than earlier ones, this might cause us to * exceed workMem significantly. */ if (state->m_bounded && (state->m_storeColumns.m_memRowNum > state->m_bound * 2 || (state->m_storeColumns.m_memRowNum > state->m_bound && state->LackMem()))) { #ifdef TRACE_SORT if (u_sess->attr.attr_common.trace_sort) { elog(LOG, "switching to bounded heapsort at %d tuples: %s", state->m_storeColumns.m_memRowNum, pg_rusage_show(&state->m_ruStart)); } #endif if (state->m_storeColumns.m_memRowNum > 0) { state->m_colWidth /= state->m_storeColumns.m_memRowNum; state->m_addWidth = false; } state->MakeBoundedHeap(); continue; } /* * Once we do not have enough memory, we will turn to external sort. But before we do * InitTapes, we should have at least MINORDER * 2 MultiColumns in MultiColumnsData. * Since the minimal number of tapes we need in external sort is MINORDER. */ while (state->LackMem() && (state->m_storeColumns.m_memRowNum < MINORDER * 2)) { state->m_availMem += state->m_allowedMem; } /* * Done if we still fit in available memory and have at least MINORDER * 2 array slots. */ if ((state->m_storeColumns.m_memRowNum < state->m_storeColumns.m_capacity && !state->LackMem()) || state->m_storeColumns.m_memRowNum < MINORDER * 2) continue; if (state->m_storeColumns.m_memRowNum > 0) { state->m_colWidth /= state->m_storeColumns.m_memRowNum; state->m_addWidth = false; } if (state->LackMem()) { ereport(LOG, (errmodule(MOD_VEC_EXECUTOR), errmsg("Profiling Warning: " "VecSort(%d) Disk Spilled : workmem: %ldKB, availmem: %ldKB, " "memRowNum: %d, memCapacity: %d", state->m_planId, state->m_allowedMem / 1024L, state->m_availMem / 1024L, state->m_storeColumns.m_memRowNum, state->m_storeColumns.m_capacity))); } state->InitTapes(); /* * Cache memory size info Batchsortstate before write to tapes. * If the memory size has been cached during previous dump, * do not update the size with current context size. * note: state->peakMemorySize is only used to log memory size before dump for now. * Keep caution once it is adopted for other cases in the future.. */ if (state->peakMemorySize <= 0) { memorySize = 0; CalculateContextSize(state->sortcontext, &memorySize); state->peakMemorySize = memorySize; } #ifdef TRACE_SORT if (u_sess->attr.attr_common.trace_sort) ereport(LOG, (errmodule(MOD_VEC_EXECUTOR), errmsg("Profiling LOG: " "VecSort(%d) Disk Spilled : workmem: %ldKB, availmem: %ldKB, " "memRowNum: %d, memCapacity: %d", state->m_planId, state->m_allowedMem / 1024L, state->m_availMem / 1024L, state->m_storeColumns.m_memRowNum, state->m_storeColumns.m_capacity))); #endif state->m_storeColumns.m_capacity = state->m_storeColumns.m_memRowNum; /* * If we are over the memory limit, dump tuples till we're under. */ state->DumpMultiColumn(false); #ifdef TRACE_SORT if (u_sess->attr.attr_common.trace_sort) ereport(LOG, (errmodule(MOD_VEC_EXECUTOR), errmsg("Profiling LOG: " "VecSort(%d) Disk Spilled : workmem: %ldKB, availmem: %ldKB, " "memRowNum: %d, memCapacity: %d", state->m_planId, state->m_allowedMem / 1024L, state->m_availMem / 1024L, state->m_storeColumns.m_memRowNum, state->m_storeColumns.m_capacity))); #endif break; case BS_BOUNDED: if (state->compareMultiColumn(&multiColumn, state->m_storeColumns.m_memValues, state) <= 0) { state->FreeMultiColumn(&multiColumn); } else { state->FreeMultiColumn(state->m_storeColumns.m_memValues); state->BatchSortHeapSiftup(); state->BatchSortHeapInsert(&multiColumn, 0); } break; case BS_BUILDRUNS: if (state->compareMultiColumn(&multiColumn, &state->m_storeColumns.m_memValues[0], state) >= 0) { state->BatchSortHeapInsert(&multiColumn, state->m_curRun); } else { state->BatchSortHeapInsert(&multiColumn, state->m_curRun + 1); } /* * If we are over the memory limit, dump tuples till we're under. */ state->DumpMultiColumn(false); break; default: elog(ERROR, "invalid BatchSort state"); break; } } } /* * Accept one tuple while collecting input data for sort. * Note that the input data is always copied; the caller need not save it. * * When the first column of order_by_columns is text or numeric type, we * use bttextcmp_abbrev or numeric_cmp_abbrev to speed up compare operation, * meanwhile we need to allocate one more Datum to store prefix info, so * set abbreSortOptimize to be true. For other case set abbreSortOptimize * to be false. */ template void batchsort_putbatch(Batchsortstate* state, VectorBatch* batch, int start, int end) { MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); /* * it is only initialized once. */ if (!state->m_colInfo) { state->InitColInfo(batch); } putbatch(state, batch, start, end); MemoryContextSwitchTo(oldcontext); } extern void batchsort_performsort(Batchsortstate* state); extern void batchsort_getbatch(Batchsortstate* state, bool forward, VectorBatch* batch); extern void batchsort_end(Batchsortstate* state); extern void batchsort_get_stats(Batchsortstate* state, int* sortMethodId, int* spaceTypeId, long* spaceUsed); /* * These routines may only be called if randomAccess was specified 'true'. * Likewise, backwards scan in gettuple/getdatum is only allowed if * randomAccess was specified. */ extern void batchsort_rescan(Batchsortstate* state); extern void batchsort_markpos(Batchsortstate* state); extern void batchsort_restorepos(Batchsortstate* state); extern void batchsort_get_stats(Batchsortstate* state, int* sortMethodId, int* spaceTypeId, long* spaceUsed); #endif /* BATCHSORT_H */