Files
postgresql/src/backend/executor/execGrouping.c
Bruce Momjian 451c43974f Update copyright for 2026
Backpatch-through: 14
2026-01-01 13:24:10 -05:00

625 lines
20 KiB
C

/*-------------------------------------------------------------------------
*
* execGrouping.c
* executor utility routines for grouping, hashing, and aggregation
*
* Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/executor/execGrouping.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <math.h>
#include "access/htup_details.h"
#include "access/parallel.h"
#include "common/hashfn.h"
#include "executor/executor.h"
#include "miscadmin.h"
#include "utils/lsyscache.h"
static int TupleHashTableMatch(struct tuplehash_hash *tb, MinimalTuple tuple1, MinimalTuple tuple2);
static inline uint32 TupleHashTableHash_internal(struct tuplehash_hash *tb,
MinimalTuple tuple);
static inline TupleHashEntry LookupTupleHashEntry_internal(TupleHashTable hashtable,
TupleTableSlot *slot,
bool *isnew, uint32 hash);
/*
* Define parameters for tuple hash table code generation. The interface is
* *also* declared in execnodes.h (to generate the types, which are externally
* visible).
*/
#define SH_PREFIX tuplehash
#define SH_ELEMENT_TYPE TupleHashEntryData
#define SH_KEY_TYPE MinimalTuple
#define SH_KEY firstTuple
#define SH_HASH_KEY(tb, key) TupleHashTableHash_internal(tb, key)
#define SH_EQUAL(tb, a, b) TupleHashTableMatch(tb, a, b) == 0
#define SH_SCOPE extern
#define SH_STORE_HASH
#define SH_GET_HASH(tb, a) a->hash
#define SH_DEFINE
#include "lib/simplehash.h"
/*****************************************************************************
* Utility routines for grouping tuples together
*****************************************************************************/
/*
* execTuplesMatchPrepare
* Build expression that can be evaluated using ExecQual(), returning
* whether an ExprContext's inner/outer tuples are NOT DISTINCT
*/
ExprState *
execTuplesMatchPrepare(TupleDesc desc,
int numCols,
const AttrNumber *keyColIdx,
const Oid *eqOperators,
const Oid *collations,
PlanState *parent)
{
Oid *eqFunctions;
int i;
ExprState *expr;
if (numCols == 0)
return NULL;
eqFunctions = (Oid *) palloc(numCols * sizeof(Oid));
/* lookup equality functions */
for (i = 0; i < numCols; i++)
eqFunctions[i] = get_opcode(eqOperators[i]);
/* build actual expression */
expr = ExecBuildGroupingEqual(desc, desc, NULL, NULL,
numCols, keyColIdx, eqFunctions, collations,
parent);
return expr;
}
/*
* execTuplesHashPrepare
* Look up the equality and hashing functions needed for a TupleHashTable.
*
* This is similar to execTuplesMatchPrepare, but we also need to find the
* hash functions associated with the equality operators. *eqFunctions and
* *hashFunctions receive the palloc'd result arrays.
*
* Note: we expect that the given operators are not cross-type comparisons.
*/
void
execTuplesHashPrepare(int numCols,
const Oid *eqOperators,
Oid **eqFuncOids,
FmgrInfo **hashFunctions)
{
int i;
*eqFuncOids = (Oid *) palloc(numCols * sizeof(Oid));
*hashFunctions = (FmgrInfo *) palloc(numCols * sizeof(FmgrInfo));
for (i = 0; i < numCols; i++)
{
Oid eq_opr = eqOperators[i];
Oid eq_function;
Oid left_hash_function;
Oid right_hash_function;
eq_function = get_opcode(eq_opr);
if (!get_op_hash_functions(eq_opr,
&left_hash_function, &right_hash_function))
elog(ERROR, "could not find hash function for hash operator %u",
eq_opr);
/* We're not supporting cross-type cases here */
Assert(left_hash_function == right_hash_function);
(*eqFuncOids)[i] = eq_function;
fmgr_info(right_hash_function, &(*hashFunctions)[i]);
}
}
/*****************************************************************************
* Utility routines for all-in-memory hash tables
*
* These routines build hash tables for grouping tuples together (eg, for
* hash aggregation). There is one entry for each not-distinct set of tuples
* presented.
*****************************************************************************/
/*
* Construct an empty TupleHashTable
*
* parent: PlanState node that will own this hash table
* inputDesc: tuple descriptor for input tuples
* inputOps: slot ops for input tuples, or NULL if unknown or not fixed
* numCols: number of columns to be compared (length of next 4 arrays)
* keyColIdx: indexes of tuple columns to compare
* eqfuncoids: OIDs of equality comparison functions to use
* hashfunctions: FmgrInfos of datatype-specific hashing functions to use
* collations: collations to use in comparisons
* nelements: initial estimate of hashtable size
* additionalsize: size of data that may be stored along with the hash entry
* metacxt: memory context for long-lived data and the simplehash table
* tuplescxt: memory context in which to store the hashed tuples themselves
* tempcxt: short-lived context for evaluation hash and comparison functions
* use_variable_hash_iv: if true, adjust hash IV per-parallel-worker
*
* The hashfunctions array may be made with execTuplesHashPrepare(). Note they
* are not cross-type functions, but expect to see the table datatype(s)
* on both sides.
*
* Note that the keyColIdx, hashfunctions, and collations arrays must be
* allocated in storage that will live as long as the hashtable does.
*
* The metacxt and tuplescxt are separate because it's usually desirable for
* tuplescxt to be a BumpContext to avoid memory wastage, while metacxt must
* support pfree in case the simplehash table needs to be enlarged. (We could
* simplify the API of TupleHashTables by managing the tuplescxt internally.
* But that would be disadvantageous to nodeAgg.c and nodeSubplan.c, which use
* a single tuplescxt for multiple TupleHashTables that are reset together.)
*
* LookupTupleHashEntry, FindTupleHashEntry, and related functions may leak
* memory in the tempcxt. It is caller's responsibility to reset that context
* reasonably often, typically once per tuple. (We do it that way, rather
* than managing an extra context within the hashtable, because in many cases
* the caller can specify a tempcxt that it needs to reset per-tuple anyway.)
*
* We don't currently provide DestroyTupleHashTable functionality; the hash
* table will be cleaned up at destruction of the metacxt. (Some callers
* bother to delete the tuplescxt explicitly, though it'd be sufficient to
* ensure it's a child of the metacxt.) There's not much point in working
* harder than this so long as the expression-evaluation infrastructure
* behaves similarly.
*/
TupleHashTable
BuildTupleHashTable(PlanState *parent,
TupleDesc inputDesc,
const TupleTableSlotOps *inputOps,
int numCols,
AttrNumber *keyColIdx,
const Oid *eqfuncoids,
FmgrInfo *hashfunctions,
Oid *collations,
double nelements,
Size additionalsize,
MemoryContext metacxt,
MemoryContext tuplescxt,
MemoryContext tempcxt,
bool use_variable_hash_iv)
{
TupleHashTable hashtable;
uint32 nbuckets;
MemoryContext oldcontext;
uint32 hash_iv = 0;
/*
* tuplehash_create requires a uint32 element count, so we had better
* clamp the given nelements to fit in that. As long as we have to do
* that, we might as well protect against completely insane input like
* zero or NaN. But it is not our job here to enforce issues like staying
* within hash_mem: the caller should have done that, and we don't have
* enough info to second-guess.
*/
if (isnan(nelements) || nelements <= 0)
nbuckets = 1;
else if (nelements >= PG_UINT32_MAX)
nbuckets = PG_UINT32_MAX;
else
nbuckets = (uint32) nelements;
/* tuplescxt must be separate, else ResetTupleHashTable breaks things */
Assert(metacxt != tuplescxt);
/* ensure additionalsize is maxalign'ed */
additionalsize = MAXALIGN(additionalsize);
oldcontext = MemoryContextSwitchTo(metacxt);
hashtable = palloc_object(TupleHashTableData);
hashtable->numCols = numCols;
hashtable->keyColIdx = keyColIdx;
hashtable->tab_collations = collations;
hashtable->tuplescxt = tuplescxt;
hashtable->tempcxt = tempcxt;
hashtable->additionalsize = additionalsize;
hashtable->tableslot = NULL; /* will be made on first lookup */
hashtable->inputslot = NULL;
hashtable->in_hash_expr = NULL;
hashtable->cur_eq_func = NULL;
/*
* If parallelism is in use, even if the leader backend is performing the
* scan itself, we don't want to create the hashtable exactly the same way
* in all workers. As hashtables are iterated over in keyspace-order,
* doing so in all processes in the same way is likely to lead to
* "unbalanced" hashtables when the table size initially is
* underestimated.
*/
if (use_variable_hash_iv)
hash_iv = murmurhash32(ParallelWorkerNumber);
hashtable->hashtab = tuplehash_create(metacxt, nbuckets, hashtable);
/*
* We copy the input tuple descriptor just for safety --- we assume all
* input tuples will have equivalent descriptors.
*/
hashtable->tableslot = MakeSingleTupleTableSlot(CreateTupleDescCopy(inputDesc),
&TTSOpsMinimalTuple);
/* build hash ExprState for all columns */
hashtable->tab_hash_expr = ExecBuildHash32FromAttrs(inputDesc,
inputOps,
hashfunctions,
collations,
numCols,
keyColIdx,
parent,
hash_iv);
/* build comparator for all columns */
hashtable->tab_eq_func = ExecBuildGroupingEqual(inputDesc, inputDesc,
inputOps,
&TTSOpsMinimalTuple,
numCols,
keyColIdx, eqfuncoids, collations,
parent);
/*
* While not pretty, it's ok to not shut down this context, but instead
* rely on the containing memory context being reset, as
* ExecBuildGroupingEqual() only builds a very simple expression calling
* functions (i.e. nothing that'd employ RegisterExprContextCallback()).
*/
hashtable->exprcontext = CreateStandaloneExprContext();
MemoryContextSwitchTo(oldcontext);
return hashtable;
}
/*
* Reset contents of the hashtable to be empty, preserving all the non-content
* state.
*
* Note: in usages where several TupleHashTables share a tuplescxt, all must
* be reset together, as the first one's reset call will destroy all their
* data. The additional reset calls for the rest will redundantly reset the
* tuplescxt. But because of mcxt.c's isReset flag, that's cheap enough that
* we need not avoid it.
*/
void
ResetTupleHashTable(TupleHashTable hashtable)
{
tuplehash_reset(hashtable->hashtab);
MemoryContextReset(hashtable->tuplescxt);
}
/*
* Estimate the amount of space needed for a TupleHashTable with nentries
* entries, if the tuples have average data width tupleWidth and the caller
* requires additionalsize extra space per entry.
*
* Return SIZE_MAX if it'd overflow size_t.
*
* nentries is "double" because this is meant for use by the planner,
* which typically works with double rowcount estimates. So we'd need to
* clamp to integer somewhere and that might as well be here. We do expect
* the value not to be NaN or negative, else the result will be garbage.
*/
Size
EstimateTupleHashTableSpace(double nentries,
Size tupleWidth,
Size additionalsize)
{
Size sh_space;
double tuples_space;
/* First estimate the space needed for the simplehash table */
sh_space = tuplehash_estimate_space(nentries);
/* Give up if that's already too big */
if (sh_space >= SIZE_MAX)
return sh_space;
/*
* Compute space needed for hashed tuples with additional data. nentries
* must be somewhat sane, so it should be safe to compute this product.
*
* We assume that the hashed tuples will be kept in a BumpContext so that
* there is not additional per-tuple overhead.
*
* (Note that this is only accurate if MEMORY_CONTEXT_CHECKING is off,
* else bump.c will add a MemoryChunk header to each tuple. However, it
* seems undesirable for debug builds to make different planning choices
* than production builds, so we assume the production behavior always.)
*/
tuples_space = nentries * (MAXALIGN(SizeofMinimalTupleHeader) +
MAXALIGN(tupleWidth) +
MAXALIGN(additionalsize));
/*
* Check for size_t overflow. This coding is trickier than it may appear,
* because on 64-bit machines SIZE_MAX cannot be represented exactly as a
* double. We must cast it explicitly to suppress compiler warnings about
* an inexact conversion, and we must trust that any double value that
* compares strictly less than "(double) SIZE_MAX" will cast to a
* representable size_t value.
*/
if (sh_space + tuples_space >= (double) SIZE_MAX)
return SIZE_MAX;
/* We don't bother estimating size of the miscellaneous overhead data */
return (Size) (sh_space + tuples_space);
}
/*
* Find or create a hashtable entry for the tuple group containing the
* given tuple. The tuple must be the same type as the hashtable entries.
*
* If isnew is NULL, we do not create new entries; we return NULL if no
* match is found.
*
* If hash is not NULL, we set it to the calculated hash value. This allows
* callers access to the hash value even if no entry is returned.
*
* If isnew isn't NULL, then a new entry is created if no existing entry
* matches. On return, *isnew is true if the entry is newly created,
* false if it existed already. The additional data in the new entry has
* been zeroed.
*/
TupleHashEntry
LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
bool *isnew, uint32 *hash)
{
TupleHashEntry entry;
MemoryContext oldContext;
uint32 local_hash;
/* Need to run the hash functions in short-lived context */
oldContext = MemoryContextSwitchTo(hashtable->tempcxt);
/* set up data needed by hash and match functions */
hashtable->inputslot = slot;
hashtable->in_hash_expr = hashtable->tab_hash_expr;
hashtable->cur_eq_func = hashtable->tab_eq_func;
local_hash = TupleHashTableHash_internal(hashtable->hashtab, NULL);
entry = LookupTupleHashEntry_internal(hashtable, slot, isnew, local_hash);
if (hash != NULL)
*hash = local_hash;
Assert(entry == NULL || entry->hash == local_hash);
MemoryContextSwitchTo(oldContext);
return entry;
}
/*
* Compute the hash value for a tuple
*/
uint32
TupleHashTableHash(TupleHashTable hashtable, TupleTableSlot *slot)
{
MemoryContext oldContext;
uint32 hash;
hashtable->inputslot = slot;
hashtable->in_hash_expr = hashtable->tab_hash_expr;
/* Need to run the hash functions in short-lived context */
oldContext = MemoryContextSwitchTo(hashtable->tempcxt);
hash = TupleHashTableHash_internal(hashtable->hashtab, NULL);
MemoryContextSwitchTo(oldContext);
return hash;
}
/*
* A variant of LookupTupleHashEntry for callers that have already computed
* the hash value.
*/
TupleHashEntry
LookupTupleHashEntryHash(TupleHashTable hashtable, TupleTableSlot *slot,
bool *isnew, uint32 hash)
{
TupleHashEntry entry;
MemoryContext oldContext;
/* Need to run the hash functions in short-lived context */
oldContext = MemoryContextSwitchTo(hashtable->tempcxt);
/* set up data needed by hash and match functions */
hashtable->inputslot = slot;
hashtable->in_hash_expr = hashtable->tab_hash_expr;
hashtable->cur_eq_func = hashtable->tab_eq_func;
entry = LookupTupleHashEntry_internal(hashtable, slot, isnew, hash);
Assert(entry == NULL || entry->hash == hash);
MemoryContextSwitchTo(oldContext);
return entry;
}
/*
* Search for a hashtable entry matching the given tuple. No entry is
* created if there's not a match. This is similar to the non-creating
* case of LookupTupleHashEntry, except that it supports cross-type
* comparisons, in which the given tuple is not of the same type as the
* table entries. The caller must provide the hash ExprState to use for
* the input tuple, as well as the equality ExprState, since these may be
* different from the table's internal functions.
*/
TupleHashEntry
FindTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
ExprState *eqcomp,
ExprState *hashexpr)
{
TupleHashEntry entry;
MemoryContext oldContext;
MinimalTuple key;
/* Need to run the hash functions in short-lived context */
oldContext = MemoryContextSwitchTo(hashtable->tempcxt);
/* Set up data needed by hash and match functions */
hashtable->inputslot = slot;
hashtable->in_hash_expr = hashexpr;
hashtable->cur_eq_func = eqcomp;
/* Search the hash table */
key = NULL; /* flag to reference inputslot */
entry = tuplehash_lookup(hashtable->hashtab, key);
MemoryContextSwitchTo(oldContext);
return entry;
}
/*
* If tuple is NULL, use the input slot instead. This convention avoids the
* need to materialize virtual input tuples unless they actually need to get
* copied into the table.
*
* Also, the caller must select an appropriate memory context for running
* the hash functions.
*/
static uint32
TupleHashTableHash_internal(struct tuplehash_hash *tb,
MinimalTuple tuple)
{
TupleHashTable hashtable = (TupleHashTable) tb->private_data;
uint32 hashkey;
TupleTableSlot *slot;
bool isnull;
if (tuple == NULL)
{
/* Process the current input tuple for the table */
hashtable->exprcontext->ecxt_innertuple = hashtable->inputslot;
hashkey = DatumGetUInt32(ExecEvalExpr(hashtable->in_hash_expr,
hashtable->exprcontext,
&isnull));
}
else
{
/*
* Process a tuple already stored in the table.
*
* (this case never actually occurs due to the way simplehash.h is
* used, as the hash-value is stored in the entries)
*/
slot = hashtable->exprcontext->ecxt_innertuple = hashtable->tableslot;
ExecStoreMinimalTuple(tuple, slot, false);
hashkey = DatumGetUInt32(ExecEvalExpr(hashtable->tab_hash_expr,
hashtable->exprcontext,
&isnull));
}
/*
* The hashing done above, even with an initial value, doesn't tend to
* result in good hash perturbation. Running the value produced above
* through murmurhash32 leads to near perfect hash perturbation.
*/
return murmurhash32(hashkey);
}
/*
* Does the work of LookupTupleHashEntry and LookupTupleHashEntryHash. Useful
* so that we can avoid switching the memory context multiple times for
* LookupTupleHashEntry.
*
* NB: This function may or may not change the memory context. Caller is
* expected to change it back.
*/
static inline TupleHashEntry
LookupTupleHashEntry_internal(TupleHashTable hashtable, TupleTableSlot *slot,
bool *isnew, uint32 hash)
{
TupleHashEntryData *entry;
bool found;
MinimalTuple key;
key = NULL; /* flag to reference inputslot */
if (isnew)
{
entry = tuplehash_insert_hash(hashtable->hashtab, key, hash, &found);
if (found)
{
/* found pre-existing entry */
*isnew = false;
}
else
{
/* created new entry */
*isnew = true;
MemoryContextSwitchTo(hashtable->tuplescxt);
/*
* Copy the first tuple into the tuples context, and request
* additionalsize extra bytes before the allocation.
*
* The caller can get a pointer to the additional data with
* TupleHashEntryGetAdditional(), and store arbitrary data there.
* Placing both the tuple and additional data in the same
* allocation avoids the need to store an extra pointer in
* TupleHashEntryData or allocate an additional chunk.
*/
entry->firstTuple = ExecCopySlotMinimalTupleExtra(slot,
hashtable->additionalsize);
}
}
else
{
entry = tuplehash_lookup_hash(hashtable->hashtab, key, hash);
}
return entry;
}
/*
* See whether two tuples (presumably of the same hash value) match
*/
static int
TupleHashTableMatch(struct tuplehash_hash *tb, MinimalTuple tuple1, MinimalTuple tuple2)
{
TupleTableSlot *slot1;
TupleTableSlot *slot2;
TupleHashTable hashtable = (TupleHashTable) tb->private_data;
ExprContext *econtext = hashtable->exprcontext;
/*
* We assume that simplehash.h will only ever call us with the first
* argument being an actual table entry, and the second argument being
* LookupTupleHashEntry's dummy TupleHashEntryData. The other direction
* could be supported too, but is not currently required.
*/
Assert(tuple1 != NULL);
slot1 = hashtable->tableslot;
ExecStoreMinimalTuple(tuple1, slot1, false);
Assert(tuple2 == NULL);
slot2 = hashtable->inputslot;
/* For crosstype comparisons, the inputslot must be first */
econtext->ecxt_innertuple = slot2;
econtext->ecxt_outertuple = slot1;
return !ExecQualAndReset(hashtable->cur_eq_func, econtext);
}